Robert Smith avatar Robert Smith committed 9f31b79

Add median exercise.

Comments (0)

Files changed (1)

miscellaneous_exercises/median.lisp

+;;;; median.lisp
+;;;; Copyright (c) 2013 Robert Smith
+
+;;;; PROBLEM
+;;;;
+;;;; Given M machines with an average of N B-bit integers per machine,
+;;;; find the median.
+
+;;;; SOLUTION
+;;;;
+;;;; In the following solution, a machine is represented simply by a
+;;;; vector of integers (typically fixnums), and a collection of
+;;;; machines is a list of vectors.
+;;;;
+;;;; The overall strategy is, instead of holistically computing the
+;;;; median, we ask a machine if a particular number N is the median,
+;;;; and it "responds" with yes or no, and if not, it gives us a hint
+;;;; about what the median is relative to N (i.e., larger or
+;;;; smaller). If we know the range of values the median can take (we
+;;;; do, since it is a B-bit integer), this allows for a binary
+;;;; search.
+;;;;
+;;;; When multiple machines are involved, instead of returning a
+;;;; simple answer, we return a sort of distribution, describing the
+;;;; relative position of a number in the data set. These
+;;;; distributions can be combined with other machines' distributions,
+;;;; and the fincal result can be tested to see if the distribution
+;;;; represents a median.
+;;;;
+;;;; Unfortunately, as will be seen, there are lots of corner
+;;;; cases. For example, the median of a data set with an even number
+;;;; of elements is the average of the two middle numbers.
+
+
+;;; Using a whole separate DIST struct is a little wasteful. We could
+;;; just use 3-element vectors, so we can do MAP on them.
+(defstruct dist
+  "A simple tuple of numbers representing the relative distribution of
+a number."
+  (less-than    0 :type unsigned-byte)
+  (equal-to     0 :type unsigned-byte)
+  (greater-than 0 :type unsigned-byte))
+
+(defun compute-single-dist (n machine)
+  "Compute the distribution information about a number N of a single
+machine MACHINE. The distribution information tells us how many
+numbers in the machine are less than, equal to, or greater than the
+number N."
+  ;; In a functional language that's smart about immutable data
+  ;; structures, this would be better represented as a fold.
+  (let ((<n 0) (=n 0) (>n 0))
+    (map nil
+         (lambda (x)
+           (cond
+             ((< x n) (incf <n))
+             ((> x n) (incf >n))
+             (t       (incf =n))))
+         machine)
+    
+    ;; Construct and return the distribution.
+    (make-dist :less-than    <n
+               :equal-to     =n
+               :greater-than >n)))
+
+(defun add-dists (d1 d2)
+  "Add two distributions D1 and D2."
+  (make-dist
+   :less-than    (+ (dist-less-than d1) (dist-less-than d2))
+   :equal-to     (+ (dist-equal-to d1) (dist-equal-to d2))
+   :greater-than (+ (dist-greater-than d1) (dist-greater-than d2))))
+
+(defun compute-dist (n machines)
+  "Compute the distribution information of N on the list of machines
+MACHINES."
+  (let ((dist (reduce #'add-dists machines :key (lambda (d) 
+                                                  (compute-single-dist n d)))))
+    (values (dist-less-than    dist)
+            (dist-equal-to     dist)
+            (dist-greater-than dist))))
+
+(defun compute-median (machines &key (min most-negative-fixnum)
+                                     (max most-positive-fixnum))
+  "Compute the median of the list of machines MACHINES, where the
+minimum number on any machine is expressed by MIN and the maximum by
+MAX."
+  (labels ((half (x)
+             (round x 2))
+           
+           (find-lub (n data)
+             "Find the smallest number greater than N in the data set
+              DATA."
+             (loop :with lub = nil
+                   :for x :across data
+                   :do (when (and (or (null lub) (< x lub))
+                                  (> x n))
+                         (setf lub x))
+                   :finally (return lub)))
+           
+           (find-glb (n data)
+             "Find the largest number less than N in the data set
+              DATA."
+             (loop :with glb = nil
+                   :for x :across data
+                   :do (when (and (or (null glb) (> x glb))
+                                  (< x n))
+                         (setf glb x))
+                   :finally (return glb)))
+           
+           (bisect (n min max)
+             (format t "Trying N=~A in (~A, ~A)~%" n min max)
+             ;; Have we closed our interval to [X, X+1]?
+             (if (= (1+ min) max)
+                 (/ (+ min max) 2)    ; Found median.
+                 (multiple-value-bind (<n =n >n)
+                     (compute-dist n machines)
+                   (if (zerop =n)
+                       ;; We did not find N in the data set, so we
+                       ;; either need to balance out <N and >N by
+                       ;; bisecting, or we are dead in the middle, and
+                       ;; we need to find the numbers closest to N.
+                       (cond
+                         ((< <n >n) (bisect (half (+ n max)) n max))
+                         ((> <n >n) (bisect (half (+ n min)) min n))
+                         (t            
+                          ;; This case sort of sucks. We are in the
+                          ;; middle of the data set with our number N
+                          ;; (which isn't actually a part of the
+                          ;; data), and we need to find the two
+                          ;; numbers P and Q such that P is the
+                          ;; greatest number < N, and Q is the
+                          ;; smallest number > N.
+                          ;;
+                          ;; We have to do this MAP+REDUCE shenanigans
+                          ;; instead of a simple REDUCE+KEY since we
+                          ;; need to remove null values possibly
+                          ;; produced by FIND-* functions.
+                          (let ((lub (reduce #'min
+                                             (remove-if #'null
+                                                        (mapcar (lambda (machine)
+                                                                  (find-lub n machine))
+                                                                machines))))
+                                (glb (reduce #'max
+                                             (remove-if #'null
+                                                        (mapcar (lambda (machine)
+                                                                  (find-glb n machine))
+                                                                machines)))))
+                            (/ (+ lub glb) 2)))) ; Found median.
+                       
+                       ;; We did find N in the data set. Suppose
+                       ;; 
+                       ;;     A = count of numbers < N,
+                       ;;     B = count of numbers > N, and
+                       ;;     C = count of numbers = N,
+                       ;;     
+                       ;; then we want A and B balanced up to C. In
+                       ;; other words, we want
+                       ;;
+                       ;;     | A - B | < C.
+                       ;;
+                       ;; If they are balanced up to C, then N is the
+                       ;; median. Otherwise, we rebalance by bisecting
+                       ;; (which is done by checking if A - B is
+                       ;; positive or negative, i.e., if A > B, vice
+                       ;; versa.
+                       (let ((delta (- <n >n)))
+                         (cond
+                           ((< (abs delta) =n) n)  ; Found median.
+                           ((plusp delta) (bisect (half (+ n min)) min n))
+                           (t             (bisect (half (+ n max)) n max)))))))))
+    (bisect (half (+ max min)) min max)))
+
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.