+;;;; Copyright (c) 2013 Robert Smith

+;;;; Given M machines with an average of N B-bit integers per machine,

+;;;; In the following solution, a machine is represented simply by a

+;;;; vector of integers (typically fixnums), and a collection of

+;;;; machines is a list of vectors.

+;;;; The overall strategy is, instead of holistically computing the

+;;;; median, we ask a machine if a particular number N is the median,

+;;;; and it "responds" with yes or no, and if not, it gives us a hint

+;;;; about what the median is relative to N (i.e., larger or

+;;;; smaller). If we know the range of values the median can take (we

+;;;; do, since it is a B-bit integer), this allows for a binary

+;;;; When multiple machines are involved, instead of returning a

+;;;; simple answer, we return a sort of distribution, describing the

+;;;; relative position of a number in the data set. These

+;;;; distributions can be combined with other machines' distributions,

+;;;; and the fincal result can be tested to see if the distribution

+;;;; represents a median.

+;;;; Unfortunately, as will be seen, there are lots of corner

+;;;; cases. For example, the median of a data set with an even number

+;;;; of elements is the average of the two middle numbers.

+;;; Using a whole separate DIST struct is a little wasteful. We could

+;;; just use 3-element vectors, so we can do MAP on them.

+ "A simple tuple of numbers representing the relative distribution of

+ (less-than 0 :type unsigned-byte)

+ (equal-to 0 :type unsigned-byte)

+ (greater-than 0 :type unsigned-byte))

+(defun compute-single-dist (n machine)

+ "Compute the distribution information about a number N of a single

+machine MACHINE. The distribution information tells us how many

+numbers in the machine are less than, equal to, or greater than the

+ ;; In a functional language that's smart about immutable data

+ ;; structures, this would be better represented as a fold.

+ (let ((<n 0) (=n 0) (>n 0))

+ ;; Construct and return the distribution.

+ (make-dist :less-than <n

+(defun add-dists (d1 d2)

+ "Add two distributions D1 and D2."

+ :less-than (+ (dist-less-than d1) (dist-less-than d2))

+ :equal-to (+ (dist-equal-to d1) (dist-equal-to d2))

+ :greater-than (+ (dist-greater-than d1) (dist-greater-than d2))))

+(defun compute-dist (n machines)

+ "Compute the distribution information of N on the list of machines

+ (let ((dist (reduce #'add-dists machines :key (lambda (d)

+ (compute-single-dist n d)))))

+ (values (dist-less-than dist)

+ (dist-greater-than dist))))

+(defun compute-median (machines &key (min most-negative-fixnum)

+ (max most-positive-fixnum))

+ "Compute the median of the list of machines MACHINES, where the

+minimum number on any machine is expressed by MIN and the maximum by

+ "Find the smallest number greater than N in the data set

+ :do (when (and (or (null lub) (< x lub))

+ :finally (return lub)))

+ "Find the largest number less than N in the data set

+ :do (when (and (or (null glb) (> x glb))

+ :finally (return glb)))

+ (format t "Trying N=~A in (~A, ~A)~%" n min max)

+ ;; Have we closed our interval to [X, X+1]?

+ (/ (+ min max) 2) ; Found median.

+ (multiple-value-bind (<n =n >n)

+ (compute-dist n machines)

+ ;; We did not find N in the data set, so we

+ ;; either need to balance out <N and >N by

+ ;; bisecting, or we are dead in the middle, and

+ ;; we need to find the numbers closest to N.

+ ((< <n >n) (bisect (half (+ n max)) n max))

+ ((> <n >n) (bisect (half (+ n min)) min n))

+ ;; This case sort of sucks. We are in the

+ ;; middle of the data set with our number N

+ ;; (which isn't actually a part of the

+ ;; data), and we need to find the two

+ ;; numbers P and Q such that P is the

+ ;; greatest number < N, and Q is the

+ ;; smallest number > N.

+ ;; We have to do this MAP+REDUCE shenanigans

+ ;; instead of a simple REDUCE+KEY since we

+ ;; need to remove null values possibly

+ ;; produced by FIND-* functions.

+ (let ((lub (reduce #'min

+ (mapcar (lambda (machine)

+ (mapcar (lambda (machine)

+ (/ (+ lub glb) 2)))) ; Found median.

+ ;; We did find N in the data set. Suppose

+ ;; A = count of numbers < N,

+ ;; B = count of numbers > N, and

+ ;; C = count of numbers = N,

+ ;; then we want A and B balanced up to C. In

+ ;; other words, we want

+ ;; If they are balanced up to C, then N is the

+ ;; median. Otherwise, we rebalance by bisecting

+ ;; (which is done by checking if A - B is

+ ;; positive or negative, i.e., if A > B, vice

+ (let ((delta (- <n >n)))

+ ((< (abs delta) =n) n) ; Found median.

+ ((plusp delta) (bisect (half (+ n min)) min n))

+ (t (bisect (half (+ n max)) n max)))))))))

+ (bisect (half (+ max min)) min max)))