Commits

Torbjorn Tornkvist  committed 5ad95d3

Finished the K-means clustering example.

  • Participants
  • Parent commits fffeb9d

Comments (0)

Files changed (1)

File src/clusters.erl

 
 -include_lib("eunit/include/eunit.hrl").
 
--import(lists, [sum/1,foldl/3,sort/1,reverse/1]).
+-import(lists, [sum/1,foldl/3,sort/1,reverse/1,zip/2,seq/2]).
 -import(math,  [pow/2,sqrt/1]).
 
+-define(p(Fs,As), true).
+%-define(p(Fs,As), io:format("~p: "++Fs,[time()|As])).
+
+%% @doc Print the blog names of a cluster
+%% Try: 
+%%
+%%  {RowNames,ColNames,Data} = clusters:blogdata().
+%%  C=clusters:kcluster(Data,10).
+%%  clusters:blognames(RowNames, array:get(0,C)).
+%%  [clusters:blognames(RowNames, array:get(J,C)) || J <- lists:seq(0,9)].
+%%
+blognames(RowNames,Indices) ->
+    [e(I,RowNames) || I <- Indices].
+    
+
+%% @doc K-means clustering
+%%
+%% Begin with K randomly placed Centroids (points in space that represent
+%% the center of a cluster). Assign every item to the nearest centroid.
+%% Move the centroid to the average position if its cluster. Repeat until
+%% no more changes occurs.
+%%
+kcluster(Rows)              -> kcluster(Rows, 4).
+kcluster(Rows, K)           -> kcluster(Rows, K, pearson).
+kcluster(Rows, K, Distance) ->
+    RowsTuple = list_to_tuple(Rows),
+
+    ?p("Determine the minimum and maximum values for each column.~n",[]),
+    Columns = rows2columns(Rows),
+    ColRanges = [min_max(Column) || Column <- Columns],
+
+    ?p("Create k randomly placed centroids.~n",[]),
+    random:seed(erlang:now()),
+    Clusters = [[random:uniform()*(Max-Min) + Min
+                  || {Min,Max} <- ColRanges]
+                || _J <- seq(1,K)],
+
+    try 
+        ?p("Begin iterate over the positions of the centroids.~n",[]),
+        e(2,foldl(fun(_,{Cs,Matches}) ->
+                          case bestmatches(Rows, Distance, K, Cs) of
+                              Matches     -> throw({finished,Matches});
+                              BestMatches -> 
+                                  {move_centroids(RowsTuple,K,BestMatches),
+                                   BestMatches}
+                          end
+                  end, {Clusters,mk_array(K)}, seq(1,100)))
+    catch 
+        throw:{finished,Ms} -> Ms 
+    end.
+                                  
+
+%% @doc Move the centroids to the average of their members,
+%%
+move_centroids(RowsTuple,K, Bestmatches) ->
+    ?p("Move the centroids to the average of their members.~n",[]),
+    [[average(Column) 
+      || Column <- rows2columns([e(I,RowsTuple) 
+                                 || I <- array:get(J,Bestmatches)])]
+     || J <- seq(0,K-1)].
+
+average(L)             -> average(L,0,0).
+average([H|T],Sum,Len) -> average(T,H+Sum,1+Len);
+average([],Sum,Len)    -> Sum/Len.
+    
+
+%% @doc Group the rows around the closest centroid.
+%% Return an array of size K, where each entry I contains a list
+%% of the rows being closest to the I:th centroid.
+%%
+bestmatches(Rows, Distance, K, Clusters) ->
+    ?p("Group the rows around their closest centroid.~n",[]),
+    e(2, foldl(fun(Row,{J,Arr}) -> 
+                       I = bestmatch(Clusters, Row, Distance),
+                       {J+1,append_array_val(I,J,Arr)}
+               end, {1, mk_array(K)}, Rows)).
+
+
+%% @doc Transpose a list of rows to a list of columns.
+%% In:  [[C1,C2,...Cn] = R1, R2,...Rn] = Rows
+%% Out: [[R1C1,R2C1,...RnC1],...[R1Cn,R2Cn,...RmCn]] = Columns
+%%
+rows2columns(Rows) when is_list(Rows) ->
+    ?p("Transpose a list of rows to a list of columns.~n",[]),
+    [[e(I,RowTuple) || RowTuple <- [list_to_tuple(Row) || Row <- Rows]]
+     || I <- seq(1,length(hd(Rows)))].
+
+mk_array(K) ->    
+    array:new([{size,K},{fixed,true},{default,[]}]).
+
+append_array_val(K,V,Arr) ->
+    array:set(K,[V|array:get(K, Arr)],Arr).
+
+
+%% @doc Return which centroid is the closest to the row
+%%
+bestmatch(Clusters, Row, Distance) ->
+    R = foldl(fun(Crow, {J,Bestmatch,Bdist}) ->
+                      D = ?MODULE:Distance(Crow,Row),
+                      if (D<Bdist) -> {J+1, J+1, D};
+                         true   -> {J+1, Bestmatch, Bdist}
+                      end
+              end, {0, 0, ?MODULE:Distance(hd(Clusters),Row)}, tl(Clusters)),
+    e(2,R).
+                          
+min_max([H|T]) -> min_max(T,H,H).
+
+min_max([H|T],Min,Max) when H=<Min -> min_max(T,H,Max);
+min_max([H|T],Min,Max) when H>Max  -> min_max(T,Min,H);
+min_max([_|T],Min,Max)             -> min_max(T,Min,Max);
+min_max([],Min,Max)                -> {Min,Max}.
+    
+e(I,T) when is_integer(I), is_tuple(T) ->
+    element(I,T).
 
 %% @doc Pearson Correlation Score
 %%      
 
     % Sum of the squares
     Sum1Sq = sum([pow(S1,2) || S1 <- V1]),
-    Sum2Sq = sum([pow(S2,2) || S2 <- V1]),
+    Sum2Sq = sum([pow(S2,2) || S2 <- V2]),
 
     % Sum up the products
-    Psum = sum([S1*S2 || {S1,S2} <- lists:zip(V1,V2)]),
+    Psum = sum([S1*S2 || {S1,S2} <- zip(V1,V2)]),
     
     % Calculate r (Pearson score)
     N   = length(V1),
 %% @doc Read the blogdata, taken from http://kiwitobes.com/clusters/blogdata.txt
 %%
 %% Return: {RowNames::tuple(), ColNames::tuple(), RowData::datarows()}
-%% where: datarows() ::= {rowdata::list(), ... }
+%% where: datarows() ::= [rowdata::list(), ... ]
 %%
 blogdata() ->
     {ok, Bin} = ci:get_blogdata(),
     % Return
     {list_to_tuple(RowNames), 
      list_to_tuple(string:tokens(ColNames,"\t")), 
-     list_to_tuple(Data)}.
+     Data}.