Anonymous avatar Anonymous committed fffeb9d

Preparing for the cluster examples.
* * *
Adding Pearson correlation score.

Comments (0)

Files changed (4)

Binary file added.

+%%%-------------------------------------------------------------------
+%%% Created :  1 Apr 2010 by Torbjorn Tornkvist <tobbe@tornkvist.org>
+%%%
+%%% @doc Examples from the O'Reilly book: "Collective Intelligence"
+%%%
+%%%-------------------------------------------------------------------
+-module(ci).
+-export([get_u_item/0
+        ,get_u_data/0
+        ,get_blogdata/0
+        ]).
+
+%% @doc Unzip "priv/u.item.zip" in memory and return {ok, Binary}.
+get_u_item() ->
+    zip_read_file("priv/u.item.zip").
+
+%% @doc Unzip "priv/u.data.zip" in memory and return {ok, Binary}.
+get_u_data() ->
+    zip_read_file("priv/u.data.zip").
+
+%% @doc Unzip "priv/blogdata.txt.zip" in memory and return {ok, Binary}.
+get_blogdata() ->
+    zip_read_file("priv/blogdata.txt.zip").
+
+%% @doc Read the zip archive Fname and unpack its single entry in memory.
+%%
+%% Returns {ok, Contents} where Contents is the binary of that entry.
+%% Crashes with badmatch if the file cannot be read, if unzipping fails,
+%% or if the archive does not hold exactly one entry.
+zip_read_file(Fname) ->
+    {ok, Archive} = file:read_file(Fname),
+    {ok, [{_EntryName, Contents}]} = zip:unzip(Archive, [memory]),
+    {ok, Contents}.
+
+%%%-------------------------------------------------------------------
+%%% Created :  1 Apr 2010 by Torbjorn Tornkvist <tobbe@tornkvist.org>
+%%%
+%%% @doc Examples from the O'Reilly book: "Collective Intelligence"
+%%%
+%%%-------------------------------------------------------------------
+-module(clusters).
+%-export([sim_distance/2]).
+-compile(export_all).
+
+-include_lib("eunit/include/eunit.hrl").
+
+-import(lists, [sum/1,foldl/3,sort/1,reverse/1]).
+-import(math,  [pow/2,sqrt/1]).
+
+
+%% @doc Pearson correlation distance.
+%%
+%% Takes two lists of numbers of equal length and returns 1.0 - r, where
+%% r is their Pearson correlation coefficient. The result is therefore
+%% close to 0.0 for perfectly correlated lists and close to 2.0 for
+%% perfectly anti-correlated lists.
+%% Nb: smaller == more similar
+%%
+%% Returns the integer 0 when the denominator is zero, e.g. when one of
+%% the lists is constant. Crashes (let it crash) on empty lists
+%% (division by zero) and on lists of different length (lists:zip/2).
+%%
+pearson(V1,V2) ->
+    % Simple sums
+    Sum1 = lists:sum(V1),
+    Sum2 = lists:sum(V2),
+
+    % Sum of the squares; the second sum must range over V2, not V1
+    Sum1Sq = lists:sum([math:pow(S1,2) || S1 <- V1]),
+    Sum2Sq = lists:sum([math:pow(S2,2) || S2 <- V2]),
+
+    % Sum up the products
+    Psum = lists:sum([S1*S2 || {S1,S2} <- lists:zip(V1,V2)]),
+
+    % Calculate r (Pearson score)
+    N   = length(V1),
+    Num = Psum - (Sum1*Sum2/N),
+    % Each term is N times a variance and is never negative in exact
+    % arithmetic; clamp at 0.0 so float rounding cannot feed a negative
+    % number to math:sqrt/1.
+    Var1 = max(0.0, Sum1Sq - math:pow(Sum1,2)/N),
+    Var2 = max(0.0, Sum2Sq - math:pow(Sum2,2)/N),
+    Den  = math:sqrt(Var1 * Var2),
+
+    % Return a smaller distance between items that are more similar.
+    if (Den == 0) -> 0;
+       true       -> 1.0 - Num/Den
+    end.
+
+-ifdef(EUNIT).
+%% Known-answer checks on small vectors whose distance can be derived by hand.
+pearson_test() ->
+    % Perfectly correlated: r = 1, distance 0
+    ?assert(abs(clusters:pearson([1,2,3], [2,4,6])) < 1.0e-9),
+    % Perfectly anti-correlated: r = -1, distance 2
+    ?assert(abs(clusters:pearson([1,2,3], [3,2,1]) - 2.0) < 1.0e-9),
+    % One constant input: the denominator is zero, so the result is 0
+    ?assert(clusters:pearson([1,1,1], [1,2,3]) == 0).
+
+%% Smoke test on the bundled blog data. Only the mathematical bounds of
+%% 1.0 - r are asserted, since an exact float literal would be tied to one
+%% particular data file and to floating point rounding.
+pearson_blogdata_test() ->
+    {_RowNames,_ColNames,Data} = clusters:blogdata(),
+    V1 = element(1, Data),
+    V2 = element(2, Data),
+    Score = clusters:pearson(V1, V2),
+    ?assert(is_number(Score)),
+    ?assert(Score > -1.0e-9 andalso Score < 2.0 + 1.0e-9).
+-endif.
+
+
+%% @doc Read the blogdata, taken from http://kiwitobes.com/clusters/blogdata.txt
+%%
+%% Return: {RowNames::tuple(), ColNames::tuple(), RowData::datarows()}
+%% where: datarows() ::= {rowdata::list(), ... }
+%%
+%% RowNames and RowData are in the same order as each other, which is the
+%% reverse of the line order in the file. ColNames holds every
+%% tab-separated token of the first line, including its first cell.
+%%
+blogdata() ->
+    {ok, Bin} = ci:get_blogdata(),
+
+    % First line is the column titles, the rest are data rows
+    [Header|Rows] = string:tokens(binary_to_list(Bin), "\r\n"),
+
+    % Parse in file order, then flip so the last line comes first
+    Parsed = [blogdata_row(Row) || Row <- Rows],
+    {Names, Counts} = lists:unzip(lists:reverse(Parsed)),
+
+    % Return
+    {list_to_tuple(Names),
+     list_to_tuple(string:tokens(Header,"\t")),
+     list_to_tuple(Counts)}.
+
+%% Split one data line into {RowName, Floats}: the first column is the
+%% row name, the remaining columns are integers converted to floats.
+blogdata_row(Line) ->
+    [Name|Cells] = string:tokens(Line, "\t\r"),
+    {Name, [list_to_integer(Cell)*1.0 || Cell <- Cells]}.

src/recommendations.erl

 %%
 load_movie_lens() ->
     %% Get the movie titles.
-    {ok,Iz}=file:read_file("priv/u.item.zip"),
-    {ok,[{_Fname,IzBin}]} = zip:unzip(Iz,[memory]),
+    {ok,IzBin} = ci:get_u_item(),
     IzLines = string:tokens(binary_to_list(IzBin), "\n"),
     Movies = foldl(fun(Line,Mtid) -> 
                            [Id,Title|_] = string:tokens(Line,"|"),
                    end, ets:new(?MODULE,[]), IzLines),
     
     %% Load data
-    {ok,Dz}=file:read_file("priv/u.data.zip"),
-    {ok,[{_Fname2,DzBin}]} = zip:unzip(Dz,[memory]),
+    {ok,DzBin} = ci:get_u_data(),
     DzLines = string:tokens(binary_to_list(DzBin), "\n"),
     foldl(fun(Line,Ptid) -> 
                   [User,Mid,Rating,_Ts|_] = string:tokens(Line,"\t"),
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.