Commits

izazi  committed 89f2060

Initial project creation, stand alone TFIDF working now

  • Participants

Comments (0)

Files changed (7)

+# tfidf-cascalog
+
+A Clojure library that computes TF-IDF term weights for a set of documents with Cascalog and writes the results to Cassandra.
+
+## Usage
+
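+Run `-main` in `tfidf-cascalog.core` with four arguments: `in out stop tfidf`.
+`in` is a delimited document file with a header row (for example `data/rain.txt`) and
+`stop` is a stop-word list with a header row (for example `data/en.stop`); `out` and
+`tfidf` are accepted but currently unused. Results are written to the `tfidf` column
+family of the `CascalogSandbox` keyspace on a Cassandra node at `127.0.0.1`.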
+
+## License
+
+Copyright © 2013 FIXME
+
+Distributed under the Eclipse Public License, the same as Clojure.

File data/en.stop

+stop
+a
+about
+after
+all
+along
+an
+and
+any
+are
+around
+as
+asked
+at
+away
+back
+be
+been
+before
+between
+both
+but
+by
+can
+could
+did
+do
+even
+few
+for
+from
+get
+got
+had
+hand
+has
+have
+he
+he
+her
+here
+high
+him
+his
+how
+i
+if
+in
+into
+is
+it
+its
+just
+large
+like
+long
+man
+many
+more
+most
+much
+my
+near
+new
+next
+no
+not
+now
+of
+off
+on
+one
+or
+other
+our
+out
+over
+right
+said
+see
+she
+side
+small
+so
+some
+than
+that
+the
+their
+them
+then
+there
+these
+they
+this
+those
+through
+time
+to
+too
+two
+up
+us
+used
+was
+way
+we
+were
+what
+when
+where
+which
+while
+who
+will
+with
+within
+would
+you
+your

File data/rain.txt

+doc_id	text
+doc01	A rain shadow is a dry area on the lee back side of a mountainous area.
+doc02	This sinking, dry air produces a rain shadow, or area in the lee of a mountain with less rain and cloudcover.
+doc03	A rain shadow is an area of dry land that lies on the leeward (or downwind) side of a mountain.
+doc04	This is known as the rain shadow effect and is the primary cause of leeward deserts of mountain ranges, such as California's Death Valley.
+doc05	Two Women. Secrets. A Broken Land. [DVD Australia]

File doc/intro.md

+# Introduction to tfidf-cascalog
+
+TODO: write [great documentation](http://jacobian.org/writing/great-documentation/what-to-write/)
+;; Leiningen project definition for tfidf-cascalog.
+;; NOTE(review): only Clojure itself is declared in :dependencies, yet
+;; src/clj/tfidf_cascalog/core.clj requires Cascalog, cascalog.more-taps, midje,
+;; Cassaforte and the Cascading Cassandra tap. Their coordinates and versions are
+;; not visible here and still need to be added — TODO confirm.
+(defproject tfidf-cascalog "0.1.0-SNAPSHOT"
+  :description "TF-IDF term weighting with Cascalog, stored in Cassandra"
+  :url "http://example.com/FIXME"
+  :license {:name "Eclipse Public License"
+            :url "http://www.eclipse.org/legal/epl-v10.html"}
+  ;; The namespace file is src/clj/tfidf_cascalog/core.clj, so the source root must be
+  ;; src/clj; with the default root (src/) the tfidf-cascalog.core namespace cannot be found.
+  :source-paths ["src/clj"]
+  :dependencies [[org.clojure/clojure "1.4.0"]])

File src/clj/tfidf_cascalog/core.clj

+(ns tfidf-cascalog.core
+  (:require [clojurewerkz.cassaforte.client :as cc]
+            [clojurewerkz.cassaforte.schema :as sch]
+            [clojurewerkz.cassaforte.conversion :as cconv]
+            [clojurewerkz.cassaforte.cql    :as cql]
+            [clojurewerkz.cassaforte.bytes  :as bytes]
+
+            [clojurewerkz.cassaforte.thrift.core :as thrift]
+            [clojurewerkz.cassaforte.thrift.column-definition :as cd]
+            [clojurewerkz.cassaforte.thrift.column-family-definition :as cfd]
+            )
+  (:use cascalog.api
+        clojure.test
+        [midje sweet cascalog]
+        [cascalog.more-taps :only (hfs-delimited)])
+  (:require [cascalog.io :as io]
+            [clojure.string :as s]
+            [cascalog.ops :as c])
+  (:import [cascading.tuple Fields]
+           [cascading.scheme Scheme]
+           [com.ifesdjeen.cascading.cassandra CassandraTap CassandraScheme]
+           [org.apache.cassandra.utils ByteBufferUtil]
+           [org.apache.cassandra.thrift Column]))
+
+(defmacro with-thrift-exception-handling
+  "Evaluates `forms` in order inside a try block and yields the value of the last one.
+  An org.apache.cassandra.thrift.InvalidRequestException is caught and its reason
+  (`.getWhy`) is printed instead of being rethrown; the result is then nil."
+  [& forms]
+  `(try
+     ~@forms
+     (catch org.apache.cassandra.thrift.InvalidRequestException ex#
+       (println (.getWhy ex#)))))
+
+;; Recreates the "libraries" column family used by the Cassandra tap examples.
+;; The drop is wrapped in with-thrift-exception-handling, so an
+;; InvalidRequestException raised by it is printed rather than thrown
+;; (presumably the case when the column family does not exist yet — TODO confirm).
+;; The create call is NOT wrapped, so its exceptions propagate to the caller.
+(defn create-test-column-family
+  []
+  (with-thrift-exception-handling
+    (cql/drop-column-family "libraries"))
+  ;; Columns: name (varchar, primary key), language (varchar), votes (int).
+  (cql/create-column-family "libraries"
+                            {:name      "varchar"
+                             :language  "varchar"
+                             :votes     "int"}
+                            :primary-key :name))
+
+(defn create-tap
+  "Builds a CassandraTap over the `libraries` column family of the `CascalogSandbox`
+  keyspace at 127.0.0.1:9160.
+
+  With no arguments, uses the columns name/language/votes mapped to the query
+  variables ?value1/?value2/?value3. Otherwise `columns` is the list of column
+  names and `mappings` maps column name to query-variable name."
+  ([]
+     (create-tap ["name" "language" "votes"]
+                 {"name"     "?value1"
+                  "language" "?value2"
+                  "votes"    "?value3"}))
+  ([columns mappings]
+     (let [partitioner "org.apache.cassandra.dht.RandomPartitioner"
+           settings    {"cassandra.inputPartitioner"  partitioner
+                        "cassandra.outputPartitioner" partitioner}
+           ;; Positional args: host, port, keyspace, column family, then "name"
+           ;; (presumably the row-key column — TODO confirm against CassandraScheme).
+           scheme      (CassandraScheme. "127.0.0.1"
+                                         "9160"
+                                         "CascalogSandbox"
+                                         "libraries"
+                                         "name"
+                                         columns
+                                         mappings
+                                         settings)]
+       (CassandraTap. scheme))))
+
+;; Smoke test for the Cassandra sink: runs a query whose generator is a small
+;; in-memory vector of [name language] pairs and whose sink is a two-column
+;; variant of create-tap (name -> ?value1, language -> ?value2).
+(defn run-query []
+  (let [test-data [["Riak" "Erlang"]
+                   ["Cassaforte" "Clojure"]]]
+
+    ;; ?<- both defines and executes the query; the first argument is the sink tap.
+    (?<- (create-tap ["name" "language"] {"name"     "?value1"
+                                          "language" "?value2"})
+         [?value1 ?value2]
+         (test-data ?value1 ?value2))))
+
+(defmapcatop split
+  "Reads in a line of text and splits it into tokens by regex, emitting one tuple
+  per token. Delimiters are runs of square brackets, backslashes, parentheses,
+  commas, periods and whitespace."
+  [line]
+  ;; The docstring must precede the argument vector; placed after it, the string is
+  ;; just a body expression that is evaluated and discarded.
+  (s/split line #"[\[\]\\\(\),.)\s]+"))
+
+(defn etl-docs-gen
+  "Returns a query producing [?doc-id ?word] tuples from `rain`, a generator of
+  [doc-id text] tuples. Each text is tokenised with `split`, every token is
+  lower-cased and trimmed, and the result is checked against `stop`, a generator
+  of single-field stop-word tuples."
+  [rain stop]
+  (<- [?doc-id ?word]
+      (rain ?doc-id ?text)
+      (split ?text :> ?raw-token)
+      ((c/comp s/trim s/lower-case) ?raw-token :> ?word)
+      ;; Presumably keeps only words with no match in the stop list — confirm
+      ;; against Cascalog's `:> false` generator semantics.
+      (stop ?word :> false)))
+
+(defn word-count
+  "Simple word count across all documents.
+
+  `src` is a generator of two-field tuples whose second field is the word; the
+  first field is ignored. Returns a query producing [?word ?count]."
+  [src]
+  ;; The docstring sits before the argument vector so it is attached to the var's
+  ;; metadata; after the vector it would be a discarded body expression.
+  (<- [?word ?count]
+      (src _ ?word)
+      (c/count ?count)))
+
+(defn D
+  "Returns a query producing a single field, ?n-docs: the distinct count of the
+  ?doc-id field selected from `src`."
+  [src]
+  (let [doc-ids (select-fields src ["?doc-id"])]
+    (<- [?n-docs]
+        (doc-ids ?doc-id)
+        (c/distinct-count ?doc-id :> ?n-docs))))
+
+;; Document frequency: returns a query producing [?df-word ?df-count] from `src`,
+;; a generator of [doc-id word] tuples.
+;; Presumably ?df-count is the number of distinct documents containing each word
+;; (distinct-count over ?doc-id and ?df-word, grouped by ?df-word) — TODO confirm.
+(defn DF [src]
+  (<- [?df-word ?df-count]
+      (src ?doc-id ?df-word)
+      (c/distinct-count ?doc-id ?df-word :> ?df-count)))
+
+;; Term frequency: returns a query producing [?doc-id ?tf-word ?tf-count] from
+;; `src`, a generator of [doc-id word] tuples.
+;; ?tf-count is produced by c/count; presumably it is the number of occurrences of
+;; the word within that document (grouping by the other output fields) — TODO confirm.
+(defn TF [src]
+  (<- [?doc-id ?tf-word ?tf-count]
+      (src ?doc-id ?tf-word)
+      (c/count ?tf-count)))
+
+(defn tf-idf-formula
+  "Computes tf-count * ln(n-docs / (1.0 + df-count)).
+
+  The division goes through `div` rather than `/`; the 1.0 added to df-count keeps
+  the denominator non-zero when df-count is 0."
+  [tf-count df-count n-docs]
+  (let [smoothed-df (+ 1.0 df-count)
+        idf         (Math/log (div n-docs smoothed-df))]
+    (* tf-count idf)))
+
+;; Returns a query producing [?key ?doc-id ?tf-idf-str ?tf-word] for `src`, a
+;; generator of [doc-id word] tuples. The output variable names match the values
+;; in the default mappings of create-tfidf-tap.
+(defn TF-IDF [src]
+  ;; The document count is obtained by running the D query right here (??-), i.e.
+  ;; when this function is called, and is then passed to the formula as a constant.
+  (let [n-doc (first (flatten (??- (D src))))]
+    (<- [?key ?doc-id ?tf-idf-str ?tf-word]
+        ((TF src) ?doc-id ?tf-word ?tf-count)
+        ;; Shares ?tf-word with the TF predicate above, joining TF and DF on the word.
+        ((DF src) ?tf-word ?df-count)
+        (tf-idf-formula ?tf-count ?df-count n-doc :> ?tf-idf)
+        ;; The key is the doc id and the word concatenated with no separator.
+        ;; NOTE(review): distinct (doc-id, word) pairs could collide if doc ids are
+        ;; not fixed-width — confirm.
+        (str ?doc-id ?tf-word :> ?key)
+        ;; The score is emitted as a string.
+        (str ?tf-idf :> ?tf-idf-str))))
+
+(defn create-tfidf-tap
+  "Builds a CassandraTap over the `tfidf` column family of the `CascalogSandbox`
+  keyspace at 127.0.0.1:9160.
+
+  With no arguments, uses the columns key/doc-id/tf-idf/tf-word mapped to the
+  query variables ?key/?doc-id/?tf-idf-str/?tf-word. Otherwise `columns` is the
+  list of column names and `mappings` maps column name to query-variable name."
+  ([]
+     (create-tfidf-tap ["key" "doc-id" "tf-idf" "tf-word"]
+                       {"key"     "?key"
+                        "doc-id"  "?doc-id"
+                        "tf-idf"  "?tf-idf-str"
+                        "tf-word" "?tf-word"}))
+  ([columns mappings]
+     (let [partitioner "org.apache.cassandra.dht.RandomPartitioner"
+           settings    {"cassandra.inputPartitioner"  partitioner
+                        "cassandra.outputPartitioner" partitioner}
+           ;; Positional args: host, port, keyspace, column family, then "key"
+           ;; (presumably the row-key column — TODO confirm against CassandraScheme).
+           scheme      (CassandraScheme. "127.0.0.1"
+                                         "9160"
+                                         "CascalogSandbox"
+                                         "tfidf"
+                                         "key"
+                                         columns
+                                         mappings
+                                         settings)]
+       (CassandraTap. scheme))))
+
+(defn execute
+  "Connects to Cassandra at 127.0.0.1, selects the `CascalogSandbox` keyspace, and
+  runs the TF-IDF query into the tap built by `create-tfidf-tap`.
+
+  `in` is the path of the document file and `stop` the path of the stop-word file;
+  both are read with hfs-delimited, skipping a header row. `out` and `tfidf` are
+  accepted but not used by this function."
+  [in out stop tfidf]
+  (cc/connect! "127.0.0.1")
+  (sch/set-keyspace "CascalogSandbox")
+  (let [docs-tap  (hfs-delimited in :skip-header? true)
+        stop-tap  (hfs-delimited stop :skip-header? true)
+        doc-words (etl-docs-gen docs-tap stop-tap)]
+    (?- (create-tfidf-tap)
+        (TF-IDF doc-words))))
+
+(defn -main
+  "Command-line entry point: passes the first four arguments straight to `execute`.
+  Any further arguments are ignored."
+  [in out stop tfidf & args]
+  (execute in out stop tfidf))
+

File test/tfidf_cascalog/core_test.clj

+(ns tfidf-cascalog.core-test
+  (:use clojure.test
+        tfidf-cascalog.core))
+
+;; Replaces the generated placeholder, which asserted (= 0 1) and therefore always failed.
+(deftest a-test
+  (testing "tf-idf-formula computes tf * ln(n-docs / (1 + df))"
+    ;; tf = 2, df = 1, n-docs = 4  =>  2 * ln(4 / 2) = 2 * ln 2
+    (let [expected (* 2 (Math/log 2.0))
+          actual   (tf-idf-formula 2 1 4)]
+      ;; Compare with a tolerance because the result is a floating-point value.
+      (is (< (Math/abs (double (- expected actual))) 1e-9)))))