Changed (Δ2.1 KB):

raw changeset »

03/clusters.rb (65 lines added, 0 lines removed)

03/generatefeedvector-jp.rb (6 lines added, 1 line removed)

Up to file-list 03/clusters.rb:

@@ -8,6 +8,69 @@ include Magick
8
8
9
9
class Clusters
10
10
11
  # Partition +rows+ into +k+ groups using k-means clustering.
  #
  # rows           - Array of equal-length numeric Arrays, one per item.
  # distance       - Symbol naming a method on this object; it is called as
  #                  distance(centroid, row) and a smaller result means
  #                  "closer" (default :pearson).
  # k              - number of clusters to build (default 4).
  # max_iterations - upper bound on refinement passes (default 100).
  #
  # Prints "Iteration N" at the start of every pass.
  #
  # Returns an Array of k Arrays. Each inner Array holds the indexes (into
  # +rows+) of the rows assigned to that cluster; a cluster may be empty.
  # If the assignment has not stabilised after max_iterations passes, the
  # most recent assignment is returned.
  def kcluster(rows, distance=:pearson, k=4, max_iterations=100)
    # Nothing to cluster: every cluster is empty.
    return Array.new(k) { [] } if rows.empty?

    dims = rows[0].length
    measure = method(distance) # resolve the distance method only once

    # Minimum and maximum of every column.
    ranges = (0...dims).map { |col| rows.map { |row| row[col] }.minmax }

    # Place k centroids at random positions inside those ranges.
    clusters = Array.new(k) do
      ranges.map { |low, high| rand * (high - low) + low }
    end

    # Re-assign rows and move centroids until nothing changes or the
    # iteration limit is reached.
    lastmatches = nil
    max_iterations.times do |t|
      puts "Iteration #{t}"
      bestmatches = Array.new(k) { [] }

      # Assign each row to its nearest centroid (ties keep the lowest index).
      rows.each_with_index do |row, rowid|
        bestmatch = 0
        best_distance = measure.call(clusters[0], row)
        (1...k).each do |i|
          d = measure.call(clusters[i], row)
          if d < best_distance
            bestmatch = i
            best_distance = d
          end
        end
        bestmatches[bestmatch].push(rowid)
      end

      # Same assignment as the previous pass: converged.
      return bestmatches if bestmatches == lastmatches
      lastmatches = bestmatches

      # Move every centroid to the mean of its members.
      bestmatches.each_with_index do |members, i|
        next if members.empty? # an empty cluster keeps its previous centroid

        totals = Array.new(dims, 0.0)
        members.each do |rowid|
          rows[rowid].each_with_index { |value, m| totals[m] += value }
        end
        # Divide by the size of THIS cluster (not by the size of the cluster
        # whose index happens to equal the column index).
        clusters[i] = totals.map { |total| total / members.length }
      end
    end

    # Iteration limit reached without convergence.
    lastmatches
  end
73
  
11
74
  # 行列の入れ替え
12
75
  def rotatematrix(data)
13
76
    newdata = Array.new
@@ -174,6 +237,8 @@ class Clusters
174
237
  
175
238
  # ピアソン相関距離を計算
176
239
  def pearson(v1,v2)
240
    v1 = [v1] if v1.class != Array
241
    v2 = [v2] if v2.class != Array
177
242
    # 単純な合計
178
243
    sum1 = 0
179
244
    v1.each{ |n|

Up to file-list 03/generatefeedvector-jp.rb:

@@ -35,7 +35,7 @@ class FeedVectorGeneratorJp
35
35
  def getWordsByKind(node, kind)
36
36
    list = Array.new
37
37
    while node do
38
      f = node.feature.split(/,/) 
38
      f = node.feature.split(/,/)
39
39
      if /#{kind}/ =~ f[0]
40
40
        list.push(node.surface)
41
41
      end
@@ -56,6 +56,11 @@ class FeedVectorGeneratorJp
56
56
    n = mecab.parseToNode( CGI.unescapeHTML(txt.toutf8) )
57
57
    # verbs = getWordsByKind(n, '動詞')
58
58
    nouns = getWordsByKind(n, '名詞') # 名詞のみ
59
    
60
    words = Array.new
61
    nouns.each{ |w|
62
      words.push(w) if w =~ /\w{2,}/ # 2文字以上
63
    }
59
64
    return nouns
60
65
  end
61
66