Commits

Anonymous committed 595499b

Improved the internal representation of Document and Corpus. Optimized term frequency matrix creation.

Comments (0)

Files changed (5)

src/basicTypes/basic_document.hh

                 return BasicDocument::word2id_[word];
             }
 
+            static id_type getNbTerms()
+            {
                 // Reports how many entries the static id2word_ table currently holds
                 // (presumably one per distinct word registered so far - confirm against
                 // the code that pushes into id2word_).
                 // NOTE(review): size() is converted to id_type on return; check that
                 // id_type is wide enough for the container's size type.
+                return BasicDocument::id2word_.size();
+            }
+
         protected:
 
             template <typename Document>
                     BasicDocument::id2word_.push_back(word);
                     ++BasicDocument::nextId_;
                 }
-                else
-                {
-                    document_.insert(document_.end(), it->second);
-                }
+                document_.insert(document_.end(), it->second);
             }
 
             // Constructors - Only created by factory

src/basicTypes/corpus.hh

     {
         private:
             typedef std::vector<D>                  doc_container_;
+
         public:
             // Types
             typedef typename D::id_type             id_type;
                 return documents_.cend();
             }
 
+            // Static methods
+
+            static const std::string& getUrl(id_type id)
+            {
                 // Returns the URL stored at index `id` of the static id2doc_ vector.
                 // The access is unchecked: an id that is not a valid index into
                 // id2doc_ is undefined behaviour.
                 // NOTE(review): the returned reference points into the vector, so a
                 // later push_back (see addDocument) may reallocate and invalidate it.
+                return Corpus::id2doc_[id];
+            }
+
+            static id_type getId(const std::string& doc)
+            {
                 // Returns the id mapped to `doc` in the static doc2id_ table.
                 // NOTE(review): unordered_map::operator[] inserts a value-initialized
                 // entry when `doc` is absent, so a lookup miss silently grows doc2id_
                 // (with no matching id2doc_ entry) and returns a default id that cannot
                 // be told apart from a real one. find()/at() would avoid this.
+                return Corpus::doc2id_[doc];
+            }
+
 
         private:
+            // Only created by a factory
             Corpus() = default;
 
+            // Add a document to the Corpus, giving its URL an id the first time it is seen
+            void addDocument(D&& document)
+            {
                 // Read the URL now: `document` is moved from at the end of this function.
+                const auto& url = document.getUrl();
+                auto it = Corpus::doc2id_.find(url);
+
+                if (it == Corpus::doc2id_.end())
+                {
                     // Unknown URL: record url -> id and id -> url using the current
                     // nextId_, then advance the counter. push_back keeps the id2doc_
                     // index in step with the id as long as ids only come from here.
+                    Corpus::doc2id_.emplace(url, Corpus::nextId_);
+                    Corpus::id2doc_.push_back(url);
+                    ++Corpus::nextId_;
+                }
                 // Stored unconditionally: a document whose URL is already registered is
                 // appended again rather than skipped.
+                documents_.emplace_back(std::move(document));
+            }
+
+            // Static attributes
+            static std::vector<std::string>                  id2doc_;
+            static std::unordered_map<std::string, id_type>  doc2id_;
+            static id_type                                   nextId_;
+
             template <typename Document>
             friend class CorpusFactory;
 
             doc_container_ documents_;
     };
+
+    // Define static members of Corpus
+
     // Out-of-class definitions for the static data members declared inside
     // Corpus. As static members of a class template, each Corpus<D>
     // instantiation has its own set.
+    template <typename D>
+    typename std::vector<std::string> Corpus<D>::id2doc_;
+
+    template <typename D>
+    typename std::unordered_map<std::string, typename Corpus<D>::id_type> Corpus<D>::doc2id_;
+
     // The id counter is brace-initialized with 0.
+    template <typename D>
+    typename Corpus<D>::id_type Corpus<D>::nextId_{0};
 }
 
 #endif /* !CORPUS_HH_ */

src/basicTypes/corpus_factory.hh

                         if (ent->d_type == DT_REG)
                         {
                             // Add the document in the corpus
-                            corpus.documents_.emplace_back(DocumentFactory<D>::from_file(ent->d_name));
+                            corpus.addDocument(DocumentFactory<D>::from_file(ent->d_name));
                         }
                     }
                     closedir(dir);

src/documentStatistics/tf.hh

 
 namespace stat
 {
-    template <typename Corpus>
-    TermDocumentMatrix compute_tf(const Corpus& corpus);
+    template <typename C>
     // Computes the term-frequency matrix of `corpus` (C is the corpus type).
     // The implementation indexes the result as (document position, term id).
+    TermDocumentMatrix compute_tf(const C& corpus);
 }
 
 // Implementation

src/documentStatistics/tf.hxx

 
 namespace stat
 {
-    template <typename Corpus>
-    TermDocumentMatrix compute_tf(const Corpus& corpus)
+    template <typename C>
+    TermDocumentMatrix compute_tf(const C& corpus)
     {
         // Builds the term-frequency matrix of `corpus`: cell (d, t) counts how many
         // times term id t occurs in the d-th document, in iteration order.
         // The matrix is created zero-filled with corpus.size() rows and
         // getNbTerms() columns, so cells can be incremented directly.
         // NOTE(review): getNbTerms() is a static of the document type, so the
         // column count presumably covers every term registered process-wide, not
         // only terms of this corpus - confirm. Every word_id must be smaller than
         // that count for the indexing below to stay in range.
-        std::unordered_map<typename Corpus::id_type, std::vector<int>> frequencies;
-
+        TermDocumentMatrix result = TermDocumentMatrix::Zero(corpus.size(), C::document_type::getNbTerms());
         int doc_id = 0;
-        int nb_doc = corpus.size();
 
         // Compute term frequency
         for (const auto& document: corpus)
         {
             for (const auto& word_id: document)
             {
-                auto it = frequencies.find(word_id);
-
-                if (it == frequencies.end())
-                {
-                    it = std::get<0>(frequencies.emplace(word_id, std::vector<int>(nb_doc, 0)));
-                }
-                ++(it->second[doc_id]);
                 // The term id is used directly as the column index.
+                ++result(doc_id, word_id);
             }
 
             // Next document, next row.
             ++doc_id;
         }
 
-        TermDocumentMatrix result(nb_doc, frequencies.size());
-        int word_id = 0;
-
-        for (const auto& freq: frequencies)
-        {
-            for (int i = 0; i < nb_doc; ++i)
-            {
-                result(i, word_id) = freq.second[i];
-            }
-
-            ++word_id;
-        }
-
         return result;
     }
 }