Commits

Juliette De Maeyer committed 1a5a2e2

allegedly fixed all errors

Files changed (3)

csxj/datasources/septsursept.py

     
     from pprint import pprint
     import json
-    # f = open("/Users/judemaey/code/2012-09-02/7sur7.json")
-    # urls = json.load(f)
+    f = open("/Users/judemaey/code/2012-09-02/7sur7.json")
+    urls = json.load(f)
 
-    # for x in urls:
-    #     for y in x[1]:
-    #         url = y[1]
-    #         article_data, html = extract_article_data(url)
-    #         print article_data.title
-    #         print article_data.url
-    #         pprint(article_data.links)
-    #         print len(article_data.links)
-    #         print "\n"
-    #         print "******************************"
-    #         print "\n"
+    for x in urls:
+        for y in x[1]:
+            url = y[1]
+            article_data, html = extract_article_data(url)
+            print article_data.title
+            print article_data.url
+            pprint(article_data.links)
+            print len(article_data.links)
+            print "\n"
+            print "******************************"
+            print "\n"
 
     # for url in urls:
     #     article_data, html = extract_article_data(url)
     #             print article_data.title
     #             print len(article_data.links)
 
-    article_data, html = extract_article_data(url15)
-    if article_data:
-        print article_data.title
-        pprint(article_data.links)
-        print len(article_data.links)
+    # article_data, html = extract_article_data(url15)
+    # if article_data:
+    #     print article_data.title
+    #     pprint(article_data.links)
+    #     print len(article_data.links)
 
     
 
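The re-enabled block above is a quick debugging harness: it loads a local JSON dump of batched frontpage links and prints whatever extract_article_data() returns for each URL. As a hedged sketch only (the batch shape of (label, [(title, url), ...]) pairs is inferred from the x[1]/y[1] indexing, not documented in the commit), the same loop with a context manager and a guard for pages that fail to parse:

    import json
    from pprint import pprint

    with open("/Users/judemaey/code/2012-09-02/7sur7.json") as f:
        url_batches = json.load(f)

    for batch in url_batches:
        for entry in batch[1]:          # assumed shape: (title, url) pairs
            url = entry[1]
            article_data, html = extract_article_data(url)
            if article_data is None:    # skip pages that did not parse
                continue
            print article_data.title
            print article_data.url
            pprint(article_data.links)
            print len(article_data.links)
            print "******************************"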

csxj/datasources/sudinfo.py

 }
 
 
-def extract_associated_links(hxs, source_url):
+def extract_associated_links(hxs):
     links = hxs.select("//div[@id='picture']/descendant::div[@class='bloc-01']//a")
 
     all_tagged_urls = []
     #show_frontpage_toc()
     #download_one_article()
     #show_frontpage_articles()
-    show_article()
+
+    url = "/Volumes/Curst/json_db_0_5/sudinfo/2012-06-05/14.05.07/raw_data/18.html"
+    f = open(url,"r")
+
+    article_data, content_html = extract_article_data(f)
+    article_data.print_summary()
+
+
+
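Besides dropping the source_url parameter from extract_associated_links(), this commit replaces the show_article() call in the main block with ad-hoc code that opens one raw HTML file from a local csxj database dump and passes the open file object straight to extract_article_data(). A minimal sketch of the same test, assuming (as the added lines imply) that extract_article_data() accepts a file-like object, with the file closed via a context manager:

    raw_path = "/Volumes/Curst/json_db_0_5/sudinfo/2012-06-05/14.05.07/raw_data/18.html"

    with open(raw_path, "r") as f:                      # close the file when done
        article_data, content_html = extract_article_data(f)

    if article_data is not None:                        # page might not parse
        article_data.print_summary()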

scripts/csxj_reprocess_entire_database.py

         for url, raw_file in index:
             raw_filepath = os.path.join(raw_data_dir, raw_file)
             try:
-                print "    Reprocessing: {1} ({0})".format(url, raw_filepath)
+                print u"    Reprocessing: {1} ({0})".format(url, raw_filepath)
                 #reprocessed_articles.append(raw_filepath)
                 with open(raw_filepath, 'r') as raw_html:
                     article_data, html = datasource_parser.extract_article_data(raw_html)
                     reprocessed_articles.append((article_data, html))
             except Exception as e:
                 stacktrace = traceback.format_exc()
-                print "!!! FAIL", e.message
+                print u"!!! FAIL", e.message
                 errors_encountered.append((url, raw_filepath, stacktrace))
 
     return reprocessed_articles, errors_encountered
     if not os.path.exists(batch_root_dir):
         os.makedirs(batch_root_dir)
 
-    print "+++ Saving {0} articles to {1}".format(len(articles), batch_root_dir)
+    print u"+++ Saving {0} articles to {1}".format(len(articles), batch_root_dir)
     articles_json_data = {"articles": [article.to_json() for article, raw_html in articles],
                           "errors": []}
     articles_filepath = os.path.join(batch_root_dir, csxjdb.constants.ARTICLES_FILENAME)
     if not os.path.exists(raw_data_dir):
         os.makedirs(raw_data_dir)
 
-    print "^^^ Saving raw html to {0}".format(raw_data_dir)
+    print u"^^^ Saving raw html to {0}".format(raw_data_dir)
     raw_data = [(a[0].url, a[1], "{0}.html".format(i)) for (i, a) in enumerate(articles)]
     for url, raw_html, raw_html_filename in raw_data:
         raw_filepath = os.path.join(raw_data_dir, raw_html_filename)
     after = datetime.now()
     dt = after - before
 
-    print "Total time for {0} articles: {1} seconds".format(n_samples, dt.seconds)
+    print u"Total time for {0} articles: {1} seconds".format(n_samples, dt.seconds)
     avg_time = float(dt.seconds) / n_samples
-    print "Avg time per articles: {0} seconds".format(avg_time)
-
-    projected_article_count = 200000
-    projected_time = avg_time * projected_article_count
-
-    print "Projection for {0} articles:".format(projected_article_count), time.strftime("%H:%M:%S", time.gmtime(projected_time))
+    print u"Avg time per articles: {0} seconds".format(avg_time)
 
     write_dict_to_file(errors_by_source, os.path.join(dest, os.path.pardir), os.path.basename(dest) + "_errors.json")
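The remaining changes in this script add a u prefix to the print format templates and drop the projected-time extrapolation. The commit message does not say why, but a plausible reading is that the u"" templates keep the formatted line unicode regardless of whether the substituted values (URLs, titles, e.message) are str or unicode, and sidestep the Python 2 pitfall where a byte-string template is implicitly ASCII-decoded once a unicode argument is substituted. A minimal sketch, with a hypothetical accented title:

    # Python 2 sketch: unicode vs. byte-string format templates.
    title = u"Li\u00e8ge"                          # hypothetical unicode article title

    line = u"    Reprocessing: {0}".format(title)  # always unicode, no implicit decode
    print type(line)                               # <type 'unicode'>

    # With a plain "" template the result type depends on the arguments, and the
    # template itself is ASCII-decoded as soon as a unicode value is substituted,
    # so any non-ASCII byte in it raises UnicodeDecodeError:
    #     "    Reprocessing \xc3\xa9: {0}".format(title)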