Commits

Juliette De Maeyer committed b02441d Merge

Merge

Comments (0)

Files changed (2)

csxj/datasources/dhnet.py

                         tags |= set(['script', 'embedded'])
                         tagged_urls.append(make_tagged_url(url, title, tags))
                     else:
-                        pass
-                elif div.find('noscript'):
-                    noscript = div.find('noscript')
-                    link = noscript.find('a')
-                    if link:
-                        url = link.get('href')
-                        title = remove_text_formatting_markup_from_fragments(link.contents)
-                        all_tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
-                        all_tags |= set(['script', 'embedded'])
-                        tagged_urls.append(make_tagged_url(url, title, all_tags))
-                    else:
-                        print ValueError("No link was found in the <noscript> section")
+                        if div.find('noscript'):
+                            noscript = div.find('noscript')
+                            link = noscript.find('a')
+                            if link:
+                                url = link.get('href')
+                                title = remove_text_formatting_markup_from_fragments(link.contents)
+                                all_tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
+                                all_tags |= set(['script', 'embedded'])
+                                tagged_urls.append(make_tagged_url(url, title, all_tags))
+                            else:
+                                raise ValueError("No link was found in the <noscript> section. Update the parser.")
+                        else:
+                            raise ValueError("Embedded script of unknown type was detected ('{0}'). Update the parser.".format(script_url))
                 else:
-                    print ValueError("Could not extract fallback noscript url for this embedded javascript object")
+                    raise ValueError("Could not extract fallback noscript url for this embedded javascript object. Update the parser.")
             else:
-                print ValueError("Unknown media type with class: {0}".format(div.get('class')))
+                raise ValueError("Unknown media type with class: {0}. Update the parser.".format(div.get('class')))
 
-
-    print tagged_urls
     return tagged_urls
 
 

scripts/csxj_reprocess_entire_database.py

 
     errors_by_day = dict()
     all_days = p.get_all_days()
-    n_days = len(all_days)
-    subset = int(float(n_days) * 0.05)
-    print("    total days: {0}".format(n_days))
-    print("    picking {0} random days".format(subset))
-    random.shuffle(all_days)
-    for day in all_days[:subset]:
+#    n_days = len(all_days)
+#    subset = int(float(n_days) * 0.05)
+#    print("    total days: {0}".format(n_days))
+#    print("    picking {0} random days".format(subset))
+#    random.shuffle(all_days)
+    for day in all_days[:]:
         errors_by_batch = dict()
         for batch_hour in p.get_all_batch_hours(day):
             batch_root_dir = os.path.join(p.directory, day, batch_hour)
 
 
 if __name__ == "__main__":
-#    import argparse
-#    parser = argparse.ArgumentParser(description='Utility functions to troubleshoot queue management')
-#    parser.add_argument('--source_jsondb', type=str, dest='source_jsondb', required=True, help='source json db root directory')
-#    parser.add_argument('--dest_jsondb', type=str, dest='dest_jsondb', required=True, help='dest json db root directory')
-#
-#    args = parser.parse_args()
+    import argparse
+    parser = argparse.ArgumentParser(description='Utility functions to troubleshoot queue management')
+    parser.add_argument('--source_jsondb', type=str, dest='source_jsondb', required=True, help='source json db root directory')
+    parser.add_argument('--dest_jsondb', type=str, dest='dest_jsondb', required=True, help='dest json db root directory')
 
-    source_root = "/Users/sevas/Documents/juliette/json_db_0_5"
-    dest = "/Users/sevas/Documents/juliette/json_db_0_5_reprocess"
+    args = parser.parse_args()
+
+#    source_root = "/Users/sevas/Documents/juliette/json_db_0_5"
+#    dest = "/Users/sevas/Documents/juliette/json_db_0_5_reprocess"
+    source_root = args.source_jsondb
+    dest = args.dest_jsondb
 
     main(source_root, dest)