Commits

Frederic De Groef committed e6fe594

added an option to reprocess only from a starting date
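
For context, a minimal sketch of the date filtering this commit introduces, assuming day directories are named 'YYYY-MM-DD' as in the helper's doctests below (the sample day list is made up):

    from datetime import datetime

    def is_after_start_date(start_from, d):
        # True when day d ('YYYY-MM-DD') falls on or after start_from
        fmt = '%Y-%m-%d'
        return datetime.strptime(d, fmt) >= datetime.strptime(start_from, fmt)

    # days before the starting date are skipped during reprocessing
    days = ['2012-07-30', '2012-07-31', '2012-08-01']
    print [d for d in days if is_after_start_date('2012-07-31', d)]
    # prints: ['2012-07-31', '2012-08-01']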

Files changed (2)

csxjdb/utils.py

-import os, os.path
+import os
 from datetime import datetime, time
 
 
     Returns a list of file names corresponding to all the .json files in the supplied directory.
     """
     return [i for i in os.listdir(parent_dir)
-                    if os.path.isfile(os.path.join(parent_dir, i)) and i.endswith(".json")]
+            if os.path.isfile(os.path.join(parent_dir, i)) and i.endswith(".json")]
 
 
 def make_time_from_string(time_string):
     datetime.time(22, 6, 10)
     """
     h, m, s = [int(i) for i in time_string.split('.')]
-    return time(h, m ,s)
-
+    return time(h, m, s)
 
 
 def make_date_from_string(date_string):
     return datetime.strptime(date_string, '%Y-%m-%d')
 
 
-
 def make_date_time_from_string(date_string, time_string):
     return make_date_from_string(date_string), make_time_from_string(time_string)
 
 
-
 def convert_date_to_string(d):
     return d.strftime("%Y-%m-%d")
 
     From a list of string-formatted hours ('HH.MM.SS'), returns the latest hour (also in string form).
 
     For example:
-    
+
     >>> get_latest_hour(['12.00.10', '21.00.00', '01.00.30'])
     '21.00.00'
 
     return max(l, key=lambda x: x[0])[1]
 
 
-
 def get_latest_day(day_directory_names):
     """
     From a list of string-formatted dates ('YYYY-MM-DD'), returns the latest date (also in string form).
     l = [(make_date_from_string(date_string), date_string) for date_string in day_directory_names]
     last_day = max(l, key=lambda x: x[0])[1]
     return last_day
+
+
+def is_after_start_date(start_from, d):
+    """
+    Returns True if day d ('YYYY-MM-DD') falls on or after start_from.
+    An empty start_from matches every day.
+
+    >>> is_after_start_date('2012-12-20', '2012-12-21')
+    True
+
+    >>> is_after_start_date('2012-07-31', '2012-07-31')
+    True
+
+    >>> is_after_start_date('2012-09-05', '2012-07-31')
+    False
+
+    >>> is_after_start_date('2012-12-31', '2013-01-01')
+    True
+
+    >>> is_after_start_date('', '2012-01-01')
+    True
+    """
+    if not start_from:
+        # no starting date supplied: keep every day
+        return True
+    start_date, actual_date = make_date_from_string(start_from), make_date_from_string(d)
+    return actual_date >= start_date
+
+
+if __name__ == '__main__':
+    import sys
+    if '--test' in sys.argv:
+        import doctest
+        doctest.testmod(verbose=True)
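
The __main__ guard above makes the module self-testing; the same check can be run from any Python session (the csxjdb.utils module path is inferred from the call site in the script below):

    import doctest
    import csxjdb.utils
    doctest.testmod(csxjdb.utils)  # runs the docstring examples, reports any failures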

scripts/csxj_reprocess_entire_database.py

     errors_encountered = list()
     raw_data_index_file = os.path.join(raw_data_dir, csxjdb.constants.RAW_DATA_INDEX_FILENAME)
 
+    if not os.path.exists(raw_data_index_file):
+        msg = "IOError: [Errno 2] No such file or directory: " + raw_data_index_file
+        errors_encountered.append(("NO_URL", raw_data_index_file, msg))
+        return reprocessed_articles, errors_encountered
+
     with open(raw_data_index_file, 'r') as f:
         index = json.load(f)
         index = [(raw_data_dir, url, filename) for (url, filename) in index]
 
 
 def reprocess_raw_html(args):
-    provider_name, source_root_dir, dest_root_dir = args
+    provider_name, source_root_dir, dest_root_dir, start_from = args
     p = csxjdb.Provider(source_root_dir, provider_name)
     datasource = NAME_TO_SOURCE_MODULE_MAPPING[provider_name]
     article_count = 0
 
     errors_by_day = dict()
-    all_days = p.get_all_days()
-    for day in all_days[:]:
+    all_days = (d for d in p.get_all_days() if csxjdb.utils.is_after_start_date(start_from, d))
+    for day in all_days:
         errors_by_batch = dict()
         for batch_hour in p.get_all_batch_hours(day):
             batch_root_dir = os.path.join(p.directory, day, batch_hour)
     return article_count, err_dict
 
 
-def main(source_path, dest_path, processes, source_names):
+def main(source_path, dest_path, processes, source_names, start_from):
     if not os.path.exists(dest_path):
         print "°°° Creating missing destination root:", dest_path
         os.makedirs(dest_path)
     if processes > 1:
         import multiprocessing as mp
         p = mp.Pool(processes)
-        results = p.map(reprocess_raw_html, [(name, source_path, dest_path) for name in provider_names if name in source_names])
+        results = p.map(reprocess_raw_html, [(name, source_path, dest_path, start_from) for name in provider_names if name in source_names])
     else:
         results = list()
         for name in [_ for _ in provider_names if _ in source_names]:
             print "***", name
-            results.append(reprocess_raw_html((name, source_path, dest_path)))
+            results.append(reprocess_raw_html((name, source_path, dest_path, start_from)))
 
     n_samples = sum([x[0] for x in results])
     errors_by_source = [x[1] for x in results]
     parser.add_argument('--dest-jsondb', type=str, dest='dest_jsondb', required=True, help='dest json db root directory')
     parser.add_argument('--sources', type=str, dest='source_names', default=make_parser_list(), help='comma-separated list of the sources to consider (default={0})'.format(make_parser_list()))
     parser.add_argument('--processes', type=int, dest='processes', required=False, default=1, help='Number of parallel processes to use (default=1)')
+    parser.add_argument('--start-from', type=str, dest='start_from', default='', help='start reprocessing from this date, skipping earlier days (format: YYYY-MM-DD)')
 
     args = parser.parse_args()
 
     selected_sources = args.source_names.split(',')
 
     print "Using {0} processes".format(args.processes)
-    main(source_root, dest_root, args.processes, selected_sources)
+    main(source_root, dest_root, args.processes, selected_sources, args.start_from)
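
For reference, a hypothetical equivalent of the new flag, calling the script's main() directly (both paths and the source name are placeholders):

    # mirrors: csxj_reprocess_entire_database.py ... --start-from 2012-07-31
    main('/path/to/raw_db', '/path/to/json_db', 4, ['some_source'], '2012-07-31')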