Mike Lissner committed e5e600d

Many changes:
- accepts Brian's pull request at long last.
- fixes titlecase and adds many test cases
- fixes many errors in pull request
- finishes reorg of scrapers


Files changed (15)

.settings/org.eclipse.core.resources.prefs

-eclipse.preferences.version=1
-encoding//lib/html_utils.py=utf-8
-encoding//lib/parse_dates.py=utf-8
-encoding//lib/string_utils.py=utf-8
-encoding//opinions/united_states/state/ak.py=utf-8
-encoding//opinions/united_states/state/id_civil.py=utf-8
-encoding//opinions/united_states/state/idaho_civil.py=utf-8
 import re
 import requests
 
-from juriscraper.lib.string_utils import clean_string, harmonize
+from juriscraper.lib.string_utils import clean_string, harmonize, force_unicode
 
 LOG_FILENAME = '/var/log/juriscraper/debug.log'
 
         return self
 
     def _clean_text(self, text):
-        # this function provides the opportunity to clean text before it's made
-        # into an HTML tree.
+        ''' Cleans up text before we make it into an HTML tree:
+            1. Nukes <![CDATA stuff.
+            2. ?
+        '''
         text = re.sub(r'<!\[CDATA\[', '', text)
         text = re.sub(r'\]\]>', '', text)
         return text
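
The two substitutions simply strip the CDATA wrapper so lxml sees plain markup. For example (hypothetical input):

    import re
    text = '<![CDATA[<opinions><opinion/></opinions>]]>'
    text = re.sub(r'<!\[CDATA\[', '', text)
    text = re.sub(r'\]\]>', '', text)
    print text  # <opinions><opinion/></opinions>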
 
     def _check_sanity(self):
         '''Check that the objects attributes make sense:
-        1. Do all the attributes have the same length?
-        2. Do we have any content at all?
-        3. Is there a bare minimum of meta data? 
-        4. ?
+            1. Do all the attributes have the same length?
+            2. Do we have any content at all?
+            3. Is there a bare minimum of metadata?
+            4. ?
 
         If sanity is OK, no return value. If not, raise InsanityException or
         warnings, as appropriate.
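
For context, a minimal sketch of what those checks might look like; the attribute names and InsanityException appear elsewhere in this commit, but the body below is hypothetical, not the shipped implementation:

    def _check_sanity(self):
        # Hypothetical sketch only.
        attrs = ['case_names', 'case_dates', 'download_urls']
        lengths = [len(getattr(self, a)) for a in attrs
                   if getattr(self, a) is not None]
        # 1. Do all the attributes have the same length?
        if len(set(lengths)) > 1:
            raise InsanityException('Attribute lengths differ: %s' % lengths)
        # 2. Do we have any content at all?
        if not lengths or lengths[0] == 0:
            raise InsanityException('No content found during crawl.')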

lib/string_utils.py

 import re
 
 # For use in titlecase
-BIG = '3D|AFL|AKA|A/K/A|BMG|CBS|CDC|CDT|CEO|CIO|CNMI|D/B/A|DOJ|DVA|EFF|FCC|FTC|IBM|II|III|IV|LLC|LLP|MCI|MJL|MSPB|NLRB|UPS|RSS|SEC|UMG|USA|USC|USPS|WTO'
+BIG = ('3D|AFL|AKA|A/K/A|BMG|CBS|CDC|CDT|CEO|CIO|CNMI|D/B/A|DOJ|DVA|EFF|FCC|'
+       'FTC|IBM|II|III|IV|LLC|LLP|MCI|MJL|MSPB|NLRB|UPS|RSS|SEC|UMG|USA|USC|'
+       'USPS|WTO')
 SMALL = 'a|an|and|as|at|but|by|en|for|if|in|is|of|on|or|the|to|v\.?|via|vs\.?'
-NUMS = '0|1|2|3|4|5|6|7|8|9'
+NUMS = '0123456789'
 PUNCT = r"""!"#$¢%&'‘()*+,\-./:;?@[\\\]_—`{|}~"""
-WEIRD_CHARS = r"""¼½¾§ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜßàáâãäåæçèéêëìíîïñòóôœõöøùúûüÿ"""
-BIG_WORDS = re.compile(r'^(%s)$' % BIG, re.I)
+WEIRD_CHARS = r'¼½¾§ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜßàáâãäåæçèéêëìíîïñòóôœõöøùúûüÿ'
+BIG_WORDS = re.compile(r'^(%s)[%s]?$' % (BIG, PUNCT), re.I)
 SMALL_WORDS = re.compile(r'^(%s)$' % SMALL, re.I)
 INLINE_PERIOD = re.compile(r'[a-z][.][a-z]', re.I)
+INLINE_SLASH = re.compile(r'[a-z][/][a-z]', re.I)
+INLINE_AMPERSAND = re.compile(r'([a-z][&][a-z])(.*)', re.I)
 UC_ELSEWHERE = re.compile(r'[%s]*?[a-zA-Z]+[A-Z]+?' % PUNCT)
 CAPFIRST = re.compile(r"^[%s]*?([A-Za-z])" % PUNCT)
 SMALL_FIRST = re.compile(r'^([%s]*)(%s)\b' % (PUNCT, SMALL), re.I)
 SMALL_LAST = re.compile(r'\b(%s)[%s]?$' % (SMALL, PUNCT), re.I)
-SUBPHRASE = re.compile(r'([:.;?!][ ])(%s)' % SMALL)
+SUBPHRASE = re.compile(r'([:;?!][ ])(%s)' % SMALL)
 APOS_SECOND = re.compile(r"^[dol]{1}['‘]{1}[a-z]+$", re.I)
 ALL_CAPS = re.compile(r'^[A-Z\s%s%s%s]+$' % (PUNCT, WEIRD_CHARS, NUMS))
 UC_INITIALS = re.compile(r"^(?:[A-Z]{1}\.{1}|[A-Z]{1}\.{1}[A-Z]{1})+,?$")
-MAC_MC = re.compile(r"^([Mm]a?c)(\w+)")
+MAC_MC = re.compile(r'^([Mm]a?c)(\w+.*)')
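
Note the optional trailing-punctuation class in BIG_WORDS: entries match bare or with one attached punctuation mark, which the 'LLC.' doctests below rely on. A quick interactive sketch (assuming the constants above):

    >>> bool(BIG_WORDS.match('llc'))   # bare acronym; re.I makes it case-blind
    True
    >>> bool(BIG_WORDS.match('llc.'))  # acronym plus trailing period
    True
    >>> bool(BIG_WORDS.match('llcs'))  # 's' is not punctuation
    False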
 def titlecase(text, DEBUG=False):
     '''Titlecases input text
 
     The list of "SMALL words" which are not capped comes from
     the New York Times Manual of Style, plus 'vs' and 'v'.
 
-    List of "BIG words" grows organically over time as entries are needed.
+    This will fail on multi-sentence input when the first word of a
+    sentence is a SMALL word, because SUBPHRASE does not treat '.' as a
+    sub-phrase boundary.
+
+    List of "BIG words" grows over time as entries are needed.
     '''
 
-    # make all input uppercase.
-    text = text.upper()
+    if text.replace('v', '').isupper():
+        if DEBUG:
+            print "Entire string is uppercase, thus lowercasing."
+        # If the entire string (ignoring lowercase 'v') is uppercase, as in
+        # 'KEVIN O'CONNELL v. KELLY HARRINGTON', lowercase it so the logic
+        # below can recapitalize it word by word.
+        text = text.lower()
 
     lines = re.split('[\r\n]+', text)
     processed = []
             if all_caps:
                 if UC_INITIALS.match(word):
                     if DEBUG:
-                        print "UC_INITIALS match for: " + word
+                        print "  UC_INITIALS match for: " + word
                     tc_line.append(word)
                     continue
                 else:
                     if DEBUG:
-                        print "Not initials. Lowercasing: " + word
+                        print "  Not initials. Lowercasing: " + word
                     word = word.lower()
 
             if APOS_SECOND.match(word):
-                word = word.replace(word[0], word[0].upper())
-                word = word.replace(word[2], word[2].upper())
+                if DEBUG:
+                    print "  APOS_SECOND matched. Fixing it: " + word
+                word = word[0:3].upper() + word[3:]
                 tc_line.append(word)
                 continue
 
-            if INLINE_PERIOD.search(word) or UC_ELSEWHERE.match(word):
+            if INLINE_PERIOD.search(word):
+                if DEBUG:
+                    print "  INLINE_PERIOD matched. Uppercasing if == 1 char: " + word
+                parts = word.split('.')
+                new_parts = []
+                for part in parts:
+                    if len(part) == 1:
+                        # It's an initial like U.S.
+                        new_parts.append(part.upper())
+                    else:
+                        # It's something like '.com'
+                        new_parts.append(part)
+                word = '.'.join(new_parts)
+                tc_line.append(word)
+                continue
+
+            if INLINE_SLASH.search(word):
+                # This repeats INLINE_PERIOD. Could be more elegant.
+                if DEBUG:
+                    print "  INLINE_SLASH matched. Uppercasing if == 1 char: " + word
+                parts = word.split('/')
+                new_parts = []
+                for part in parts:
+                    if len(part) == 1:
+                        # It's an initial like A/M
+                        new_parts.append(part.upper())
+                    else:
+                        # It's something like 'True/False'
+                        new_parts.append(part)
+                word = '/'.join(new_parts)
+                tc_line.append(word)
+                continue
+
+            amp_match = INLINE_AMPERSAND.match(word)
+            if amp_match:
+                if DEBUG:
+                    print "  INLINE_AMPERSAND matched. Uppercasing: " + word
+                tc_line.append("%s%s" % (amp_match.group(1).upper(),
+                                         amp_match.group(2)))
+                continue
+
+            if UC_ELSEWHERE.match(word):
+                if DEBUG:
+                    print "  UC_ELSEWHERE matched. Leaving unchanged: " + word
                 tc_line.append(word)
                 continue
 
             if SMALL_WORDS.match(word):
+                if DEBUG:
+                    print "  SMALL_WORDS matched. Lowercasing: " + word
                 tc_line.append(word.lower())
                 continue
 
             if BIG_WORDS.match(word):
+                if DEBUG:
+                    print "  BIG_WORDS matched. Uppercasing: " + word
                 tc_line.append(word.upper())
                 continue
 
             match = MAC_MC.match(word)
             if match and (word != 'mack'):
+                if DEBUG:
+                    print "  MAC_MAC matched. Capitlizing: " + word
                 tc_line.append("%s%s" % (match.group(1).capitalize(),
                                       match.group(2).capitalize()))
                 continue
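
As the comment in the INLINE_SLASH branch concedes, that block repeats INLINE_PERIOD almost verbatim. A shared helper could collapse both; a hypothetical refactor, not part of this commit:

    def _upcase_initials(word, sep):
        # Uppercase one-letter segments ('u.s.' -> 'U.S.', 'a/m' -> 'A/M')
        # while leaving longer segments like 'example.com' untouched.
        return sep.join(part.upper() if len(part) == 1 else part
                        for part in word.split(sep))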
 
 
 def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
-    # Borrows heavily from django.utils.encoding.force_unicde
+    # Borrows heavily from django.utils.encoding.force_unicode.
+    # This should be applied to *input*, not *output*!
     # Handle the common case first, saves 30-40% in performance when s
     # is an instance of unicode. This function gets called often in that
     # setting.
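
The input-not-output warning is the pattern the doctests below follow: decode once at the boundary, work in unicode, encode only when writing out. A sketch:

    from juriscraper.lib.string_utils import force_unicode, titlecase

    raw = 'caf\xc3\xa9 v. lissner'         # UTF-8 bytes, e.g. from requests
    text = force_unicode(raw)              # decode the *input* once
    print titlecase(text).encode('utf-8')  # encode only at *output* time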
-from string_utils import clean_string
-from string_utils import harmonize
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
 
-def test_strings():
+from string_utils import clean_string, force_unicode, harmonize, titlecase
+
+
+def harmonize_and_clean_string_tests():
     '''
     >>> harmonize(clean_string('U.S.A. v. Lissner'))
     u'United States v. Lissner'
     u'United States v. White'
     '''
 
-if __name__ == "__main__":
+
+def titlecase_tests():
+    '''
+    >>> titlecase(force_unicode("Q&A with steve jobs: 'that's what happens in technology'")).encode('utf-8')
+    "Q&A With Steve Jobs: 'That's What Happens in Technology'"
+    >>> titlecase(force_unicode("What is AT&T's problem?")).encode('utf-8')
+    "What is AT&T's Problem?"
+    >>> titlecase(force_unicode('Apple deal with AT&T falls through')).encode('utf-8')
+    'Apple Deal With AT&T Falls Through'
+    >>> titlecase(force_unicode('this v that')).encode('utf-8')
+    'This v That'
+    >>> titlecase(force_unicode('this v. that')).encode('utf-8')
+    'This v. That'
+    >>> titlecase(force_unicode('this vs that')).encode('utf-8')
+    'This vs That'
+    >>> titlecase(force_unicode('this vs. that')).encode('utf-8')
+    'This vs. That'
+    >>> titlecase(force_unicode("The SEC's Apple Probe: What You Need to Know")).encode('utf-8')
+    "The SEC's Apple Probe: What You Need to Know"
+    >>> titlecase(force_unicode("'by the Way, small word at the start but within quotes.'")).encode('utf-8')
+    "'By the Way, Small Word at the Start but Within Quotes.'"
+    >>> titlecase(force_unicode('Small word at end is nothing to be afraid of')).encode('utf-8')
+    'Small Word at End is Nothing to Be Afraid Of'
+    >>> titlecase(force_unicode('Starting Sub-Phrase With a Small Word: a Trick, Perhaps?')).encode('utf-8')
+    'Starting Sub-Phrase With a Small Word: A Trick, Perhaps?'
+    >>> titlecase(force_unicode("Sub-Phrase With a Small Word in Quotes: 'a Trick, Perhaps?'")).encode('utf-8')
+    "Sub-Phrase With a Small Word in Quotes: 'A Trick, Perhaps?'"
+    >>> titlecase(force_unicode('Sub-Phrase With a Small Word in Quotes: "a Trick, Perhaps?"')).encode('utf-8')
+    'Sub-Phrase With a Small Word in Quotes: "A Trick, Perhaps?"'
+    >>> titlecase(force_unicode('"Nothing to Be Afraid of?"')).encode('utf-8')
+    '"Nothing to Be Afraid Of?"'
+    >>> titlecase(force_unicode('"Nothing to be Afraid Of?"')).encode('utf-8')
+    '"Nothing to Be Afraid Of?"'
+    >>> titlecase(force_unicode('a thing')).encode('utf-8')
+    'A Thing'
+    >>> titlecase(force_unicode("2lmc Spool: 'gruber on OmniFocus and vapo(u)rware'")).encode('utf-8')
+    "2lmc Spool: 'Gruber on OmniFocus and Vapo(u)rware'"
+    >>> titlecase(force_unicode('this is just an example.com')).encode('utf-8')
+    'This is Just an example.com'
+    >>> titlecase(force_unicode('this is something listed on del.icio.us')).encode('utf-8')
+    'This is Something Listed on del.icio.us'
+    >>> titlecase(force_unicode('iTunes should be unmolested')).encode('utf-8')
+    'iTunes Should Be Unmolested'
+    >>> text = titlecase(force_unicode('Reading between the lines of steve jobs’s ‘thoughts on music’')).encode('utf-8')
+    >>> result = 'Reading Between the Lines of Steve Jobs\xe2\x80\x99s \xe2\x80\x98thoughts on Music\xe2\x80\x99'
+    >>> text == result
+    True
+    >>> text = titlecase(force_unicode('seriously, ‘repair permissions’ is voodoo')).encode('utf-8')
+    >>> result = 'Seriously, \xe2\x80\x98repair Permissions\xe2\x80\x99 is Voodoo'
+    >>> text == result
+    True
+    >>> titlecase(force_unicode('generalissimo francisco franco: still dead; kieren McCarthy: still a jackass')).encode('utf-8')
+    'Generalissimo Francisco Franco: Still Dead; Kieren McCarthy: Still a Jackass'
+    >>> titlecase(force_unicode('Chapman v. u.s. Postal Service')).encode('utf-8')
+    'Chapman v. U.S. Postal Service'
+    >>> titlecase(force_unicode('Spread Spectrum Screening Llc. v. Eastman Kodak Co.')).encode('utf-8')
+    'Spread Spectrum Screening LLC. v. Eastman Kodak Co.'
+    >>> titlecase(force_unicode('Consolidated Edison Co. of New York, Inc. v. Entergy Nuclear Indian Point 2, Llc.')).encode('utf-8')
+    'Consolidated Edison Co. of New York, Inc. v. Entergy Nuclear Indian Point 2, LLC.'
+    >>> titlecase(force_unicode('Infosint s.a. v. H. Lundbeck A/s')).encode('utf-8')
+    'Infosint S.A. v. H. Lundbeck A/S'
+    >>> titlecase(force_unicode("KEVIN O'CONNELL v. KELLY HARRINGTON")).encode('utf-8')
+    "Kevin O'Connell v. Kelly Harrington"
+    >>> titlecase(force_unicode('International Union of Painter v. J&r Flooring, Inc')).encode('utf-8')
+    'International Union of Painter v. J&R Flooring, Inc'
+    '''
+
+if __name__ == '__main__':
+    '''Run tests with python tests.py'''
     import doctest
     doctest.testmod()

opinions/united_states/federal_appellate/ca4.py

         return [e for e in self.html.xpath('//a[contains(@href, "opinion.pdf")]/@href')]
 
     def _get_case_dates(self):
-        date_regex = re.compile('\d{2}/\d{2}/\d{4}')
+        date_regex = re.compile('\d{1,2}/\d{1,2}/\d{2,4}')
         dates = []
         for e in self.html.xpath('//a[contains(@href, "opinion.pdf")]/following-sibling::text()'):
             try:
                 date_string = date_regex.search(e).group(0)
             except AttributeError:
-                # We have to try a bunch of text notes before we'll get the right ones
+                # We have to try a bunch of text nodes before we'll get the right ones
                 continue
-            dates.append(date.fromtimestamp(time.mktime(time.strptime(date_string, '%m/%d/%Y'))))
+            try:
+                dates.append(date.fromtimestamp(time.mktime(time.strptime(date_string, '%m/%d/%Y'))))
+            except ValueError:
+                dates.append(date.fromtimestamp(time.mktime(time.strptime(date_string, '%m/%d/%y'))))
         return dates
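
The fallback works because %Y in time.strptime() accepts only four-digit years, so a two-digit date string raises ValueError and is retried with %y (hypothetical values):

    import time
    time.strptime('07/03/2012', '%m/%d/%Y')  # four-digit year parses
    # time.strptime('7/3/12', '%m/%d/%Y')    # ValueError: %Y wants 4 digits
    time.strptime('7/3/12', '%m/%d/%y')      # two-digit year needs %y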
 
     def _get_docket_numbers(self):

opinions/united_states/federal_special/__init__.py

 __all__ = ['armfor',
-           'cavc'
-           'cavc_full_court'
+           'cavc',
+           'cavc_full_court',
            'cit',
            'tax',
            'uscfc',
            'uscfc_u',
            'uscfc_vaccine',
-           'uscfc_vaccine_u', ]
+           'uscfc_vaccine_u']
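
The added commas fix a real bug, not just style: Python concatenates adjacent string literals, so the old list silently fused two module names into one bogus entry:

    >>> ['armfor',
    ...  'cavc'
    ...  'cavc_full_court',
    ...  'cit']
    ['armfor', 'cavccavc_full_court', 'cit']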

opinions/united_states/federal_special/uscfc.py

 Court Short Name: Fed. Cl."""
 
 from juriscraper.GenericSite import GenericSite
+from juriscraper.lib.string_utils import titlecase
 import time
-import datetime
 from datetime import date
 import re
 from lxml import html
 
+
 class Site(GenericSite):
     def __init__(self):
         super(Site, self).__init__()
-        self.url = (
-            'http://www.uscfc.uscourts.gov/opinions_decisions_general/Published')
+        self.url = 'http://www.uscfc.uscourts.gov/opinions_decisions_general/Published'
         self.court_id = self.__module__
 
     def _get_case_dates(self):
 
     def _get_case_names(self):
         case_names = []
-        for txt in self.html.xpath('//div[2]/table/tbody/tr/td[3]/text()'):        
-            case_names.append(txt.strip()[:-8].replace('[', '').strip())
+        for txt in self.html.xpath('//div[2]/table/tbody/tr/td[3]/text()'):
+            case_names.append(titlecase(txt.strip()[:-8].replace('[', '')))
         return case_names
 
     def _get_docket_numbers(self):
-        docket_numbers = []
-        regex = re.compile("\d\d.\d*[a-zA-Z]")        
-        return [regex.search(html.tostring(
-                            ele, method ='text', encoding='unicode')).group(0)
-            for ele in self.html.xpath('//div[2]/table/tbody/tr/td[3]')]
-        return docket_numbers
+        regex = re.compile(r"\d\d.\d*[a-zA-Z]")
+        return [regex.search(html.tostring(ele, method='text', encoding='unicode')).group(0)
+                for ele in self.html.xpath('//div[2]/table/tbody/tr/td[3]')]
 
     def _get_summaries(self):
         summaries = []
         for txt in self.html.xpath('//div[2]/table/tbody/tr/td[4]/a/text()'):
             summaries.append(txt)
         return summaries
-    
+
     def _get_download_urls(self):
         download_urls = []
         for url in self.html.xpath('//div[2]/table/tbody/tr/td[4]/a/@href'):
         return download_urls
 
     def _get_precedential_statuses(self):
-        return ['Published'] * len(self.case_names)
+        return ['Published'] * len(self.case_names)

opinions/united_states/federal_special/uscfc_u.py

 Court Short Name: Fed. Cl."""
 
 import uscfc
+from juriscraper.lib.string_utils import titlecase
+
 
 class Site(uscfc.Site):
     def __init__(self):
         super(Site, self).__init__()
-        self.url = (
-            'http://www.uscfc.uscourts.gov/opinions_decisions_general/Unpublished')
+        self.url = 'http://www.uscfc.uscourts.gov/opinions_decisions_general/Unpublished'
         self.court_id = self.__module__
 
     def _get_precedential_statuses(self):
-        return ['Unpublished'] * len(self.case_names)
+        return ['Unpublished'] * len(self.case_names)
+
+    def _get_summaries(self):
+        summaries = []
+        for txt in self.html.xpath('//div[2]/table/tbody/tr/td[4]/a/text()'):
+            summaries.append(titlecase(txt.lower()))
+        return summaries

opinions/united_states/federal_special/uscfc_vaccine_u.py

 class Site(uscfc.Site):
     def __init__(self):
         super(Site, self).__init__()
-        self.url = (
-            'http://www.uscfc.uscourts.gov/opinions_decisions_vaccine/Unpublished')
+        self.url = 'http://www.uscfc.uscourts.gov/opinions_decisions_vaccine/Unpublished'
         self.court_id = self.__module__
 
     # Exclude rows without opinions by ensuring there is a sibling row that
     def _get_case_names(self):
         case_names = []
         for txt in self.html.xpath(
-                '//div[2]/table/tbody/tr/td[3][../td[4]/a]/a/text()'):        
+                '//div[2]/table/tbody/tr/td[3][../td[4]/a]/a/text()'):
             case_names.append(txt.strip()[:-8].replace('[', '').strip())
         return case_names
 
     def _get_docket_numbers(self):
         docket_numbers = []
-        regex = re.compile("\d\d.\d*[a-zA-Z]")        
-        return [regex.search(html.tostring(
-                            ele, method ='text', encoding='unicode')).group(0)
-            for ele in self.html.xpath(
-                '//div[2]/table/tbody/tr/td[3][../td[4]/a]')]
+        regex = re.compile(r"\d{1,2}.\d*[a-zA-Z]?")
+        for txt in self.html.xpath('//div[2]/table/tbody/tr/td[3][../td[4]/a]/a/text()'):
+            try:
+                docket_numbers.append(regex.search(txt).group(0))
+            except AttributeError:
+                # Happens when the regex fails or there's truly no docket number.
+                docket_numbers.append('')
         return docket_numbers
 
     def _get_precedential_statuses(self):
-        return ['Unpublished'] * len(self.case_names)
+        return ['Unpublished'] * len(self.case_names)
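
The except AttributeError works because re.search() returns None on a miss, and None has no .group(). An explicit guard (hypothetical alternative) says the same thing more directly:

    import re
    regex = re.compile(r"\d{1,2}.\d*[a-zA-Z]?")
    m = regex.search('no docket here')  # None when nothing matches
    print m.group(0) if m else ''       # guard instead of catching AttributeError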

opinions/united_states/state/alaska.py

+'''
+Author: Jordan Atanasov <jordan.atanasov@commetric.com>
+History:
+    2012-05-07: Written by Jordan.
+    2012-07-06: Updated by mlr to only get the first ten items.
+Notes: Only queries first ten dates. Beyond that, they get messy.
+'''
+
 from juriscraper.GenericSite import GenericSite
 import time
 from datetime import date
         self.court_id = self.__module__
 
     def _get_case_names(self):
-        return [e for e in self.html.xpath("//ul/li[descendant::a/em]//em/text()")]
+        return [e for e in self.html.xpath("//ul[position() > 1 and position() <= 10]/li[descendant::a/em]//em/text()")]
 
     def _get_download_urls(self):
-        return [h for h in self.html.xpath("//ul[position() > 1]/li/a[child::em]/@href")]
+        return [h for h in self.html.xpath("//ul[position() > 1 and position() <= 10]/li/a[child::em]/@href")]
 
     def _get_case_dates(self):
         dates = []
-        for h2_element in self.html.xpath('//h2[following-sibling::ul//a/em]'):
+        for h2_element in self.html.xpath('//h2[position() <= 10][following-sibling::ul//a/em]'):
             date_string = str(h2_element.xpath('./text()')[0])
             try:
                 date_obj = date.fromtimestamp(
         return dates
 
     def _get_docket_numbers(self):
-        return [t for t in self.html.xpath("//ul[position() > 1]/li[descendant::a/em]/text()[1]")]
+        return [t for t in self.html.xpath("//ul[position() > 1 and position() <= 10]/li[descendant::a/em]/text()[1]")]
 
     def _get_precedential_statuses(self):
         return ["Published"] * len(self.case_names)

opinions/united_states/state/cal_u.py

-import cal_supreme
+import cal
 
 
-class Site(cal_supreme.Site):
+class Site(cal.Site):
     def __init__(self):
         super(Site, self).__init__()
-        self.url = 'http://www.courtinfo.ca.gov/cms/npopinions.htm'
+        self.url = 'http://www.courts.ca.gov/cms/npopinions.htm'
         self.court_id = self.__module__
 
     def _get_precedential_statuses(self):

opinions/united_states/state/mich.py

 """Scraper for the Supreme Court of Michigan
 CourtID: mich
 Court Short Name: Mich.
-Backscrapers possible back to 2007-08 term on this same page by parsing tables 
+Backscrapers possible back to 2007-08 term on this same page by parsing tables
 further down."""
 
 from juriscraper.GenericSite import GenericSite
 import time
 from datetime import date
 
+
 class Site(GenericSite):
     def __init__(self):
         super(Site, self).__init__()
-        self.url = (
-            'http://courts.michigan.gov/supremecourt/Clerk/Opinions.html')
+        self.url = 'http://courts.michigan.gov/supremecourt/Clerk/Opinions.html'
         self.court_id = self.__module__
 
     def _get_case_dates(self):
         dates = []
-        for txt in self.html.xpath(
-            '//table[4]/tr/td/table[3]/tr/td[1]/text()'):
+        for txt in self.html.xpath('//table[4]/tr/td/table[3]/tr/td[1]/text()'):
             dates.append(date.fromtimestamp(time.mktime(time.strptime(
                 txt.strip(), '%m/%d/%y'))))
         return dates
 
     def _get_docket_numbers(self):
-        return [txt for txt in self.html.xpath(
-            '//table[4]/tr/td/table[3]/tr/td[2]/text()')]
+        return [txt for txt in self.html.xpath('//table[4]/tr/td/table[3]/tr/td[2]/text()')]
 
     def _get_case_names(self):
         case_names = []
         # Some stray p tags and span tags make this xpath complex.
-        for txt in self.html.xpath(
-            '//table[4]/tr/td/table[3]/tr/td[3]/a/text() | //table[4]/tr/td/table[3]/tr/td[3]/*/a/text()'):
+        for txt in self.html.xpath('//table[4]/tr/td/table[3]/tr/td[3]/a/text()'
+                                   ' | //table[4]/tr/td/table[3]/tr/td[3]/*/a/text()'):
             # Two case names ignored because we get them under other names.
             if 'People v King (Larry)' in txt:
                 continue
     def _get_download_urls(self):
         download_urls = []
         # Some stray p tags and span tags make this xpath complex.
-        for txt in self.html.xpath(
-            '//table[4]/tr/td/table[3]/tr/td[3]/a/@href | //table[4]/tr/td/table[3]/tr/td[3]/*/a/@href'):
+        for txt in self.html.xpath('//table[4]/tr/td/table[3]/tr/td[3]/a/@href '
+                                   '| //table[4]/tr/td/table[3]/tr/td[3]/*/a/@href'):
             # This table also includes a url from last year we must ignore.
             if '10-11-Term' in txt:
                 continue
             else:
                 download_urls.append(txt)
         # One url is listed 2x that shouldn't be and so 1 instance is removed.
-        download_urls.remove(
-'http://courts.michigan.gov/supremecourt/Clerk/11-12-Term-Opinions/142695.pdf')
+        download_urls.remove('http://courts.michigan.gov/supremecourt/Clerk/11-12-Term-Opinions/142695.pdf')
         return download_urls
 
     def _get_precedential_statuses(self):
-        return ['Published'] * len(self.case_names)
+        return ['Published'] * len(self.case_names)

opinions/united_states/state_backscrapers/__init__.py

            'ind_archive',
            'mich_2007_2008',
            'mich_2008_2009',
-           'mich_2009_2001',
+           'mich_2009_2010',
            'mich_2010_2011', ]

opinions/united_states/state_backscrapers/cal_archive.py

-import cal_supreme
+from juriscraper.opinions.united_states.state import cal_supreme
 
 
 class Site(cal_supreme.Site):
                     print 'EmptyFileError: %s' % download_url
                     print traceback.format_exc()
                     continue
-            except:
+            except Exception:
                 print 'DownloadingError: %s' % download_url
                 print traceback.format_exc()
                 continue
                       'neutral_citations', 'precedential_statuses',
                       'summaries', 'west_citations']
         for attr in attributes:
-            if site.__getattribute__(attr) is not None:
-                print '    %s: %s' % (attr, site.__getattribute__(attr)[i])
+            if getattr(site, attr) is not None:
+                print ('    %s: %s' % (attr, getattr(site, attr)[i])).encode('utf-8')
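
This one line carries two fixes: getattr(site, attr) is the idiomatic spelling of dynamic attribute lookup (calling __getattribute__ directly is rarely warranted), and encoding the formatted string avoids the UnicodeEncodeError Python 2 raises when printing non-ASCII to a pipe. Roughly:

    name = u'Pe\xf1a v. State'
    # print '    %s' % name                  # UnicodeEncodeError on ascii stdout
    print ('    %s' % name).encode('utf-8')  # bytes pass through unchanged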
 
         # Extract the contents using e.g. antiword, pdftotext, etc.
         # extract_doc_content(data)
 
-    print "%s: Successfully crawled." % site.court_id
+    print '%s: Successfully crawled.' % site.court_id
 
 
 def main():
-    print 'Starting up the scraper.'
     global die_now
 
     # this line is used for handling SIGTERM, so things can die safely
 
     usage = ('usage: %prog -c COURTID [-d|--daemon] [-b|--binaries]\n\n'
              'To test ca1, downloading binaries, use: \n'
-             '    python %prog -c opinions.united_states.federal.ca1 -b\n\n'
+             '    python %prog -c opinions.united_states.federal_appellate.ca1 -b\n\n'
              'To test all federal courts, disregarding binaries, use: \n'
-             '    python %prog -c opinions.united_states.federal')
+             '    python %prog -c opinions.united_states.federal_appellate')
     parser = OptionParser(usage)
     parser.add_option('-c', '--courts', dest='court_id', metavar="COURTID",
                       help=('The court(s) to scrape and extract. This should be in '
                                       ['*']).__all__
         except AttributeError:
             # Lacks the __all__ attribute. Probably of the form:
-            # juriscraper.opinions.united_states.federal.ca1
+            # juriscraper.opinions.united_states.federal_appellate.ca1
             mod_str_list = [court_id.rsplit('.', 1)[1]]
         except ImportError:
             parser.error('Unable to import module or package. Aborting.')
 
+        print 'Starting up the scraper.'
         num_courts = len(mod_str_list)
         i = 0
         while i < num_courts:
                                  [mod_str_list[i]])
             try:
                 scrape_court(mod, binaries)
-            except:
-                print '********!! CRAWLER DOWN !!***********'
-                print '*****scrape_court method failed!*****'
-                print '********!! ACTION NEEDED !!**********'
+            except Exception:
+                print '*************!! CRAWLER DOWN !!****************'
+                print '*****scrape_court method failed on mod: %s*****' % mod_str_list[i]
+                print '*************!! ACTION NEEDED !!***************'
                 print traceback.format_exc()
                 i += 1
                 continue