Commits

Adrian Sampson committed 0ef02e0 Merge

Merge pull request #325 from KraYmer/master

lyrics: google backend should turn up more results


Files changed (1)

beetsplug/lyrics.py

     lyrics = re.sub(r'\n +', '\n', lyrics)
     lyrics = re.sub(r' +\n', '\n', lyrics)
     lyrics = TAG_RE.sub('', lyrics) # Strip remaining HTML tags.
+    lyrics = lyrics.replace('\r', '\n')
     lyrics = lyrics.strip()
     return lyrics
 
         log.exception("Failing to normalize '%s'" % (text))
     return urllib.quote(text)
 
+
+BY_TRANS     = ['by', 'par']
+LYRICS_TRANS = ['lyrics', 'paroles']
+
 def is_page_candidate(urlLink, urlTitle, title, artist):
     """Return True if the URL title makes it a good candidate to be a
     page that contains lyrics of title by artist.
     """
     title = slugify(title.lower())
     artist = slugify(artist.lower())
-    urlLink = slugify(urlLink.lower())
+    sitename = re.search(u"//([^/]+)/.*", slugify(urlLink.lower())).group(1)
     urlTitle = slugify(urlTitle.lower())
 
     # Check if URL title contains song title (exact match)
         return True
     # or try extracting song title from URL title and check if
     # they are close enough
-    songTitle = urlTitle.replace('lyrics', '') \
-                        .replace(artist, '').strip('%20')
+    tokens = [by+'%20'+artist for by in BY_TRANS] + \
+             [artist, sitename, sitename.replace('www.','')] + LYRICS_TRANS
+    songTitle = re.sub(u'(%s)' % u'|'.join(tokens), u'', urlTitle).strip('%20')
+
     if songTitle:
         log.debug("Match ratio of '%s' with title: %s" %
                   (songTitle,
         tokensStr[idx] = ltoken[0] + '\n' + ltoken[1]
     return ''.join(tokensStr)
 
-def decimate_line_feeds(text):
-    """Decimate newline characters. By default use only one newline as
-    an end-of-line marker. Keep at most two newlines in a row (e.g., to
-    separate verses).
-    """
-    # Remove first occurrence of \n for each sequence of \n
-    text = re.sub(r'\n(\n+)', '\g<1>', text)
-    # Keep at most two \n in a row
-    text = re.sub(r'\n\n+', '\n\n', text)
-    return text.strip('\n')
-
 def sanitize_lyrics(text):
     """Clean text, returning raw lyrics as output or None if it happens
     that input text is actually not lyrics content.  Clean (x)html tags
     """
     text = strip_cruft(text, False)
 
+    # Suppress advertisements.
+    # Match lines with an opening bracket but no closing one, i.e. lines that
+    # contained an HTML link that was wiped out during scraping.
+    LINK1_RE = re.compile(r'(\(|\[).*[^\)\]]$')
+    # Match lines containing a URL between brackets.
+    LINK2_RE = re.compile(r'(\(|\[).*(http|www).*(\]|\))')
+    text = LINK1_RE.sub('', text)
+    text = LINK2_RE.sub('', text)
+
     # Restore \n in input text
     if '\n' not in text:
         text = insert_line_feeds(text)
 
-    # Suppress advertisements.
-    textLines = text.splitlines(True)
-    # Match lines with an opening bracket but no ending one, ie lines that
-    # contained html link that has been wiped out when scraping.
-    reAdHtml = re.compile(r'(\(|\[).*[^\)\]]$')
-    # Match lines containing url between brackets
-    reAdTxt  = re.compile(r'(\(|\[).*[http|www].*(\]|\))')
-    for line in textLines:
-        if re.match(reAdHtml, line) or re.match(reAdTxt, line):
-            textLines.remove(line)
+    while text.count('\n\n') > text.count('\n')/4:
+        # Remove first occurrence of \n for each sequence of \n
+        text = re.sub(r'\n(\n+)', '\g<1>', text)
 
-    # \n might have been duplicated during the scraping.
-    # decimate \n while number of \n represent more than half the number of
-    # lines
-    while len([x for x in textLines if x == '\n']) >= len(textLines) / 2 - 1:
-        if len(textLines) <= 3:
-            break
-        text = ''.join(textLines)
-        text = decimate_line_feeds(text)
-        textLines = [line.strip(' ') for line in text.splitlines(True)]
+    text = re.sub(r'\n\n+', '\n\n', text)   # keep at most two \n in a row
 
-    return ''.join(textLines)
+    return text
 
 def is_lyrics(text, artist):
     """Determine whether the text seems to be valid lyrics.
     """Scrape lyrics from a URL. If no lyrics can be found, return None
     instead.
     """
-    from bs4 import BeautifulSoup, Tag
+    from bs4 import BeautifulSoup, Tag, Comment
     html = fetch_url(url)
     soup = BeautifulSoup(html)
 
-    # Simplify the code by replacing some markers by the <p> marker
+    for tag in soup.findAll('br'):
+        tag.replaceWith('\n')
+
+    # Remove irrelevant HTML parts.
+    [s.extract() for s in soup(['head', 'script'])]
+    comments = soup.findAll(text=lambda text:isinstance(text, Comment))
+    [s.extract() for s in comments]
+
     try:
-        for tag in soup.findAll(['center', 'blockquote']):
-            pTag = Tag(soup, "p")
-            pTag.contents = tag.contents
-            tag.replaceWith(pTag)
-
-        for tag in soup.findAll(['script', 'a', 'font']):
-            tag.replaceWith('<p>')
+        for tag in soup.findAll(True):
+            tag.name = 'p'          # keep tag contents
 
     except Exception, e:
         log.debug('Error %s when replacing containing marker by p marker' % e,
             exc_info=True)
 
-    for tag in soup.findAll('br'):
-        tag.replaceWith('\n')
-
-    # Keep only tags that can possibly be parent tags and eol
-    for tag in soup.findAll(True):
-        containers = ['div', 'p', 'html', 'body', 'table', 'tr', 'td', 'br']
-        if tag.name not in containers:
-            tag.extract()
-
     # Make better soup from current soup! The previous unclosed <p> sections
     # are now closed.  Use str() rather than prettify() as it's more
     # conservative concerning EOL
         pTag.insert(0, bodyTag)
 
     tagTokens = []
+
     for tag in soup.findAll('p'):
         soup2 = BeautifulSoup(str(tag))
         # Extract all text of <p> section.
         # Lyrics are expected to be the longest paragraph
         tagTokens = sorted(tagTokens, key=len, reverse=True)
         soup = BeautifulSoup(tagTokens[0])
-        if soup.findAll(['div', 'a']):
-            return None
         return unescape(tagTokens[0].strip("\n\r: "))
 
 def fetch_google(artist, title):
                               (item.artist, item.title))
 
         item.lyrics = lyrics
+
         if write:
             item.write()
         lib.store(item)
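
For reference, here is a rough standalone sketch (not part of the commit) of the newline-decimation heuristic that this diff inlines into sanitize_lyrics(): scraping tends to duplicate end-of-line characters, so the loop keeps thinning runs of newlines while pairs of consecutive newlines make up more than a quarter of all newlines, and then caps any remaining run at one blank line between verses. The helper name and the sample text are illustrative only.

import re

def collapse_duplicated_newlines(text):
    # Hypothetical helper; the plugin inlines this logic in sanitize_lyrics().
    # While pairs of consecutive newlines ('\n\n') make up more than a quarter
    # of all newlines, drop the first '\n' of every run of newlines.
    while text.count('\n\n') > text.count('\n') / 4:
        text = re.sub(r'\n(\n+)', r'\g<1>', text)
    # Keep at most two '\n' in a row (a single blank line between verses).
    return re.sub(r'\n\n+', '\n\n', text)

# collapse_duplicated_newlines("line one\n\nline two\n\n\n\nnext verse\n\nlast line\n")
# -> "line one\nline two\n\nnext verse\nlast line\n"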
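Likewise, a simplified sketch of the candidate check that the diff reworks in is_page_candidate(): it extracts the site name from the URL, strips the artist, the site name and common boilerplate words ('lyrics', 'by' and their French counterparts) from the page title, and fuzzy-matches what is left against the song title. The sketch uses plain lowercase strings instead of the plugin's %20-quoted slugs; the function name, the min_ratio threshold and the example values are assumptions, not part of the commit.

import re
from difflib import SequenceMatcher

BY_TRANS = ['by', 'par']
LYRICS_TRANS = ['lyrics', 'paroles']

def rough_is_page_candidate(url, url_title, title, artist, min_ratio=0.6):
    title, artist = title.lower(), artist.lower()
    url_title = url_title.lower()
    # Site name, e.g. 'www.azlyrics.com' from 'http://www.azlyrics.com/...'.
    sitename = re.search(r'//([^/]+)/', url.lower()).group(1)

    # Exact match: the page title contains the song title verbatim.
    if title in url_title:
        return True

    # Strip the artist, the site name and boilerplate words from the page
    # title, then fuzzy-match the remainder against the song title.
    tokens = ['%s %s' % (by, artist) for by in BY_TRANS] + \
             [artist, sitename, sitename.replace('www.', '')] + LYRICS_TRANS
    song_title = re.sub('(%s)' % '|'.join(tokens), '', url_title).strip()
    return SequenceMatcher(None, song_title, title).ratio() >= min_ratio

# rough_is_page_candidate('http://www.azlyrics.com/lyrics/radiohead/creep.html',
#                         'radiohead - creep lyrics', 'Creep', 'Radiohead')  -> True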