Davide Alberani avatar Davide Alberani committed cb2f74c

helper functions to handle movie AKA titles in various languages

Comments (0)

Files changed (6)

docs/CONTRIBUTORS.txt

 share copyright on some (minor) portions of the code:
 
 NAME: Alberto Malagoli
-CONTRIBUTION: developed the new web site, and detain the copyright of it.
+CONTRIBUTION: developed the new web site, and holds the copyright of it,
+and provided helper functions and other code.
 
 
 NAME: Martin Kirst

docs/Changelog.txt

   Changelog for IMDbPY
   ====================
 
-* What's the new in release 4.9dev20120218 "In Time" (18 Feb 2012)
+* What's the new in release 4.9dev20120225 "Kick-Ass" (25 Feb 2012)
   [general]
   - urls used to access the IMDb site can be configured.
+  - helper functions to handle movie AKAs in various
+    languages (code by Alberto Malagoli).
 
   [http]
   - fix for business information.
 
 __all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company',
             'available_access_systems']
-__version__ = VERSION = '4.9dev20120218'
+__version__ = VERSION = '4.9dev20120225'
 
 # Import compatibility module (importing it is enough).
 import _compat
 # Maps a language to countries where it is the main language.
 # If you want to add an entry for another language or country, mail it to
 # imdbpy-devel@lists.sourceforge.net .
-_LANG_COUNTRIES = {
-    'English': ('USA', 'UK', 'Canada', 'Ireland', 'Australia'),
-    'Italian': ('Italy',),
-    'Spanish': ('Spain', 'Mexico'),
-    'Portuguese': ('Portugal', 'Brazil'),
-    'Turkish': ('Turkey',),
-    #'German': ('Germany', 'East Germany', 'West Germany'),
-    #'French': ('France'),
+LANG_COUNTRIES = {
+    'English': ('Canada', 'Swaziland', 'Ghana', 'St. Lucia', 'Liberia', 'Jamaica', 'Bahamas', 'New Zealand', 'Lesotho', 'Kenya', 'Solomon Islands', 'United States', 'South Africa', 'St. Vincent and the Grenadines', 'Fiji', 'UK', 'Nigeria', 'Australia', 'USA', 'St. Kitts and Nevis', 'Belize', 'Sierra Leone', 'Gambia', 'Namibia', 'Micronesia', 'Kiribati', 'Grenada', 'Antigua and Barbuda', 'Barbados', 'Malta', 'Zimbabwe', 'Ireland', 'Uganda', 'Trinidad and Tobago', 'South Sudan', 'Guyana', 'Botswana', 'United Kingdom', 'Zambia'),
+    'Italian': ('Italy', 'San Marino', 'Vatican City'),
+    'Spanish': ('Spain', 'Mexico', 'Argentina', 'Bolivia', 'Guatemala', 'Uruguay', 'Peru', 'Cuba', 'Dominican Republic', 'Panama', 'Costa Rica', 'Ecuador', 'El Salvador', 'Chile', 'Equatorial Guinea', 'Spain', 'Colombia', 'Nicaragua', 'Venezuela', 'Honduras', 'Paraguay'),
+    'French': ('Cameroon', 'Burkina Faso', 'Dominica', 'Gabon', 'Monaco', 'France', "Cote d'Ivoire", 'Benin', 'Togo', 'Central African Republic', 'Mali', 'Niger', 'Congo, Republic of', 'Guinea', 'Congo, Democratic Republic of the', 'Luxembourg', 'Haiti', 'Chad', 'Burundi', 'Madagascar', 'Comoros', 'Senegal'),
+    'Portuguese': ('Portugal', 'Brazil', 'Sao Tome and Principe', 'Cape Verde', 'Angola',  'Mozambique', 'Guinea-Bissau'),
+    'German': ('Liechtenstein', 'Austria', 'West Germany', 'Switzerland', 'East Germany', 'Germany'),
+    'Arabic': ('Saudi Arabia', 'Kuwait', 'Jordan', 'Oman', 'Yemen', 'United Arab Emirates', 'Mauritania', 'Lebanon', 'Bahrain', 'Libya', 'Palestinian State (proposed)', 'Qatar', 'Algeria', 'Morocco', 'Iraq', 'Egypt', 'Djibouti', 'Sudan', 'Syria', 'Tunisia'),
+    'Turkish': ('Turkey', 'Azerbaijan'),
+    'Swahili': ('Tanzania',),
+    'Swedish': ('Sweden',),
+    'Icelandic': ('Iceland',),
+    'Estonian': ('Estonia',),
+    'Romanian': ('Romania',),
+    'Samoan': ('Samoa',),
+    'Slovenian': ('Slovenia',),
+    'Tok Pisin': ('Papua New Guinea',),
+    'Palauan': ('Palau',),
+    'Macedonian': ('Macedonia',),
+    'Hindi': ('India',),
+    'Dutch': ('Netherlands', 'Belgium', 'Suriname'),
+    'Marshallese': ('Marshall Islands',),
+    'Korean': ('Korea, North', 'Korea, South', 'North Korea', 'South Korea'),
+    'Vietnamese': ('Vietnam',),
+    'Danish': ('Denmark',),
+    'Khmer': ('Cambodia',),
+    'Lao': ('Laos',),
+    'Somali': ('Somalia',),
+    'Filipino': ('Philippines',),
+    'Hungarian': ('Hungary',),
+    'Ukrainian': ('Ukraine',),
+    'Bosnian': ('Bosnia and Herzegovina',),
+    'Georgian': ('Georgia',),
+    'Lithuanian': ('Lithuania',),
+    'Malay': ('Brunei',),
+    'Tetum': ('East Timor',),
+    'Norwegian': ('Norway',),
+    'Armenian': ('Armenia',),
+    'Russian': ('Russia',),
+    'Slovak': ('Slovakia',),
+    'Thai': ('Thailand',),
+    'Croatian': ('Croatia',),
+    'Turkmen': ('Turkmenistan',),
+    'Nepali': ('Nepal',),
+    'Finnish': ('Finland',),
+    'Uzbek': ('Uzbekistan',),
+    'Albanian': ('Albania', 'Kosovo'),
+    'Hebrew': ('Israel',),
+    'Bulgarian': ('Bulgaria',),
+    'Greek': ('Cyprus', 'Greece'),
+    'Burmese': ('Myanmar',),
+    'Latvian': ('Latvia',),
+    'Serbian': ('Serbia',),
+    'Afar': ('Eritrea',),
+    'Catalan': ('Andorra',),
+    'Chinese': ('China', 'Taiwan'),
+    'Czech': ('Czech Republic', 'Czechoslovakia'),
+    'Bislama': ('Vanuatu',),
+    'Japanese': ('Japan',),
+    'Kinyarwanda': ('Rwanda',),
+    'Amharic': ('Ethiopia',),
+    'Persian': ('Afghanistan', 'Iran'),
+    'Tajik': ('Tajikistan',),
+    'Mongolian': ('Mongolia',),
+    'Dzongkha': ('Bhutan',),
+    'Urdu': ('Pakistan',),
+    'Polish': ('Poland',),
+    'Sinhala': ('Sri Lanka',),
 }
 
 # Maps countries to their main language.
 COUNTRY_LANG = {}
-for lang in _LANG_COUNTRIES:
-    for country in _LANG_COUNTRIES[lang]:
+for lang in LANG_COUNTRIES:
+    for country in LANG_COUNTRIES[lang]:
         COUNTRY_LANG[country] = lang
 
 
 This module provides functions not used directly by the imdb package,
 but useful for IMDbPY-based programs.
 
-Copyright 2006-2010 Davide Alberani <da@erlug.linux.it>
+Copyright 2006-2012 Davide Alberani <da@erlug.linux.it>
+               2012 Alberto Malagoli <albemala AT gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 # XXX: find better names for the functions in this module.
 
 import re
+import difflib
 from cgi import escape
 import gettext
 from gettext import gettext as _
                     re_characterRef, _tagAttr, _Container, TAGS_TO_MODIFY
 from imdb import IMDb, imdbURL_movie_base, imdbURL_person_base, \
                     imdbURL_character_base
+
 import imdb.locale
+from imdb.articles import COUNTRY_LANG
 from imdb.Movie import Movie
 from imdb.Person import Person
 from imdb.Character import Character
     return None
 
 
+_re_akas_lang = re.compile('(?:[(])([a-zA-Z]+?)(?: title[)])')
+_re_akas_country = re.compile('\(.*?\)')
+
+# akasLanguages, sortAKAsBySimilarity and getAKAsInLanguage code
+# copyright of Alberto Malagoli (refactoring by Davide Alberani).
+def akasLanguages(movie):
+    """Given a movie, return a list of tuples in (lang, AKA) format;
+    lang can be None, if unable to detect."""
+    lang_and_aka = []
+    akas = set((movie.get('akas') or []) +
+                (movie.get('akas from release info') or []))
+    for aka in akas:
+        # split aka
+        aka = aka.encode('utf8').split('::')
+        # sometimes there is no countries information
+        if len(aka) == 2:
+            # search for something like "(... title)" where ... is a language
+            language = _re_akas_lang.search(aka[1])
+            if language:
+                language = language.groups()[0]
+            else:
+                # split countries using , and keep only the first one (it's sufficient)
+                country = aka[1].split(',')[0]
+                # remove parenthesis
+                country = _re_akas_country.sub('', country).strip()
+                # given the country, get corresponding language from dictionary
+                language = COUNTRY_LANG.get(country)
+        else:
+            language = None
+        lang_and_aka.append((language, aka[0].decode('utf8')))
+    return lang_and_aka
+
+
+def sortAKAsBySimilarity(movie, title, _titlesOnly=True, _preferredLang=None):
+    """Return a list of movie AKAs, sorted by their similarity to
+    the given title.
+    If _titlesOnly is not True, similarity information are returned.
+    If _preferredLang is specified, AKAs in the given language will get
+    a higher score.
+    The return is a list of title, or a list of tuples if _titlesOnly is False."""
+    language = movie.guessLanguage()
+    # estimate string distance between current title and given title
+    m_title = movie['title'].lower()
+    l_title = title.lower()
+    if isinstance(l_title, unicode):
+        l_title = l_title.encode('utf8')
+    scores = []
+    score = difflib.SequenceMatcher(None, m_title.encode('utf8'), l_title).ratio()
+    # set original title and corresponding score as the best match for given title
+    scores.append((score, movie['title'], None))
+    for language, aka in akasLanguages(movie):
+        # estimate string distance between current title and given title
+        m_title = aka.lower()
+        if isinstance(m_title, unicode):
+            m_title = m_title.encode('utf8')
+        score = difflib.SequenceMatcher(None, m_title, l_title).ratio()
+        # if current language is the same as the given one, increase score
+        if _preferredLang and _preferredLang == language:
+            score += 1
+        scores.append((score, aka, language))
+    scores.sort(reverse=True)
+    if _titlesOnly:
+        return [x[1] for x in scores]
+    return scores
+
+
+def getAKAsInLanguage(movie, lang, _searchedTitle=None):
+    """Return a list of AKAs of a movie, in the specified language.
+    If _searchedTitle is given, the AKAs are sorted by their similarity
+    to it."""
+    akas = []
+    for language, aka in akasLanguages(movie):
+        if lang == language:
+            akas.append(aka)
+    if _searchedTitle:
+        scores = []
+        if isinstance(_searchedTitle, unicode):
+            _searchedTitle = _searchedTitle.encode('utf8')
+        for aka in akas:
+            m_aka = aka
+            if isinstance(m_aka):
+                m_aka = m_aka.encode('utf8')
+            scores.append(difflib.SequenceMatcher(None, m_aka.lower(),
+                            _searchedTitle.lower()), aka)
+        scores.sort(reverse=True)
+        akas = [x[1] for x in scores]
+    return akas
+

imdb/parser/http/__init__.py

         ret['info sets'] = ('release dates', 'akas')
         return ret
     get_movie_akas = get_movie_release_dates
+    get_movie_release_info = get_movie_release_dates
 
     def get_movie_vote_details(self, movieID):
         cont = self._retrieve(self.urls['movie_main'] % movieID + 'ratings')
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.