Commits

Timo Sulg committed 2bc3e03

For Chapter.4 - add SqlAlchemy and NLTK examples

  • Participants
  • Parent commits 2db993d

Comments (0)

Files changed (6)

File Chapter4/advanced_nltk.py

+'''
+Chapter 4.3 Extracting Intelligence step by step
+Text processing with NLTK module.
+
+Remember typical steps in text analysis are:
+    * tokenization - parse the text to generate terms
+    * normalization - convert the terms to lowercase
+    * stop-word elimination - eliminate terms that appear very often
+        and don't have semantic value
+    * stemming - convert the terms into their stemmed forms
+
+PS: There is a superb book, "Python Text Processing with NLTK 2.0";
+find a copy to learn more about NLTK.
+'''
+from nltk.corpus import webtext
+from ntlk.coprus import stopwords
+from ntlk.tokenize import sent_tokenize, word_tokenize
+from nltk.probability import FreqDist
+
+def demo():
+    """ introduce NTLK """
+
+    #get tokenized text - for this example we will use  Monty.Python Holy Grail
+    terms = [w.lower() for w in webtext.words('grail.txt')]
+    #eliminate stopwords
+    stopset = set(stopwords.words('english')) #more langs: stopwords.fileids()
+    filter_stops = lambda w: len(w) < 3 or w in stopset
+    terms
+
+def demo2():
+    """demo with realdata and usingour data base schema """
+    paragraph = "Hello! Thats just unfinished example. Thanks for reading"
+    sentence = sent_tokenize(paragraph)
+    words = word_tokenize(sentence[0])
+    print words
+    #TODO: eliminate words
+    #calculate frequencies
+    fd = FreqDist()
+    for word in words:
+        fd.inc(word)
+
+
+# Run the corpus demo when this module is executed as a script.
+if __name__ == "__main__":
+    demo()
+

File Chapter4/database_schema.py

+'''
+    Database schema for Chapter.4
+    requirements:
+        sqlite
+Comment:
+    I just rewrote the author's schema - it is neither a perfect real-life example
+    nor a perfect demo, but it is good enough for studying the SQLAlchemy library.
+'''
+
+from sqlalchemy import Table, Column
+from sqlalchemy import Integer, String, DateTime
+from sqlalchemy import MetaData
+from sqlalchemy import ForeignKey
+from sqlalchemy.orm import mapper
+from sqlalchemy.ext.declarative import declarative_base
+
+from datetime import datetime
+
+
+# Tables are defined within a catalog object called MetaData.
+metadata = MetaData()
+
+# Define the blogs table and, separately, the Blog class that maps a Python
+# object to a row of it. This style keeps the logic (the Python object) apart
+# from the real database table. Columns: integer primary key, a name of up to
+# 200 characters, the owning user (foreign key to users.user_id) and the
+# creation timestamp.
+blogs_table = Table('blogs', metadata,
+        Column('blog_id',Integer, primary_key = True),
+        Column("blog_name", String(200)),
+        Column("user_id", Integer, ForeignKey('users.user_id')),
+        Column('create_date', DateTime)
+        )
+
+class Blog(object):
+    ''' mapping python object to db record '''
+    def __init__(self, name, user_id, create_date = None):
+        self.blog_name = name
+        self.user_id = user_id
+        self.create_date = create_date if create_date != None else datetime.utcnow()
+
+    def __repr__(self):
+        return "<Blog('%s', '%s', '%s')" % (self.blog_name,
+            self.user_id, self.create_date)
+
+mapper(Blog, blogs_table) #map Blog class with blogs table
+
+
+#Showing how to create table,class and mapper together, once and declaratively
+Base = declarative_base()
+
+
+class User(Base):
+    """ i added it myself - to normalize Blogs table"""
+    __tablename__ = "users"
+    user_id = Column( Integer, primary_key = True)
+    username = Column(String)
+    name = Column( String)
+    passwd = Column( String)
+
+    def __init__(self, name,username, passwd):
+        self.name = name
+        self.username = username
+        self.passwd = passwd #TODO: god sake, you'll encrypt in reallife examples
+
+    def __repr__(self):
+        return "User('%s', '%s')" % (self.user_id, self.name)
+
+class BlogEntry(Base):
+    ''' ...  '''
+    __tablename__ = 'blog_entries'
+    blog_entry_id = Column( Integer, primary_key = True)
+    blog_id =  Column( Integer, ForeignKey('blogs.blog_id'))
+    title = Column( String(200))
+    body = Column( String) #TODO: in book there is BLOB
+    permalink = Column(String)
+    version_id = Column( Integer)
+    create_date = Column( DateTime)
+    last_update_date = Column( DateTime)
+    count = 0
+
+    def __init__(self, blog_id, title, body) :
+        self.blog_id = blog_id
+        self.title = title
+        self.body = body
+        self.version_id  =  self.count + 1 #actually need more advanced solution
+        self.create_date =  datetime.utcnow() #samestory
+        self.update_date = datetime.utcnow() #samestory #2
+
+    def __repr__(self):
+        return "%s('%s, %s')"%(self.__name__,
+                self.last_update_date, self.title)
+
+class BlogComment(Base):
+    """ comment object for  comment of blog entry -
+    as you see it also includes userinfo - for anon users"""
+    __tablename__ = "blog_entry_comment"
+    blog_comment_entry_id = Column( Integer, primary_key = True)
+    blog_entry_id = Column( Integer, ForeignKey('blog_entries.blog_entry_id'))
+    comment = Column( String)
+    name = Column( String)
+    email = Column(String)
+    url = Column(String)
+    visibility = Column(Integer)
+    create_date = Column( DateTime)
+
+    def __init__(self, comment, name, email, url, visibility = True):
+        self.comment = comment
+        self.name = name
+        self.email = email
+        self.url = url
+        self.visibility = visibility #default all comments are visible
+        self.create_date = datetime.utcnow()
+
+    def __repr__(self):
+        return "%s('%s, %s')"%(self.__name__,
+                self.create_date, self.name)
+
+#following classes are not used in demo
+
+class ReferenceWeblog(Base):
+    '''table that links entries '''
+    __tablename__ = "reference_weblog"
+    ref_id = Column( Integer, primary_key = True)
+    blog_entry_id = Column( Integer, ForeignKey('blog_entries.blog_entry_id'))
+    title = Column( String)
+    track_time = Column(DateTime)
+    url = Column( String)
+    visibility = Column(Integer)
+    create_date = Column(DateTime)
+
+    def __init__(self, ref_id, blog_entry_id, title, url = "",visibility = True):
+        self.ref_id = ref_id
+        self.blog_entry_id = blog_entry_id
+        self.title = title if len(title) > 0 else "Lazy user"
+        self.visibility = visibility
+        self.url = url
+        self.create_date = datetime.utcnow()
+        self.tracktime = datetime.utcnow() #FIX: what is point of this field???
+
+    def __repr__(self):
+        return "%s('%s',Visible:'%s', %s )" % ( self.__name__,
+                self.create_date, self.visibility, self.title)
+
+class BlogHistory(Base):
+    ''' '''
+    __tablename__ = "blog_entry_history"
+    history_id = Column(Integer, primary_key = True)
+    blog_entry_id = Column(Integer)
+    version_id = Column(Integer)
+    blog_id = Column(Integer)
+    title = Column(String)
+    body = Column(String)
+    permalink_url = Column(String)
+    create_date = Column(DateTime)
+    last_update_date = Column(DateTime)
+
+    def __init__(self, blog_entry_id, blog_id, version_id, title, body):
+        self.blog_entry_id = blog_entry_id
+        self.blog_id = blog_id
+        self.title = title
+        self.body = body
+        self.create_date = DateTime.utcnow() #Fix: in reallife solution - fix it
+        self.last_update_date = DateTime.utcnow()
+
+    def __repr__(self):
+        return "%s('%s', %s )" % ( self.__name__,
+                self.create_date, self.title)
+

File Chapter4/database_schema.pyc

Binary file added.

File Chapter4/dummy_data.py

+'''
+This module includes dummy data.
+
+'''
+# Each tuple is unpacked into User(name, username, passwd).
+users = [
+            ("Chuck Norris", "texasranger", "god"),
+            ("Charlie Sheen", "sugardaddy", "boobs"),
+            ("Justin Beaver", "bigboy12",   "ILoveMum!!")
+        ]
+
+# Blogs: each tuple is unpacked into Blog(name, user_id).
+# The user ids presumably match the insertion order of `users` above,
+# starting at 1 - confirm against the database's id assignment.
+blogs = [ ("Chuck kicks ass", 1),
+          ("Chuck says...", 1),
+          ("Charlie's list", 2),
+          ("Charlie cookbook", 2),
+          ("2 and 1/2 men", 2),
+          ("Mum and me", 3),
+        ]
+
+# Blog entries: each tuple is unpacked into BlogEntry(blog_id, title, body).
+# NOTE(review): if blogs get ids 1..6 in insertion order, the entries with
+# blog_id 4 and 5 land on "Charlie cookbook" and "2 and 1/2 men", while their
+# titles suggest ids 5 and 6 were meant - confirm.
+entities = [
+            (1, "Welcome", "I just post my first roundhouse kick"),
+            (1, "Second", "I just kicked second time."),
+            (1, "Third post", "I called to Steve and moves eyebrow. He had smt in mind."),
+            (3, "Todays candys", "Sheryl, Lee"),
+            (3, "Tonight", "I take 2whisky and called pizza. Lena, Sheena"),
+            (4, "Sitcom and Charlie", "It sucks"),
+            (5, "I love Mum", "Mum,mum"),
+            (5, "New hit", "Yeah, my new hit \"Baby,Baby, babie yeah\" is out.")
+        ]
+
+
+
+
+
+

File Chapter4/dummy_data.pyc

Binary file added.

File Chapter4/init_db.py

+'''
+This module helps to initialize the demo database and
+inserts dummy data into the DB.
+
+'''
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import MetaData
+
+#star imports are usually a bad idea - luckily this is only a small demo
+from database_schema import *
+from dummy_data import *
+
+
+#TODO: dummy data to special module ?
+#dummy users - tuple layout: name, username, passwd
+
+
+def initialize_demo():
+    """ add tables and inserts data into fresh tables"""
+    #make connection and initialize session
+    Session = sessionmaker()
+    engine = create_engine("sqlite:///:memory:")
+    Session.configure(bind = engine)
+    metadata = MetaData()
+    session = Session()
+
+    init_dummy(session, ["users","blogs", "entities"])
+
+    #test inserting
+    for instance in session.query(User).order_by(User.user_id):
+        print instance.name
+    #create structure
+    #insert dummy data
+    session.commit()
+
+#TODO: refactor it
+def init_dummy(session, tables):
+    ''' insert dummy data to specified tables'''
+    Table = None
+    for table in tables:
+        if table == "users":
+            Table = User
+            data = users
+        elif table == "blogs":
+            Table = Blog
+            data = blogs
+        elif table == "entities":
+            Table = BlogEntry
+            data = entities
+        else:
+            Table = None
+            data = None
+        #add values to table
+        if Table != None:
+            rows = [Table(*values) for values in data]
+            session.add_all(rows)
+            session.commit()
+
+
+# Script entry point: build and populate the demo database.
+if __name__ == "__main__":
+    """ if users runs code on command-line """
+    initialize_demo()