Commits

mitsuhiko  committed e964817

Changed the WordPress importer to use the new inject_implicit_paragraphs() function, which injects implicit <p> tags similar to WordPress. This should fix #97.

  • Participants
  • Parent commits d781373

Comments (0)

Files changed (7)

File zine/database.py

     db.Column('comment_id', db.Integer, primary_key=True),
     db.Column('post_id', db.Integer, db.ForeignKey('posts.post_id')),
     db.Column('user_id', db.Integer, db.ForeignKey('users.user_id')),
-    db.Column('author', db.String(100)),
+    db.Column('author', db.String(160)),
     db.Column('email', db.String(250)),
     db.Column('www', db.String(200)),
     db.Column('text', db.Text),

File zine/forms.py

 
 class NewCommentForm(forms.Form):
     """New comment form for authors."""
+    # Implementation detail: the database column for the author name is
+    # longer than the max_length enforced by this form.  However, we don't
+    # want users to enter overly long names here.  The longer column is
+    # reserved for pingbacks and such.
     author = forms.TextField(lazy_gettext(u'Name*'), required=True,
                              max_length=100, messages=dict(
         too_long=lazy_gettext(u'Your name is too long.'),

File zine/importers/__init__.py

                             author.www, author.is_author)
                 if author.pw_hash:
                     user.pw_hash = author.pw_hash
-                user.privileges.update(author.privileges)
+                user.own_privileges.update(author.privileges)
             author_mapping[author.id] = user
         return author_mapping[author.id]
 
     """Represents an author."""
 
     def __init__(self, username, email, real_name=u'', description=u'',
-                 pw_hash=None, is_author=True, extra=None, id=None):
+                 www=u'', pw_hash=None, is_author=True, extra=None,
+                 id=None):
         if id is None:
             id = _make_id(username, email)
         self.id = id
-        self.username = username
+        self.username = username[:30]
         self.real_name = real_name or u''
         self.email = email or u''
         self.description = description or u''
+        self.www = www or u''
         self.privileges = set([ENTER_ADMIN_PANEL])
         self.is_author = is_author
         self.pw_hash = pw_hash
     """Represents a tag."""
 
     def __init__(self, slug, name=None):
-        self.slug = slug
+        self.slug = slug[:150]
         if name is None:
             name = slug
-        self.name = name
+        self.name = name[:100]
 
     @property
     def id(self):
                  pub_date, remote_addr, parser=None, is_pingback=False,
                  status=COMMENT_MODERATED, blocked_msg=u'',
                  parser_data=None):
+        if isinstance(author, basestring):
+            author = author[:400]
         self.author = author
         self.author_email = author_email
         self.author_url = author_url

File zine/importers/feed.py

                 element.findtext(zine.email),
                 element.findtext(zine.real_name),
                 element.findtext(zine.description),
+                element.findtext(zine.www),
                 element.findtext(zine.pw_hash),
                 _to_bool(element.findtext(zine.is_author)),
                 _pickle(element.findtext(zine.extra)),

File zine/importers/wordpress.py

 from zine.utils.validators import is_valid_url
 from zine.utils.admin import flash
 from zine.utils.xml import Namespace, html_entities, escape
+from zine.utils.zeml import parse_html, inject_implicit_paragraphs
 from zine.utils.http import redirect_to
 from zine.models import COMMENT_UNMODERATED, COMMENT_MODERATED, \
      STATUS_DRAFT, STATUS_PUBLISHED
                          r'(</wp:comment_content>.*?</wp:comment>)(?s)')
 
 
+def _wordpress_to_html(markup):
+    """Convert WordPress-HTML into real HTML.
+
+    The markup is parsed with `parse_html`, implicit paragraphs are
+    injected into the resulting tree with `inject_implicit_paragraphs`
+    and the tree is serialized again with its `to_html` method.
+    """
+    return inject_implicit_paragraphs(parse_html(markup)).to_html()
+
+
 def parse_broken_wxr(fd):
     """This method reads from a file descriptor and parses a WXR file as
     created by current WordPress versions.  This method also injects a
             pub_date,
             get_author(item.findtext(DC_METADATA.creator)),
             item.findtext('description'),
-            item.findtext(CONTENT.encoded),
+            _wordpress_to_html(item.findtext(CONTENT.encoded)),
             [tags[x.text] for x in item.findall('tag')
              if x.text in tags],
             [categories[x.text] for x in item.findall('category')

File zine/utils/zeml.py

 _entity_re = re.compile(r'&([^;]+);')
 _entity_re = re.compile(r'&([^;]+);')
 _paragraph_re = re.compile(r'(\s*?\n){2,}')
+_autoparagraphed_elements = set(['div', 'blockquote'])
 
 _entities = {
     'Aacute':       u'\xc1',        'aacute':       u'\xe1',
     return intro, body
 
 
+def inject_implicit_paragraphs(tree):
+    """Inject implicit paragraphs into the tree.  This mimics the WordPress
+    automatic paragraph insertion and can be used to import markup from blogs
+    like WordPress that use implicit paragraphs.
+
+    This however must not be used for any kind of ZEML trees because it only
+    knows some basic rules for regular HTML.
+
+    The tree is modified in place and returned.
+    """
+    def joined_text_iter(node):
+        # Yields the text chunks and child elements of `node` in order.
+        # As a side effect the node's text and the tails of its children
+        # are reset to empty strings, so the text lives only in the
+        # yielded items afterwards.
+        text_buf = [node.text]
+        node.text = u''
+
+        def flush_text_buf():
+            # Returns the buffered text (and empties the buffer) or
+            # `None` if there is nothing but empty text buffered.
+            if text_buf:
+                text = u''.join(text_buf)
+                del text_buf[:]
+                if text:
+                    return text
+
+        for child in node.children:
+            text = flush_text_buf()
+            if text is not None:
+                yield text
+            yield child
+            text_buf.append(child.tail)
+            child.tail = u''
+
+        text = flush_text_buf()
+        if text is not None:
+            yield text
+
+    def make_paragraph(children):
+        # Builds a new `p` element from a list of strings and elements.
+        # Strings are attached to the tail of the previously added child
+        # or, if there is none yet, to the text of the paragraph itself.
+        element = Element('p')
+        for child in children:
+            if isinstance(child, unicode):
+                if element.children:
+                    element.children[-1].tail += child
+                else:
+                    element.text += child
+            elif child:
+                element.children.append(child)
+        return element
+
+    def transform(parent):
+        # Children are transformed first (iterating over a copy of the
+        # list), then the parent itself if it is the root or one of the
+        # autoparagraphed elements.
+        for node in parent.children[:]:
+            transform(node)
+        if not parent.is_root and \
+           parent.name not in _autoparagraphed_elements:
+            return
+        # `paragraphs` holds lists (the contents of one future paragraph)
+        # and bare elements (block elements that are kept as they are).
+        paragraphs = [[]]
+
+        for item in joined_text_iter(parent):
+            if isinstance(item, unicode):
+                # `_paragraph_re` has a capturing group, so `split`
+                # returns the text blocks interleaved with the matched
+                # separators.  For every block the following item is
+                # consumed as well; if it is non-empty a new paragraph
+                # is started.
+                blockiter = iter(_paragraph_re.split(item))
+                for block in blockiter:
+                    try:
+                        is_paragraph = blockiter.next()
+                    except StopIteration:
+                        is_paragraph = False
+                    if block:
+                        paragraphs[-1].append(block)
+                    if is_paragraph:
+                        paragraphs.append([])
+            elif item.name in Parser.block_elements:
+                # block elements end the current paragraph and are
+                # followed by a fresh, empty one.
+                paragraphs.extend((item, []))
+            else:
+                paragraphs[-1].append(item)
+
+        # rebuild the children of the parent from the collected items.
+        del parent.children[:]
+        for paragraph in paragraphs:
+            if not isinstance(paragraph, list):
+                parent.children.append(paragraph)
+            else:
+                # only create a paragraph if the list contains at least
+                # one element or one non-empty string.
+                for item in paragraph:
+                    if not isinstance(item, unicode) or item:
+                        parent.children.append(make_paragraph(paragraph))
+                        break
+
+    transform(tree)
+    return tree
+
+
 class ElementHandler(object):
     """A dynamic element handler."""
 
     is_isolated = False
     is_semi_isolated = False
     is_block_level = False
-    is_autoparagraphed = False
     broken_by = None
 
     def __init__(self, app):
     """
 
     isolated_elements = set(['script', 'style', 'noscript', 'iframe'])
-    autoparagraphed_elements = set(['div', 'blockquote'])
     semi_isolated_elements = set(['textarea'])
     void_elements = set(['br', 'img', 'area', 'hr', 'param', 'input',
                          'embed', 'col'])
         self.stack = [self.result]
 
         self.isolated_elements = self.isolated_elements.copy()
-        self.autoparagraphed_elements = self.autoparagraphed_elements.copy()
         self.semi_isolated_elements = self.semi_isolated_elements.copy()
         self.void_elements = self.void_elements.copy()
         self.block_elements = self.block_elements.copy()
         for element in element_handlers or ():
             if element.is_isolated:
                 self.isolated_elements.add(element.tag)
-            if element.is_autoparagraphed:
-                self.autoparagraphed_elements.add(element.tag)
             if element.is_semi_isolated:
                 self.semi_isolated_elements.add(element.tag)
             if element.is_void:
         self.z('display_name', text=user._display_name, parent=rv)
         self.z('real_name', text=user.real_name, parent=rv)
         self.z('description', text=user.description, parent=rv)
+        self.u('www', text=user.www, parent=rv)
         self.z('is_author', text=user.is_author and 'yes' or 'no', parent=rv)
         self.z('extra', text=dumps(user.extra).encode('base64'))
         for participant in self.participants: