# wikify/wikify.py

"""
wikify your texts! micro-framework for text wikification

goals - easy to extend and debug

operation (flat algorithm)
 for each region
   - find region in processed text
   - process region matched
   - exclude processed text from further processing

example - replace all wiki:something with HTML links
 [x] wrap text into list with single item
 [x] split text into three parts using regexp `wiki:\w+`
 [x] copy 1st part (not-matched) into the resulting list
 [x] replace matched part with link, insert (processed)
     into the resulting list
 [ ] process (the-rest) until text list doesn't change

 [ ] repeat the above for the rest of the rules, skipping
     (processed) parts

 [x] reassemble text from the list

roadmap
 [ ] optimize - measure performance of using indexes
     instead of text chunks

notes - (flat algorithm) doesn't process nested markup,
        for example *`bold preformatted text`*


0.1  - proof of concept, production ready, no API sugar
       and optimizations
0.2  - helper to build regexp based rules
"""

__author__ = "anatoly techtonik <techtonik@gmail.com>"
__license__ = "Public Domain"
__version__ = "0.2"


# --- define rules ---

# rule is a function that takes text and returns either
# None (not matched) or a list of three text items:
# [ not-matched, processed, the-rest ]

import re

# just an example of simple rule
def rule_linkstub(text):
  """ replace urls with [link] stubs """
  linkre = re.compile(r'https?://\S+')
  # [ ] test with commas and other URL escaped symbols
  match = linkre.search(text)
  if match is None:
    return None
  return (text[:match.start()], "[link]", text[match.end():])
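
# a quick sketch of the rule contract on a sample string (the input is
# just an illustration):
#   rule_linkstub('a web site http://example.com is here')
#   -> ('a web site ', '[link]', ' is here')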


# helper to substitute backreferences
def subst_backrefs(pattern, groups):
  """ helper to replace backreferences such as \0, \1
      in the given `pattern` string with elements from
      the `groups` tuple
  """
  backrefs = re.findall(r'\\\d{1,2}', pattern)
  for b in backrefs:
    pattern = pattern.replace(b, groups[int(b[1:])])
  return pattern
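
# a quick sketch of the helper above, assuming the text 'wiki:wikify'
# was matched by the pattern r'wiki:(\w+)':
#   subst_backrefs(r'<a href="\1">\0</a>', ('wiki:wikify', 'wikify'))
#   -> '<a href="wikify">wiki:wikify</a>'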


# helper to build regexp based rules
def create_regexp_rule(search, replace=r'\0'):
  """ helper that returns rule, suitable as an argument to
      wikify. parameters are regexps - what to `search` for
      and what to `replace` with. it is possible to use
      backreferences (like \1) in replacement string.
  """
  search = re.compile(search)

  def regexp_rule(text):
    # saving for debug
    _research = search
    _replace = replace

    match = _research.search(text)
    if match is None:
      return None

    # match.groups() returns only captured groups, so prepend
    # the whole match to make \0 available as well
    groups = (match.group(0),) + match.groups()
    replaced = subst_backrefs(_replace, groups)

    return (text[:match.start()], replaced, text[match.end():])

  return regexp_rule
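
# a small sketch of using the helper above - a rule that turns
# wiki:something into an HTML link, as in the module docstring example;
# the /wiki/ URL prefix is only an assumption for illustration
rule_wikilink = create_regexp_rule(
  r'wiki:(\w+)', r'<a href="/wiki/\1">\1</a>')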

# [ ] sets of common rules
# [ ] configurable replacements


# --- execute rules ---

# [ ] indented prints after every step

def wikify(text, rules):
  """
  Replaces text according to the given rules. Guarantees
  that replacements won't affect each other.

  Raises TypeError when a rule returns an invalid result.
  """
  texts = [text]  # text pieces; None marks an already processed piece
  subst = []      # replacements, in the same order as the None marks

  for rule in rules:
    subidx = 0  # index in replacements array
    for idx, part in enumerate(texts):
      if part is None:
        subidx += 1
        continue
      res = rule(part)
      if res is None:
        continue
      elif len(res) != 3:
        raise TypeError(
          "Rule '%s' returned %d element(s) instead of 3"
            % (rule.__name__, len(res)))
      else:
        # keep the replacement aligned with the None marks seen so far
        subst.insert(subidx, res[1])
        # replacing current element with three elements
        # ( not-matched, None, the-rest )
        # the None component will be picked on next
        # iteration and subidx will be increased

        # [ ] check endless cycle condition
        texts[idx:idx+1] = [res[0], None, res[2]]

  # -- reassemble
  # [ ] optimize for memory usage
  # [ ] sanity check count(None) == len(subst)
  restext = ''
  subidx = 0
  for part in texts:
    if part is None:
      # substitute
      restext += subst[subidx]
      subidx += 1
    else:
      restext += part
  return restext


if __name__ == '__main__':
  # tests for linkstub
  text = 'a web site http://example.com'
  print(text)
  print(rule_linkstub(text))
  print(wikify(text, [rule_linkstub]))

  text = ''
  print(wikify(text, [rule_linkstub]))

  text = 'somematch metext'
  print(text)
  w = create_regexp_rule('match me', ' replacement (\\0) ')
  print(wikify(text, [w]))
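
  # a minimal sketch of the docstring's wiki:word example, reusing the
  # illustrative rule_wikilink defined above; it also shows that the
  # two rules do not touch each other's replacements
  text = 'see wiki:wikify at http://example.com'
  print(text)
  print(wikify(text, [rule_wikilink, rule_linkstub]))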