Commits

Matt Bone committed 9aacb28

xml programming notebook

Comments (0)

Files changed (1)

XML Programming.ipynb

+{
+ "metadata": {
+  "name": ""
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import json\n",
+      "import xml.etree.ElementTree as ET"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 5
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "my_markup = {'articles': [\n",
+      "                          {'title': 'My Great Article',\n",
+      "                           'journal': 'Journal of Article Greatness',\n",
+      "                           'start_page': 4,\n",
+      "                           'end_page': 5},\n",
+      "                          {'title': 'My Second Great Article',\n",
+      "                           'journal': 'Journal of Article Greatness',\n",
+      "                           'start_page': 8,\n",
+      "                           'end_page': 9},\n",
+      "                          ]\n",
+      "             }"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 6
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "my_markup_json = json.dumps(my_markup, indent=4)\n",
+      "print my_markup_json"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "{\n",
+        "    \"articles\": [\n",
+        "        {\n",
+        "            \"journal\": \"Journal of Article Greatness\", \n",
+        "            \"end_page\": 5, \n",
+        "            \"start_page\": 4, \n",
+        "            \"title\": \"My Great Article\"\n",
+        "        }, \n",
+        "        {\n",
+        "            \"journal\": \"Journal of Article Greatness\", \n",
+        "            \"end_page\": 9, \n",
+        "            \"start_page\": 8, \n",
+        "            \"title\": \"My Second Great Article\"\n",
+        "        }\n",
+        "    ]\n",
+        "}\n"
+       ]
+      }
+     ],
+     "prompt_number": 7
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "How do we find all the articles that start on page 4?"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "[article for article in my_markup['articles'] if article['start_page']==4]"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 8,
+       "text": [
+        "[{'end_page': 5,\n",
+        "  'journal': 'Journal of Article Greatness',\n",
+        "  'start_page': 4,\n",
+        "  'title': 'My Great Article'}]"
+       ]
+      }
+     ],
+     "prompt_number": 8
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Now let's represent the same articles as XML."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "articles = ET.Element('articles')\n",
+      "ET.SubElement(articles, 'article', {'title': 'My Great Article',\n",
+      "                                    'journal': 'Journal of Article Greatness',\n",
+      "                                    'start_page': '4',\n",
+      "                                    'end_page': '5'})\n",
+      "ET.SubElement(articles, 'article', {'title': 'My Second Great Article',\n",
+      "                                    'journal': 'Journal of Article Greatness',\n",
+      "                                    'start_page': '8',\n",
+      "                                    'end_page': '9'})\n",
+      "print \"\\n\".join(ET.tostringlist(articles, encoding='utf8'))"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "<?xml version='1.0' encoding='utf8'?>\n",
+        "\n",
+        "<articles\n",
+        ">\n",
+        "<article\n",
+        " end_page=\"5\"\n",
+        " journal=\"Journal of Article Greatness\"\n",
+        " start_page=\"4\"\n",
+        " title=\"My Great Article\"\n",
+        " />\n",
+        "<article\n",
+        " end_page=\"9\"\n",
+        " journal=\"Journal of Article Greatness\"\n",
+        " start_page=\"8\"\n",
+        " title=\"My Second Great Article\"\n",
+        " />\n",
+        "</articles>\n"
+       ]
+      }
+     ],
+     "prompt_number": 18
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
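+      "We can find all the `article` elements that start on page 4 with an XPath expression. XML attribute values are strings, so we compare against `'4'` rather than the integer `4`."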
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "articles.findall(\"./article[@start_page='4']\")"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 19,
+       "text": [
+        "[<Element 'article' at 0x2685c90>]"
+       ]
+      }
+     ],
+     "prompt_number": 19
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}