Source

Learn Pandas / lessons / 06 - Lesson.ipynb

Full commit
{
 "metadata": {
  "name": "06 - Lesson"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Lesson 6  \n",
      "\n",
      "Lets take a look at the ***groupby*** function."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Import libraries\n",
      "from pandas import DataFrame\n",
      "import pandas as pd"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print 'Pandas version: ' + pd.__version__"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Pandas version: 0.11.0\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Our small data set\n",
      "d = {'one':[1,1,1,1,1],'two':[2,2,2,2,2],'letter':['a','a','b','b','c']}\n",
      "\n",
      "# Create dataframe\n",
      "df = DataFrame(d)\n",
      "df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>letter</th>\n",
        "      <th>one</th>\n",
        "      <th>two</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> a</td>\n",
        "      <td> 1</td>\n",
        "      <td> 2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> a</td>\n",
        "      <td> 1</td>\n",
        "      <td> 2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> b</td>\n",
        "      <td> 1</td>\n",
        "      <td> 2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> b</td>\n",
        "      <td> 1</td>\n",
        "      <td> 2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> c</td>\n",
        "      <td> 1</td>\n",
        "      <td> 2</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 3,
       "text": [
        "  letter  one  two\n",
        "0      a    1    2\n",
        "1      a    1    2\n",
        "2      b    1    2\n",
        "3      b    1    2\n",
        "4      c    1    2"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create group object\n",
      "one = df.groupby('letter')\n",
      "\n",
      "# Apply sum function\n",
      "one.sum()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>one</th>\n",
        "      <th>two</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>letter</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>a</th>\n",
        "      <td> 2</td>\n",
        "      <td> 4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>b</th>\n",
        "      <td> 2</td>\n",
        "      <td> 4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>c</th>\n",
        "      <td> 1</td>\n",
        "      <td> 2</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 4,
       "text": [
        "        one  two\n",
        "letter          \n",
        "a         2    4\n",
        "b         2    4\n",
        "c         1    2"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "letterone = df.groupby(['letter','one']).sum()\n",
      "letterone"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th>two</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>letter</th>\n",
        "      <th>one</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>a</th>\n",
        "      <th>1</th>\n",
        "      <td> 4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>b</th>\n",
        "      <th>1</th>\n",
        "      <td> 4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>c</th>\n",
        "      <th>1</th>\n",
        "      <td> 2</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 5,
       "text": [
        "            two\n",
        "letter one     \n",
        "a      1      4\n",
        "b      1      4\n",
        "c      1      2"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "letterone.index"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 6,
       "text": [
        "MultiIndex\n",
        "[a  1, b  1, c  1]"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "You may want to ***not*** have the columns you are grouping by become your index, this can be easily achieved as shown below."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "letterone = df.groupby(['letter','one'], as_index=False).sum()\n",
      "letterone"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>letter</th>\n",
        "      <th>one</th>\n",
        "      <th>two</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> a</td>\n",
        "      <td> 1</td>\n",
        "      <td> 4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> b</td>\n",
        "      <td> 1</td>\n",
        "      <td> 4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> c</td>\n",
        "      <td> 1</td>\n",
        "      <td> 2</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 7,
       "text": [
        "  letter  one  two\n",
        "0      a    1    4\n",
        "1      b    1    4\n",
        "2      c    1    2"
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "letterone.index"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 8,
       "text": [
        "Int64Index([0, 1, 2], dtype=int64)"
       ]
      }
     ],
     "prompt_number": 8
    }
   ],
   "metadata": {}
  }
 ]
}