Commits

Anonymous committed c43c02c

added dulwich library and got the script to call it for clone

Comments (0)

Files changed (86)

+GitHug, the Hg-Git Plugin
+=========================
+
+This plugin is designed to allow you to push to a Git server over the Git 
+protocol and to pull from a Git based project.  All data is stored in Hg
+native format with a mapping table.  People collaborating in Git should not 
+even be able to tell that you're using Hg to collaborate on their project. 
+Nothing should be kept in the Git format except perhaps for caching.
+
+(Dulwich library)
+
+May need to use bookmarks extension to do everything better.
+
+* Cloning from a Git Repository *
+
+hg init, hg add remote, hg gfetch, hg checkout
+
+* Fetching from a Git Repository *
+
+hg gremote add (git-url)
+
+hg gfetch origin (like a hg pull)
+ - connects to server (upload-pack)
+ - gets a list of server shas
+ - sees what it does not have (maps all unmapped shas, checks list)
+ - requests needed shas
+ - fetches packfile
+    - explodes
+    - converts parent lineages to hg changesets
+    - updates local parents
+
+* Pushing to a Git Repository *
+
+hg gpush origin
+  - maps all unmapped shas
+  - connects to server
+  - needs/haves
+  - creates packfile of git versions of everything the server needs
+  - transfers it, updates local references (bookmarks?)
+
+
 
 # just importing every damn thing because i don't know python that well
 # and I have no idea what I actually need
-from mercurial import util, repair, merge, cmdutil, commands, error, hg, url
+from mercurial import util, repair, merge, cmdutil, commands, hg, url
 from mercurial import extensions, ancestor
 from mercurial.commands import templateopts
 from mercurial.node import nullrev, nullid, short
 from mercurial.i18n import _
-import os, errno
+import os, errno, sys
 import subprocess
+import dulwich
 
 def gclone(ui, git_url, hg_repo_path=None):
     ## TODO : add git_url as the default remote path
         if hg_repo_path.endswith('.git'):
             hg_repo_path = hg_repo_path[:-4]
         hg_repo_path += '-hg'
-    subprocess.call(['hg', 'init', hg_repo_path])    
-    clone_git(git_url, hg_repo_path)
-    import_git_heads(hg_repo_path)
+    dest_repo = hg.repository(ui, hg_repo_path, create=True)
+
+    # make the git data directory
+    git_hg_path = os.path.join(hg_repo_path, '.hg', 'git')
+    os.mkdir(git_hg_path)
+    dulwich.repo.Repo.init_bare(git_hg_path)
     
-    # check it out
-    oldwd = os.getcwd()
-    os.chdir(hg_repo_path)
-    subprocess.call(['hg', 'checkout'])    
-    os.chdir(oldwd)
+    # fetch the initial git data
+    git_fetch(dest_repo, git_url)
+    
+    # checkout the tip
+    # hg.update(ui, dest_repo)
 
-def gpull(ui, repo, source='default', **opts):
-    """fetch from a git repo
-    """
-    lock = wlock = None
+def gpush(ui, repo):
+    dest_repo.ui.status(_("pushing to git url\n"))
+    
+def gpull(ui, repo):
+    dest_repo.ui.status(_("pulling from git url\n"))
+    
+
+def git_fetch(dest_repo, git_url):
+    dest_repo.ui.status(_("fetching from git url\n"))
+    git_fetch_pack(dest_repo, git_url)
+    
+def git_fetch_pack(dest_repo, git_url):
+    from dulwich.repo import Repo
+    from dulwich.client import SimpleFetchGraphWalker
+    client, path = get_transport_and_path(git_url)
+    git_dir = os.path.join(dest_repo.path, 'git')
+    r = Repo(git_dir)
+    graphwalker = SimpleFetchGraphWalker(r.heads().values(), r.get_parents)
+    f, commit = r.object_store.add_pack()
     try:
-        lock = repo.lock()
-        wlock = repo.wlock()
-        ui.write("fetching from the remote\n")
-        git_fetch(git_path())
-        import_git_heads()
-        # do the pull
-    finally:
-        del lock, wlock
+        client.fetch_pack(path, r.object_store.determine_wants_all, graphwalker, f.write, sys.stdout.write)
+        f.close()
+        commit()
+    except:
+        f.close()
+    raise
 
-def gpush(ui, repo, dest='default', **opts):
-    """push to a git repo
-    """
-    lock = wlock = None
-    try:
-        lock = repo.lock()
-        wlock = repo.wlock()
-        ui.write("pushing to the remote\n")
-        # do the push
-    finally:
-        del lock, wlock
-
-def git_path(hg_path=None):
-    if hg_path:
-      return os.path.join(hg_path, '.hg', 'git-remote')
-    else:
-      return os.path.join('.hg', 'git-remote')
-
-def clone_git(git_url, hg_path=None):
-    git_initialize(git_path(hg_path), git_url)
-    git_fetch(git_path(hg_path))
-    
-def git_initialize(git_repo_path, git_url):
-    # TODO: implement this in pure python - should be strait-forward
-    oldwd = os.getcwd()
-    os.makedirs(git_repo_path)
-    os.chdir(git_repo_path)
-    subprocess.call(['git', '--bare', 'init'])
-    subprocess.call(['git', 'remote', 'add', 'origin', git_url])
-    os.chdir(oldwd)
-    
-def git_fetch(git_repo_path, remote='origin'):
-    # TODO: implement this in pure python
-    #       - we'll have to handle ssh and git
-    oldwd = os.getcwd()
-    os.chdir(git_repo_path)
-    subprocess.call(['git', 'fetch', remote])
-    os.chdir(oldwd)
-  
-def git_push():
-    # find all the local changesets that aren't mapped
-    # create git commit object shas and map them
-    # stick those objects in a packfile and push them up (over ssh)
-    return 0
-
-def import_git_heads(hg_path=None):
-    # go through each branch
-      # add all commits we don't have locally
-      # write a SHA<->SHA mapping table
-      # update the local branches to match
-    if not hg_path:
-      hg_path = '.'
-    return subprocess.call(['hg', 'convert', git_path(hg_path), hg_path])
-  
+def get_transport_and_path(uri):
+    from dulwich.client import TCPGitClient, SSHGitClient, SubprocessGitClient
+    for handler, transport in (("git://", TCPGitClient), ("git+ssh://", SSHGitClient)):
+        if uri.startswith(handler):
+            host, path = uri[len(handler):].split("/", 1)
+            return transport(host), "/"+path
+    # if its not git or git+ssh, try a local url..
+    return SubprocessGitClient(), uri
         
 commands.norepo += " gclone"
 cmdtable = {

dulwich/__init__.py

+# __init__.py -- The git module of dulwich
+# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
+# Copyright (C) 2008 Jelmer Vernooij <jelmer@samba.org>
+# 
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; version 2
+# of the License or (at your option) any later version of 
+# the License.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+import client
+import protocol
+import repo
+import server
+
+__version__ = (0, 1, 1)

dulwich/_objects.c

+/* 
+ * Copyright (C) 2009 Jelmer Vernooij <jelmer@samba.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License or (at your option) a later version of the License.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA  02110-1301, USA.
+ */
+
+#include <Python.h>
+
+#define hexbyte(x) (isdigit(x)?(x)-'0':(x)-'a'+0xa)
+#define bytehex(x) (((x)<0xa)?('0'+(x)):('a'-0xa+(x)))
+
+static PyObject *py_hex_to_sha(PyObject *self, PyObject *py_hexsha)
+{
+	char *hexsha;
+	char sha[20];
+	int i;
+
+	if (!PyString_Check(py_hexsha)) {
+		PyErr_SetString(PyExc_TypeError, "hex sha is not a string");
+		return NULL;
+	}
+
+	if (PyString_Size(py_hexsha) != 40) {
+		PyErr_SetString(PyExc_ValueError, "hex sha is not 40 bytes long");
+		return NULL;
+	}
+
+	hexsha = PyString_AsString(py_hexsha);
+
+	for (i = 0; i < 20; i++) {
+		sha[i] = (hexbyte(hexsha[i*2]) << 4) + hexbyte(hexsha[i*2+1]);
+	}
+
+	return PyString_FromStringAndSize(sha, 20);
+}
+
+static PyObject *py_sha_to_hex(PyObject *self, PyObject *py_sha)
+{
+	char hexsha[41];
+	unsigned char *sha;
+	int i;
+
+	if (!PyString_Check(py_sha)) {
+		PyErr_SetString(PyExc_TypeError, "sha is not a string");
+		return NULL;
+	}
+
+	if (PyString_Size(py_sha) != 20) {
+		PyErr_SetString(PyExc_ValueError, "sha is not 20 bytes long");
+		return NULL;
+	}
+
+	sha = (unsigned char *)PyString_AsString(py_sha);
+	for (i = 0; i < 20; i++) {
+		hexsha[i*2] = bytehex((sha[i] & 0xF0) >> 4);
+		hexsha[i*2+1] = bytehex(sha[i] & 0x0F);
+	}
+	
+	return PyString_FromStringAndSize(hexsha, 40);
+}
+
+static PyMethodDef py_objects_methods[] = {
+	{ "hex_to_sha", (PyCFunction)py_hex_to_sha, METH_O, NULL },
+	{ "sha_to_hex", (PyCFunction)py_sha_to_hex, METH_O, NULL },
+};
+
+void init_objects(void)
+{
+	PyObject *m;
+
+	m = Py_InitModule3("_objects", py_objects_methods, NULL);
+	if (m == NULL)
+		return;
+}
+/* 
+ * Copyright (C) 2009 Jelmer Vernooij <jelmer@samba.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License or (at your option) a later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA  02110-1301, USA.
+ */
+
+#include <Python.h>
+#include <stdint.h>
+
+static int py_is_sha(PyObject *sha)
+{
+    if (!PyString_Check(sha))
+        return 0;
+
+    if (PyString_Size(sha) != 20)
+        return 0;
+
+    return 1;
+}
+
+
+static size_t get_delta_header_size(uint8_t *delta, int *index, int length)
+{
+	size_t size = 0;
+	int i = 0;
+	while ((*index) < length) {
+		uint8_t cmd = delta[*index];
+		(*index)++;
+		size |= (cmd & ~0x80) << i;
+		i += 7;
+		if (!(cmd & 0x80))
+			break;
+	}
+	return size;
+}
+
+
+static PyObject *py_apply_delta(PyObject *self, PyObject *args)
+{
+	uint8_t *src_buf, *delta;
+	int src_buf_len, delta_len;
+	size_t src_size, dest_size;
+	size_t outindex = 0;
+	int index;
+	uint8_t *out;
+	PyObject *ret;
+
+	if (!PyArg_ParseTuple(args, "s#s#", (uint8_t *)&src_buf, &src_buf_len, 
+						  (uint8_t *)&delta, &delta_len))
+		return NULL;
+
+    index = 0;
+    src_size = get_delta_header_size(delta, &index, delta_len);
+    if (src_size != src_buf_len) {
+		PyErr_Format(PyExc_ValueError, 
+			"Unexpected source buffer size: %lu vs %d", src_size, src_buf_len);
+		return NULL;
+	}
+    dest_size = get_delta_header_size(delta, &index, delta_len);
+	ret = PyString_FromStringAndSize(NULL, dest_size);
+	if (ret == NULL) {
+		PyErr_NoMemory();
+		return NULL;
+	}
+	out = (uint8_t *)PyString_AsString(ret);
+    while (index < delta_len) {
+        char cmd = delta[index];
+        index++;
+        if (cmd & 0x80) {
+            size_t cp_off = 0, cp_size = 0;
+			int i;
+            for (i = 0; i < 4; i++) {
+                if (cmd & (1 << i)) {
+                    uint8_t x = delta[index];
+                    index++;
+                    cp_off |= x << (i * 8);
+				}
+			}
+            for (i = 0; i < 3; i++) {
+                if (cmd & (1 << (4+i))) {
+                    uint8_t x = delta[index];
+                    index++;
+                    cp_size |= x << (i * 8);
+				}
+			}
+            if (cp_size == 0)
+                cp_size = 0x10000;
+            if (cp_off + cp_size < cp_size ||
+                cp_off + cp_size > src_size ||
+                cp_size > dest_size)
+                break;
+			memcpy(out+outindex, src_buf+cp_off, cp_size);
+			outindex += cp_size;
+		} else if (cmd != 0) {
+			memcpy(out+outindex, delta+index, cmd);
+			outindex += cmd;
+            index += cmd;
+		} else {
+			PyErr_SetString(PyExc_ValueError, "Invalid opcode 0");
+			return NULL;
+		}
+	}
+    
+    if (index != delta_len) {
+		PyErr_SetString(PyExc_ValueError, "delta not empty");
+		return NULL;
+	}
+
+	if (dest_size != outindex) {
+        PyErr_SetString(PyExc_ValueError, "dest size incorrect");
+		return NULL;
+	}
+
+    return ret;
+}
+
+static PyObject *py_bisect_find_sha(PyObject *self, PyObject *args)
+{
+    PyObject *unpack_name;
+    char *sha;
+    int sha_len;
+	int start, end;
+    if (!PyArg_ParseTuple(args, "iis#O", &start, &end, 
+						  &sha, &sha_len, &unpack_name))
+        return NULL;
+
+    if (sha_len != 20) {
+        PyErr_SetString(PyExc_ValueError, "Sha is not 20 bytes long");
+        return NULL;
+    }
+    if (start > end) {
+        PyErr_SetString(PyExc_AssertionError, "start > end");
+        return NULL;
+    }
+
+    while (start <= end) {
+        PyObject *file_sha;
+        int i = (start + end)/2;
+        int cmp;
+        file_sha = PyObject_CallFunction(unpack_name, "i", i);
+        if (file_sha == NULL) {
+            return NULL;
+        }
+        if (!py_is_sha(file_sha)) {
+            PyErr_SetString(PyExc_TypeError, "unpack_name returned non-sha object");
+			Py_DECREF(file_sha);
+            return NULL;
+        }
+        cmp = memcmp(PyString_AsString(file_sha), sha, 20);
+		Py_DECREF(file_sha);
+        if (cmp < 0)
+            start = i + 1;
+        else if (cmp > 0)
+            end = i - 1;
+        else {
+			return PyInt_FromLong(i);
+        }
+    }
+    Py_RETURN_NONE;
+}
+
+
+static PyMethodDef py_pack_methods[] = {
+	{ "apply_delta", (PyCFunction)py_apply_delta, METH_VARARGS, NULL },
+    { "bisect_find_sha", (PyCFunction)py_bisect_find_sha, METH_VARARGS, NULL },
+};
+
+void init_pack(void)
+{
+	PyObject *m;
+
+	m = Py_InitModule3("_pack", py_pack_methods, NULL);
+	if (m == NULL)
+		return;
+}

dulwich/client.py

+# server.py -- Implementation of the server side git protocols
+# Copyright (C) 2008 Jelmer Vernooij <jelmer@samba.org>
+# Copyright (C) 2008 John Carr
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# or (at your option) a later version of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+"""Client side support for the Git protocol."""
+
+__docformat__ = 'restructuredText'
+
+import os
+import select
+import socket
+import subprocess
+
+from protocol import (
+    Protocol,
+    TCP_GIT_PORT,
+    extract_capabilities,
+    )
+from pack import (
+    write_pack_data,
+    )
+
+
+def _fileno_can_read(fileno):
+    return len(select.select([fileno], [], [], 0)[0]) > 0
+
+
+class SimpleFetchGraphWalker(object):
+
+    def __init__(self, local_heads, get_parents):
+        self.heads = set(local_heads)
+        self.get_parents = get_parents
+        self.parents = {}
+
+    def ack(self, ref):
+        if ref in self.heads:
+            self.heads.remove(ref)
+        if ref in self.parents:
+            for p in self.parents[ref]:
+                self.ack(p)
+
+    def next(self):
+        if self.heads:
+            ret = self.heads.pop()
+            ps = self.get_parents(ret)
+            self.parents[ret] = ps
+            self.heads.update(ps)
+            return ret
+        return None
+
+
+CAPABILITIES = ["multi_ack", "side-band-64k", "ofs-delta"]
+
+
+class GitClient(object):
+    """Git smart server client.
+
+    """
+
+    def __init__(self, can_read, read, write, thin_packs=True, 
+        report_activity=None):
+        """Create a new GitClient instance.
+
+        :param can_read: Function that returns True if there is data available
+            to be read.
+        :param read: Callback for reading data, takes number of bytes to read
+        :param write: Callback for writing data
+        :param thin_packs: Whether or not thin packs should be retrieved
+        :param report_activity: Optional callback for reporting transport
+            activity.
+        """
+        self.proto = Protocol(read, write, report_activity)
+        self._can_read = can_read
+        self._capabilities = list(CAPABILITIES)
+        if thin_packs:
+            self._capabilities.append("thin-pack")
+
+    def capabilities(self):
+        return " ".join(self._capabilities)
+
+    def read_refs(self):
+        server_capabilities = None
+        refs = {}
+        # Receive refs from server
+        for pkt in self.proto.read_pkt_seq():
+            (sha, ref) = pkt.rstrip("\n").split(" ", 1)
+            if server_capabilities is None:
+                (ref, server_capabilities) = extract_capabilities(ref)
+            refs[ref] = sha
+        return refs, server_capabilities
+
+    def send_pack(self, path, generate_pack_contents):
+        """Upload a pack to a remote repository.
+
+        :param path: Repository path
+        :param generate_pack_contents: Function that can return the shas of the 
+            objects to upload.
+        """
+        refs, server_capabilities = self.read_refs()
+        changed_refs = [] # FIXME
+        if not changed_refs:
+            self.proto.write_pkt_line(None)
+            return
+        self.proto.write_pkt_line("%s %s %s\0%s" % (changed_refs[0][0], changed_refs[0][1], changed_refs[0][2], self.capabilities()))
+        want = []
+        have = []
+        for changed_ref in changed_refs[:]:
+            self.proto.write_pkt_line("%s %s %s" % changed_refs)
+            want.append(changed_refs[1])
+            if changed_refs[0] != "0"*40:
+                have.append(changed_refs[0])
+        self.proto.write_pkt_line(None)
+        shas = generate_pack_contents(want, have, None)
+        write_pack_data(self.write, shas, len(shas))
+
+    def fetch_pack(self, path, determine_wants, graph_walker, pack_data, progress):
+        """Retrieve a pack from a git smart server.
+
+        :param determine_wants: Callback that returns list of commits to fetch
+        :param graph_walker: Object with next() and ack().
+        :param pack_data: Callback called for each bit of data in the pack
+        :param progress: Callback for progress reports (strings)
+        """
+        (refs, server_capabilities) = self.read_refs()
+        wants = determine_wants(refs)
+        if not wants:
+            self.proto.write_pkt_line(None)
+            return
+        self.proto.write_pkt_line("want %s %s\n" % (wants[0], self.capabilities()))
+        for want in wants[1:]:
+            self.proto.write_pkt_line("want %s\n" % want)
+        self.proto.write_pkt_line(None)
+        have = graph_walker.next()
+        while have:
+            self.proto.write_pkt_line("have %s\n" % have)
+            if self._can_read():
+                pkt = self.proto.read_pkt_line()
+                parts = pkt.rstrip("\n").split(" ")
+                if parts[0] == "ACK":
+                    graph_walker.ack(parts[1])
+                    assert parts[2] == "continue"
+            have = graph_walker.next()
+        self.proto.write_pkt_line("done\n")
+        pkt = self.proto.read_pkt_line()
+        while pkt:
+            parts = pkt.rstrip("\n").split(" ")
+            if parts[0] == "ACK":
+                graph_walker.ack(pkt.split(" ")[1])
+            if len(parts) < 3 or parts[2] != "continue":
+                break
+            pkt = self.proto.read_pkt_line()
+        for pkt in self.proto.read_pkt_seq():
+            channel = ord(pkt[0])
+            pkt = pkt[1:]
+            if channel == 1:
+                pack_data(pkt)
+            elif channel == 2:
+                progress(pkt)
+            else:
+                raise AssertionError("Invalid sideband channel %d" % channel)
+
+
+class TCPGitClient(GitClient):
+    """A Git Client that works over TCP directly (i.e. git://)."""
+
+    def __init__(self, host, port=None, *args, **kwargs):
+        self._socket = socket.socket(type=socket.SOCK_STREAM)
+        if port is None:
+            port = TCP_GIT_PORT
+        self._socket.connect((host, port))
+        self.rfile = self._socket.makefile('rb', -1)
+        self.wfile = self._socket.makefile('wb', 0)
+        self.host = host
+        super(TCPGitClient, self).__init__(lambda: _fileno_can_read(self._socket.fileno()), self.rfile.read, self.wfile.write, *args, **kwargs)
+
+    def send_pack(self, path):
+        """Send a pack to a remote host.
+
+        :param path: Path of the repository on the remote host
+        """
+        self.proto.send_cmd("git-receive-pack", path, "host=%s" % self.host)
+        super(TCPGitClient, self).send_pack(path)
+
+    def fetch_pack(self, path, determine_wants, graph_walker, pack_data, progress):
+        """Fetch a pack from the remote host.
+        
+        :param path: Path of the repository on the remote host
+        :param determine_wants: Callback that receives available refs dict and 
+            should return list of sha's to fetch.
+        :param graph_walker: GraphWalker instance used to find missing shas
+        :param pack_data: Callback for writing pack data
+        :param progress: Callback for writing progress
+        """
+        self.proto.send_cmd("git-upload-pack", path, "host=%s" % self.host)
+        super(TCPGitClient, self).fetch_pack(path, determine_wants, graph_walker, pack_data, progress)
+
+
+class SubprocessGitClient(GitClient):
+
+    def __init__(self, *args, **kwargs):
+        self.proc = None
+        self._args = args
+        self._kwargs = kwargs
+
+    def _connect(self, service, *args):
+        argv = [service] + list(args)
+        self.proc = subprocess.Popen(argv, bufsize=0,
+                                stdin=subprocess.PIPE,
+                                stdout=subprocess.PIPE)
+        def read_fn(size):
+            return self.proc.stdout.read(size)
+        def write_fn(data):
+            self.proc.stdin.write(data)
+            self.proc.stdin.flush()
+        return GitClient(lambda: _fileno_can_read(self.proc.stdout.fileno()), read_fn, write_fn, *args, **kwargs)
+
+    def send_pack(self, path):
+        client = self._connect("git-receive-pack", path)
+        client.send_pack(path)
+
+    def fetch_pack(self, path, determine_wants, graph_walker, pack_data, 
+        progress):
+        client = self._connect("git-upload-pack", path)
+        client.fetch_pack(path, determine_wants, graph_walker, pack_data, progress)
+
+
+class SSHSubprocess(object):
+    """A socket-like object that talks to an ssh subprocess via pipes."""
+
+    def __init__(self, proc):
+        self.proc = proc
+
+    def send(self, data):
+        return os.write(self.proc.stdin.fileno(), data)
+
+    def recv(self, count):
+        return self.proc.stdout.read(count)
+
+    def close(self):
+        self.proc.stdin.close()
+        self.proc.stdout.close()
+        self.proc.wait()
+
+
+class SSHVendor(object):
+
+    def connect_ssh(self, host, command, username=None, port=None):
+        #FIXME: This has no way to deal with passwords..
+        args = ['ssh', '-x']
+        if port is not None:
+            args.extend(['-p', str(port)])
+        if username is not None:
+            host = "%s@%s" % (username, host)
+        args.append(host)
+        proc = subprocess.Popen(args + command,
+                                stdin=subprocess.PIPE,
+                                stdout=subprocess.PIPE)
+        return SSHSubprocess(proc)
+
+# Can be overridden by users
+get_ssh_vendor = SSHVendor
+
+
+class SSHGitClient(GitClient):
+
+    def __init__(self, host, port=None, *args, **kwargs):
+        self.host = host
+        self.port = port
+        self._args = args
+        self._kwargs = kwargs
+
+    def send_pack(self, path):
+        remote = get_ssh_vendor().connect_ssh(self.host, ["git-receive-pack %s" % path], port=self.port)
+        client = GitClient(lambda: _fileno_can_read(remote.proc.stdout.fileno()), remote.recv, remote.send, *self._args, **self._kwargs)
+        client.send_pack(path)
+
+    def fetch_pack(self, path, determine_wants, graph_walker, pack_data, progress):
+        remote = get_ssh_vendor().connect_ssh(self.host, ["git-upload-pack %s" % path], port=self.port)
+        client = GitClient(lambda: _fileno_can_read(remote.proc.stdout.fileno()), remote.recv, remote.send, *self._args, **self._kwargs)
+        client.fetch_pack(path, determine_wants, graph_walker, pack_data, progress)
+

dulwich/errors.py

+# errors.py -- errors for dulwich
+# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
+# 
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; version 2
+# or (at your option) any later version of the License.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+"""Dulwich-related exception classes and utility functions."""
+
+class ChecksumMismatch(Exception):
+    """A checksum didn't match the expected contents."""
+
+    def __init__(self, expected, got, extra=None):
+        self.expected = expected
+        self.got = got
+        self.extra = extra
+        if self.extra is None:
+            Exception.__init__(self, 
+                "Checksum mismatch: Expected %s, got %s" % (expected, got))
+        else:
+            Exception.__init__(self,
+                "Checksum mismatch: Expected %s, got %s; %s" % 
+                (expected, got, extra))
+
+
+class WrongObjectException(Exception):
+    """Baseclass for all the _ is not a _ exceptions on objects.
+  
+    Do not instantiate directly.
+  
+    Subclasses should define a _type attribute that indicates what
+    was expected if they were raised.
+    """
+  
+    def __init__(self, sha, *args, **kwargs):
+        string = "%s is not a %s" % (sha, self._type)
+        Exception.__init__(self, string)
+
+
+class NotCommitError(WrongObjectException):
+    """Indicates that the sha requested does not point to a commit."""
+  
+    _type = 'commit'
+
+
+class NotTreeError(WrongObjectException):
+    """Indicates that the sha requested does not point to a tree."""
+  
+    _type = 'tree'
+
+
+class NotBlobError(WrongObjectException):
+    """Indicates that the sha requested does not point to a blob."""
+  
+    _type = 'blob'
+
+
+class MissingCommitError(Exception):
+    """Indicates that a commit was not found in the repository"""
+  
+    def __init__(self, sha, *args, **kwargs):
+        Exception.__init__(self, "%s is not in the revision store" % sha)
+
+
+class ObjectMissing(Exception):
+    """Indicates that a requested object is missing."""
+  
+    def __init__(self, sha, *args, **kwargs):
+        Exception.__init__(self, "%s is not in the pack" % sha)
+
+
+class ApplyDeltaError(Exception):
+    """Indicates that applying a delta failed."""
+    
+    def __init__(self, *args, **kwargs):
+        Exception.__init__(self, *args, **kwargs)
+
+
+class NotGitRepository(Exception):
+    """Indicates that no Git repository was found."""
+
+    def __init__(self, *args, **kwargs):
+        Exception.__init__(self, *args, **kwargs)
+
+
+class GitProtocolError(Exception):
+    """Git protocol exception."""
+    
+    def __init__(self, *args, **kwargs):
+        Exception.__init__(self, *args, **kwargs)
+
+
+class HangupException(GitProtocolError):
+    """Hangup exception."""
+# index.py -- File parser/write for the git index file
+# Copyright (C) 2008 Jelmer Vernooij <jelmer@samba.org>
+ 
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; version 2
+# of the License or (at your option) any later version of the license.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+"""Parser for the git index file format."""
+
+import struct
+
+def read_cache_time(f):
+    return struct.unpack(">LL", f.read(8))
+
+
+def write_cache_time(f, t):
+    if isinstance(t, int):
+        t = (t, 0)
+    f.write(struct.pack(">LL", *t))
+
+
+def read_cache_entry(f):
+    """Read an entry from a cache file.
+
+    :param f: File-like object to read from
+    :return: tuple with: inode, device, mode, uid, gid, size, sha, flags
+    """
+    beginoffset = f.tell()
+    ctime = read_cache_time(f)
+    mtime = read_cache_time(f)
+    (ino, dev, mode, uid, gid, size, sha, flags, ) = \
+        struct.unpack(">LLLLLL20sH", f.read(20 + 4 * 6 + 2))
+    name = ""
+    char = f.read(1)
+    while char != "\0":
+        name += char
+        char = f.read(1)
+    # Padding:
+    real_size = ((f.tell() - beginoffset + 7) & ~7)
+    f.seek(beginoffset + real_size)
+    return (name, ctime, mtime, ino, dev, mode, uid, gid, size, sha, flags)
+
+
+def write_cache_entry(f, entry):
+    """Write an index entry to a file.
+
+    :param f: File object
+    :param entry: Entry to write, tuple with: 
+        (name, ctime, mtime, ino, dev, mode, uid, gid, size, sha, flags)
+    """
+    beginoffset = f.tell()
+    (name, ctime, mtime, ino, dev, mode, uid, gid, size, sha, flags) = entry
+    write_cache_time(f, ctime)
+    write_cache_time(f, mtime)
+    f.write(struct.pack(">LLLLLL20sH", ino, dev, mode, uid, gid, size, sha, flags))
+    f.write(name)
+    f.write(chr(0))
+    real_size = ((f.tell() - beginoffset + 7) & ~7)
+    f.write("\0" * ((beginoffset + real_size) - f.tell()))
+
+
+def read_index(f):
+    """Read an index file, yielding the individual entries."""
+    header = f.read(4)
+    if header != "DIRC":
+        raise AssertionError("Invalid index file header: %r" % header)
+    (version, num_entries) = struct.unpack(">LL", f.read(4 * 2))
+    assert version in (1, 2)
+    for i in range(num_entries):
+        yield read_cache_entry(f)
+
+
+def read_index_dict(f):
+    """Read an index file and return it as a dictionary.
+    
+    :param f: File object to read from
+    :return: dict mapping entry name to the remaining entry fields
+    """
+    ret = {}
+    for x in read_index(f):
+        # x[0] is the path name; the rest of the tuple becomes the value.
+        ret[x[0]] = tuple(x[1:])
+    return ret
+
+
+def write_index(f, entries):
+    """Write an index file.
+    
+    :param f: File-like object to write to
+    :param entries: Iterable over the entries to write; must support len()
+    """
+    f.write("DIRC")
+    # Always writes index format version 2.
+    f.write(struct.pack(">LL", 2, len(entries)))
+    for x in entries:
+        write_cache_entry(f, x)
+
+
+def write_index_dict(f, entries):
+    """Write an index file based on the contents of a dictionary.
+
+    :param f: File-like object to write to
+    :param entries: dict mapping name to the remaining entry fields
+    """
+    # Entries are written sorted by name, as the index format requires.
+    entries_list = []
+    for name in sorted(entries):
+        entries_list.append((name,) + tuple(entries[name]))
+    write_index(f, entries_list)
+
+
+class Index(object):
+    """An in-memory view of a git index file, keyed by path name."""
+
+    def __init__(self, filename):
+        """Open an index file and read its contents.
+
+        :param filename: Path of the index file on disk
+        """
+        self._filename = filename
+        self.clear()
+        self.read()
+
+    def write(self):
+        """Write the current contents back to the index file."""
+        # NOTE(review): the index is a binary format; 'wb' would be safer
+        # than 'w' on platforms that translate line endings -- confirm.
+        f = open(self._filename, 'w')
+        try:
+            write_index_dict(f, self._byname)
+        finally:
+            f.close()
+
+    def read(self):
+        """Read the contents of the index file into this object."""
+        f = open(self._filename, 'r')
+        try:
+            for x in read_index(f):
+
+                self[x[0]] = tuple(x[1:])
+        finally:
+            f.close()
+
+    def __len__(self):
+        """Return the number of entries in the index."""
+        return len(self._byname)
+
+    def __getitem__(self, name):
+        """Look up an entry tuple by path name."""
+        return self._byname[name]
+
+    def get_sha1(self, path):
+        """Return the (raw binary) sha for the entry at path."""
+        # sha is the second-to-last field of the entry tuple.
+        return self[path][-2]
+
+    def clear(self):
+        """Remove all contents from this index."""
+        self._byname = {}
+
+    def __setitem__(self, name, x):
+        """Store an entry tuple (the 10 fields after name) under name."""
+        assert isinstance(name, str)
+        assert len(x) == 10
+        # Remove the old entry if any
+        self._byname[name] = x
+
+    def iteritems(self):
+        """Iterate over (name, entry) pairs."""
+        return self._byname.iteritems()
+
+    def update(self, entries):
+        """Merge entries from a dictionary into this index."""
+        for name, value in entries.iteritems():
+            self[name] = value

dulwich/lru_cache.py

+# Copyright (C) 2006, 2008 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+"""A simple least-recently-used (LRU) cache."""
+
+from collections import deque
+
+
+class LRUCache(object):
+    """A class which manages a cache of entries, removing unused ones."""
+
+    def __init__(self, max_cache=100, after_cleanup_count=None):
+        """Create a new LRUCache.
+
+        :param max_cache: Maximum number of entries before cleanup triggers
+        :param after_cleanup_count: Shrink to this many entries on cleanup
+            (defaults to 80% of max_cache)
+        """
+        self._cache = {}
+        self._cleanup = {}
+        self._queue = deque() # Track when things are accessed
+        self._refcount = {} # number of entries in self._queue for each key
+        self._update_max_cache(max_cache, after_cleanup_count)
+
+    def __contains__(self, key):
+        return key in self._cache
+
+    def __getitem__(self, key):
+        # A lookup counts as an access for LRU purposes.
+        val = self._cache[key]
+        self._record_access(key)
+        return val
+
+    def __len__(self):
+        return len(self._cache)
+
+    def add(self, key, value, cleanup=None):
+        """Add a new value to the cache.
+
+        Also, if the entry is ever removed from the queue, call cleanup.
+        Passing it the key and value being removed.
+
+        :param key: The key to store it under
+        :param value: The object to store
+        :param cleanup: None or a function taking (key, value) to indicate
+                        'value' should be cleaned up.
+        """
+        if key in self._cache:
+            self._remove(key)
+        self._cache[key] = value
+        if cleanup is not None:
+            self._cleanup[key] = cleanup
+        self._record_access(key)
+
+        if len(self._cache) > self._max_cache:
+            # Trigger the cleanup
+            self.cleanup()
+
+    def get(self, key, default=None):
+        """Return the cached value for key, or default if absent."""
+        if key in self._cache:
+            return self[key]
+        return default
+
+    def keys(self):
+        """Get the list of keys currently cached.
+
+        Note that values returned here may not be available by the time you
+        request them later. This is simply meant as a peek into the current
+        state.
+
+        :return: An unordered list of keys that are currently cached.
+        """
+        return self._cache.keys()
+
+    def cleanup(self):
+        """Clear the cache until it shrinks to the requested size.
+
+        This does not completely wipe the cache, just makes sure it is under
+        the after_cleanup_count.
+        """
+        # Make sure the cache is shrunk to the correct size
+        while len(self._cache) > self._after_cleanup_count:
+            self._remove_lru()
+        # No need to compact the queue at this point, because the code that
+        # calls this would have already triggered it based on queue length
+
+    def __setitem__(self, key, value):
+        """Add a value to the cache, there will be no cleanup function."""
+        self.add(key, value, cleanup=None)
+
+    def _record_access(self, key):
+        """Record that key was accessed."""
+        self._queue.append(key)
+        # Can't use setdefault because you can't += 1 the result
+        self._refcount[key] = self._refcount.get(key, 0) + 1
+
+        # If our access queue is too large, clean it up too
+        if len(self._queue) > self._compact_queue_length:
+            self._compact_queue()
+
+    def _compact_queue(self):
+        """Compact the queue, leaving things in sorted last appended order."""
+        # Keep only the final occurrence of each key; earlier duplicates
+        # just decrement the refcount.
+        new_queue = deque()
+        for item in self._queue:
+            if self._refcount[item] == 1:
+                new_queue.append(item)
+            else:
+                self._refcount[item] -= 1
+        self._queue = new_queue
+        # All entries should be of the same size. There should be one entry in
+        # queue for each entry in cache, and all refcounts should == 1
+        if not (len(self._queue) == len(self._cache) ==
+                len(self._refcount) == sum(self._refcount.itervalues())):
+            raise AssertionError()
+
+    def _remove(self, key):
+        """Remove an entry, making sure to maintain the invariants."""
+        cleanup = self._cleanup.pop(key, None)
+        val = self._cache.pop(key)
+        if cleanup is not None:
+            cleanup(key, val)
+        return val
+
+    def _remove_lru(self):
+        """Remove one entry from the lru, and handle consequences.
+
+        If there are no more references to the lru, then this entry should be
+        removed from the cache.
+        """
+        key = self._queue.popleft()
+        self._refcount[key] -= 1
+        if not self._refcount[key]:
+            del self._refcount[key]
+            self._remove(key)
+
+    def clear(self):
+        """Clear out all of the cache."""
+        # Clean up in LRU order
+        while self._cache:
+            self._remove_lru()
+
+    def resize(self, max_cache, after_cleanup_count=None):
+        """Change the number of entries that will be cached."""
+        self._update_max_cache(max_cache,
+                               after_cleanup_count=after_cleanup_count)
+
+    def _update_max_cache(self, max_cache, after_cleanup_count=None):
+        """Set the size limits and immediately enforce them."""
+        self._max_cache = max_cache
+        if after_cleanup_count is None:
+            # Integer division (Python 2): default is 80% of max_cache.
+            self._after_cleanup_count = self._max_cache * 8 / 10
+        else:
+            self._after_cleanup_count = min(after_cleanup_count, self._max_cache)
+
+        self._compact_queue_length = 4*self._max_cache
+        if len(self._queue) > self._compact_queue_length:
+            self._compact_queue()
+        self.cleanup()
+
+
+class LRUSizeCache(LRUCache):
+    """An LRUCache that removes things based on the size of the values.
+
+    This differs in that it doesn't care how many actual items there are,
+    it just restricts the cache to be cleaned up after so much data is stored.
+
+    The values that are added must support len(value).
+    """
+
+    def __init__(self, max_size=1024*1024, after_cleanup_size=None,
+                 compute_size=None):
+        """Create a new LRUSizeCache.
+
+        :param max_size: The max number of bytes to store before we start
+            clearing out entries.
+        :param after_cleanup_size: After cleaning up, shrink everything to this
+            size.
+        :param compute_size: A function to compute the size of the values. We
+            use a function here, so that you can pass 'len' if you are just
+            using simple strings, or a more complex function if you are using
+            something like a list of strings, or even a custom object.
+            The function should take the form "compute_size(value) => integer".
+            If not supplied, it defaults to 'len()'
+        """
+        self._value_size = 0
+        self._compute_size = compute_size
+        if compute_size is None:
+            self._compute_size = len
+        # This approximates that texts are > 0.5k in size. It only really
+        # effects when we clean up the queue, so we don't want it to be too
+        # large.
+        self._update_max_size(max_size, after_cleanup_size=after_cleanup_size)
+        LRUCache.__init__(self, max_cache=max(int(max_size/512), 1))
+
+    def add(self, key, value, cleanup=None):
+        """Add a new value to the cache.
+
+        Also, if the entry is ever removed from the queue, call cleanup.
+        Passing it the key and value being removed.
+
+        :param key: The key to store it under
+        :param value: The object to store
+        :param cleanup: None or a function taking (key, value) to indicate
+                        'value' should be cleaned up.
+        """
+        if key in self._cache:
+            self._remove(key)
+        value_len = self._compute_size(value)
+        if value_len >= self._after_cleanup_size:
+            # Values at or above the cleanup threshold are silently not
+            # cached -- they would be evicted immediately anyway.
+            return
+        self._value_size += value_len
+        self._cache[key] = value
+        if cleanup is not None:
+            self._cleanup[key] = cleanup
+        self._record_access(key)
+
+        if self._value_size > self._max_size:
+            # Time to cleanup
+            self.cleanup()
+
+    def cleanup(self):
+        """Clear the cache until it shrinks to the requested size.
+
+        This does not completely wipe the cache, just makes sure it is under
+        the after_cleanup_size.
+        """
+        # Make sure the cache is shrunk to the correct size
+        while self._value_size > self._after_cleanup_size:
+            self._remove_lru()
+
+    def _remove(self, key):
+        """Remove an entry, making sure to maintain the invariants."""
+        val = LRUCache._remove(self, key)
+        # Track total cached size as entries are removed.
+        self._value_size -= self._compute_size(val)
+
+    def resize(self, max_size, after_cleanup_size=None):
+        """Change the number of bytes that will be cached."""
+        self._update_max_size(max_size, after_cleanup_size=after_cleanup_size)
+        max_cache = max(int(max_size/512), 1)
+        self._update_max_cache(max_cache)
+
+    def _update_max_size(self, max_size, after_cleanup_size=None):
+        """Set the byte-size limits (default cleanup size: 80% of max)."""
+        self._max_size = max_size
+        if after_cleanup_size is None:
+            self._after_cleanup_size = self._max_size * 8 / 10
+        else:
+            self._after_cleanup_size = min(after_cleanup_size, self._max_size)
+# misc.py -- For dealing with python2.4 oddness
+# Copyright (C) 2008 Canonical Ltd.
+# 
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; version 2
+# of the License or (at your option) a later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+"""Misc utilities to work with python2.4.
+
+These utilities can all be deleted when dulwich decides it wants to stop
+support for python 2.4.
+"""
+try:
+    import hashlib
+except ImportError:
+    import sha
+import struct
+
+
+class defaultdict(dict):
+    """A python 2.4 equivalent of collections.defaultdict."""
+
+    def __init__(self, default_factory=None, *a, **kw):
+        """Create a defaultdict.
+
+        :param default_factory: Callable producing values for missing keys,
+            or None to behave like a plain dict.
+        """
+        if (default_factory is not None and
+            not hasattr(default_factory, '__call__')):
+            raise TypeError('first argument must be callable')
+        dict.__init__(self, *a, **kw)
+        self.default_factory = default_factory
+
+    def __getitem__(self, key):
+        try:
+            return dict.__getitem__(self, key)
+        except KeyError:
+            # Missing key: delegate to __missing__, mirroring dict semantics.
+            return self.__missing__(key)
+
+    def __missing__(self, key):
+        """Create, store and return the default value for a missing key."""
+        if self.default_factory is None:
+            raise KeyError(key)
+        self[key] = value = self.default_factory()
+        return value
+
+    def __reduce__(self):
+        # Pickle support: reconstruct with the factory and current items.
+        if self.default_factory is None:
+            args = tuple()
+        else:
+            args = self.default_factory,
+        return type(self), args, None, None, self.items()
+
+    def copy(self):
+        return self.__copy__()
+
+    def __copy__(self):
+        # Shallow copy preserving the default factory.
+        return type(self)(self.default_factory, self)
+
+    def __deepcopy__(self, memo):
+        import copy
+        return type(self)(self.default_factory,
+                          copy.deepcopy(self.items()))
+    def __repr__(self):
+        return 'defaultdict(%s, %s)' % (self.default_factory,
+                                        dict.__repr__(self))
+
+
+def make_sha(source=''):
+    """A python2.4 workaround for the sha/hashlib module fiasco.
+
+    :param source: Initial bytes to feed to the hash object
+    :return: A sha1 hash object
+    """
+    try:
+        return hashlib.sha1(source)
+    except NameError:
+        # hashlib failed to import at module load; fall back to the
+        # deprecated sha module.
+        sha1 = sha.sha(source)
+        return sha1
+
+
+def unpack_from(fmt, buf, offset=0):
+    """A python2.4 workaround for struct missing unpack_from.
+
+    :param fmt: struct format string
+    :param buf: Buffer to unpack from
+    :param offset: Byte offset into the buffer
+    """
+    try:
+        return struct.unpack_from(fmt, buf, offset)
+    except AttributeError:
+        # struct.unpack_from is absent (python < 2.5): slice and unpack.
+        b = buf[offset:offset+struct.calcsize(fmt)]
+        return struct.unpack(fmt, b)
+

dulwich/object_store.py

+# object_store.py -- Object store for git objects 
+# Copyright (C) 2008 Jelmer Vernooij <jelmer@samba.org>
+# 
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# or (at your option) a later version of the License.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+import os
+import tempfile
+import urllib2
+
+from errors import (
+    NotTreeError,
+    )
+from objects import (
+    ShaFile,
+    Tree,
+    hex_to_sha,
+    sha_to_hex,
+    )
+from pack import (
+    Pack,
+    PackData, 
+    iter_sha1, 
+    load_packs, 
+    load_pack_index,
+    write_pack,
+    write_pack_data,
+    write_pack_index_v2,
+    )
+
+PACKDIR = 'pack'
+
+class ObjectStore(object):
+    """Object store."""
+
+    def __init__(self, path):
+        """Open an object store.
+
+        :param path: Path of the object store.
+        """
+        self.path = path
+        self._pack_cache = None  # lazily populated by the packs property
+        self.pack_dir = os.path.join(self.path, PACKDIR)
+
+    def determine_wants_all(self, refs):
+        # Return shas we do not have yet, skipping peeled-tag refs (^{}).
+        # NOTE(review): this line is indented with a tab in the original.
+	    return [sha for (ref, sha) in refs.iteritems() if not sha in self and not ref.endswith("^{}")]
+
+    def iter_shas(self, shas):
+        """Iterate over the objects for the specified shas.
+
+        :param shas: Iterable object with SHAs
+        """
+        return ObjectStoreIterator(self, shas)
+
+    def __contains__(self, sha):
+        """Check packs first, then loose objects."""
+        for pack in self.packs:
+            if sha in pack:
+                return True
+        ret = self._get_shafile(sha)
+        if ret is not None:
+            return True
+        return False
+
+    @property
+    def packs(self):
+        """List with pack objects."""
+        if self._pack_cache is None:
+            self._pack_cache = list(load_packs(self.pack_dir))
+        return self._pack_cache
+
+    def _add_known_pack(self, path):
+        """Add a newly appeared pack to the cache by path.
+
+        """
+        # Only update the cache if it has already been populated.
+        if self._pack_cache is not None:
+            self._pack_cache.append(Pack(path))
+
+    def _get_shafile_path(self, sha):
+        """Return the loose-object path for a hex sha (xx/yyyy... layout)."""
+        dir = sha[:2]
+        file = sha[2:]
+        # Check from object dir
+        return os.path.join(self.path, dir, file)
+
+    def _get_shafile(self, sha):
+        """Return the loose object for a hex sha, or None if absent."""
+        path = self._get_shafile_path(sha)
+        if os.path.exists(path):
+          return ShaFile.from_file(path)
+        return None
+
+    def _add_shafile(self, sha, o):
+        """Write object o as a loose object under its hex sha."""
+        dir = os.path.join(self.path, sha[:2])
+        if not os.path.isdir(dir):
+            os.mkdir(dir)
+        path = os.path.join(dir, sha[2:])
+        f = open(path, 'w+')
+        try:
+            f.write(o.as_legacy_object())
+        finally:
+            f.close()
+
+    def get_raw(self, name):
+        """Obtain the raw text for an object.
+        
+        :param name: sha for the object (hex string of 40 chars, or 20 raw
+            bytes).
+        :return: tuple with object type and object contents.
+        :raise KeyError: if the object is not found in packs or loose.
+        """
+        if len(name) == 40:
+            sha = hex_to_sha(name)
+            hexsha = name
+        elif len(name) == 20:
+            sha = name
+            hexsha = None
+        else:
+            raise AssertionError
+        for pack in self.packs:
+            try:
+                return pack.get_raw(sha)
+            except KeyError:
+                pass
+        if hexsha is None: 
+            hexsha = sha_to_hex(name)
+        ret = self._get_shafile(hexsha)
+        if ret is not None:
+            return ret.as_raw_string()
+        raise KeyError(hexsha)
+
+    def __getitem__(self, sha):
+        """Return a parsed ShaFile object for the given sha."""
+        type, uncomp = self.get_raw(sha)
+        return ShaFile.from_raw_string(type, uncomp)
+
+    def move_in_thin_pack(self, path):
+        """Move a specific file containing a pack into the pack directory.
+
+        :note: The file should be on the same file system as the 
+            packs directory.
+
+        :param path: Path to the pack file.
+        """
+        data = PackData(path)
+
+        # Write index for the thin pack (do we really need this?)
+        # NOTE(review): urllib2.randombytes is an undocumented internal
+        # helper -- confirm it exists on the targeted Python versions.
+        temppath = os.path.join(self.pack_dir, 
+            sha_to_hex(urllib2.randombytes(20))+".tempidx")
+        data.create_index_v2(temppath, self.get_raw)
+        p = Pack.from_objects(data, load_pack_index(temppath))
+
+        # Write a full pack version
+        temppath = os.path.join(self.pack_dir, 
+            sha_to_hex(urllib2.randombytes(20))+".temppack")
+        write_pack(temppath, ((o, None) for o in p.iterobjects(self.get_raw)), 
+                len(p))
+        pack_sha = load_pack_index(temppath+".idx").objects_sha1()
+        newbasename = os.path.join(self.pack_dir, "pack-%s" % pack_sha)
+        os.rename(temppath+".pack", newbasename+".pack")
+        os.rename(temppath+".idx", newbasename+".idx")
+        self._add_known_pack(newbasename)
+
+    def move_in_pack(self, path):
+        """Move a specific file containing a pack into the pack directory.
+
+        :note: The file should be on the same file system as the 
+            packs directory.
+
+        :param path: Path to the pack file.
+        """
+        p = PackData(path)
+        entries = p.sorted_entries()
+        # Name the pack after the sha1 of its sorted object shas.
+        basename = os.path.join(self.pack_dir, 
+            "pack-%s" % iter_sha1(entry[0] for entry in entries))
+        write_pack_index_v2(basename+".idx", entries, p.get_stored_checksum())
+        os.rename(path, basename + ".pack")
+        self._add_known_pack(basename)
+
+    def add_thin_pack(self):
+        """Add a new thin pack to this object store.
+
+        Thin packs are packs that contain deltas with parents that exist 
+        in a different pack.
+
+        :return: Fileobject to write to and a commit function to call when
+            the pack is finished.
+        """
+        fd, path = tempfile.mkstemp(dir=self.pack_dir, suffix=".pack")
+        f = os.fdopen(fd, 'w')
+        def commit():
+            os.fsync(fd)
+            f.close()
+            # Empty packs are discarded rather than installed.
+            if os.path.getsize(path) > 0:
+                self.move_in_thin_pack(path)
+        return f, commit
+
+    def add_pack(self):
+        """Add a new pack to this object store. 
+
+        :return: Fileobject to write to and a commit function to 
+            call when the pack is finished.
+        """
+        fd, path = tempfile.mkstemp(dir=self.pack_dir, suffix=".pack")
+        f = os.fdopen(fd, 'w')
+        def commit():
+            os.fsync(fd)
+            f.close()
+            # Empty packs are discarded rather than installed.
+            if os.path.getsize(path) > 0:
+                self.move_in_pack(path)
+        return f, commit
+
+    def add_object(self, obj):
+        """Add a single object as a loose object file."""
+        self._add_shafile(obj.id, obj)
+
+    def add_objects(self, objects):
+        """Add a set of objects to this object store.
+
+        :param objects: Iterable over a list of objects; must support len().
+        """
+        if len(objects) == 0:
+            return
+        f, commit = self.add_pack()
+        write_pack_data(f, objects, len(objects))
+        commit()
+
+
+class ObjectImporter(object):
+    """Interface for importing objects."""
+
+    def __init__(self, count):
+        """Create a new ObjectImporter.
+
+        :param count: Number of objects that's going to be imported.
+        """
+        self.count = count
+
+    def add_object(self, object):
+        """Add an object."""
+        raise NotImplementedError(self.add_object)
+
+    def finish(self, object):
+        """Finish the import and write objects to disk."""
+        raise NotImplementedError(self.finish)
+
+
+class ObjectIterator(object):
+    """Interface for iterating over objects."""
+
+    def iterobjects(self):
+        """Iterate over the objects; must be provided by subclasses."""
+        raise NotImplementedError(self.iterobjects)
+
+
+class ObjectStoreIterator(ObjectIterator):
+    """ObjectIterator that works on top of an ObjectStore."""
+
+    def __init__(self, store, sha_iter):
+        """Create the iterator.
+
+        :param store: ObjectStore to look objects up in
+        :param sha_iter: Iterable of shas (consumed lazily, cached locally)
+        """
+        self.store = store
+        self.sha_iter = sha_iter
+        self._shas = []  # shas already pulled from sha_iter, for re-iteration
+
+    def __iter__(self):
+        """Yield (object, path) pairs for each sha."""
+        for sha, path in self.itershas():
+            yield self.store[sha], path
+
+    def iterobjects(self):
+        """Yield just the objects, without their paths."""
+        for o, path in self:
+            yield o
+
+    def itershas(self):
+        """Yield shas, replaying cached ones before consuming sha_iter."""
+        for sha in self._shas:
+            yield sha
+        for sha in self.sha_iter:
+            self._shas.append(sha)
+            yield sha
+
+    def __contains__(self, needle):
+        """Check if an object is present.
+
+        :param needle: SHA1 of the object to check for
+        """
+        return needle in self.store
+
+    def __getitem__(self, key):
+        """Find an object by SHA1."""
+        return self.store[key]
+
+    def __len__(self):
+        """Return the number of objects."""
+        # Forces consumption of the whole sha iterator.
+        return len(list(self.itershas()))
+
+
+def tree_lookup_path(lookup_obj, root_sha, path):
+    """Look up an object by walking a slash-separated path from a root tree.
+
+    :param lookup_obj: Callable mapping a sha to an object
+    :param root_sha: Sha of the tree to start walking from
+    :param path: Slash-separated path below the root tree
+    :return: The object found at the end of the path
+    :raise NotTreeError: if an intermediate object is not a Tree
+    """
+    parts = path.split("/")
+    sha = root_sha
+    for p in parts:
+        obj = lookup_obj(sha)
+        if type(obj) is not Tree:
+            raise NotTreeError(sha)
+        if p == '':
+            continue  # tolerate leading/trailing/double slashes
+        mode, sha = obj[p]
+    return lookup_obj(sha)

dulwich/objects.py

+# objects.py -- Access to base git objects
+# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
+# Copyright (C) 2008 Jelmer Vernooij <jelmer@samba.org>
+# 
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; version 2
+# of the License or (at your option) a later version of the License.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+
+"""Access to base git objects."""
+
+
+import mmap
+import os
+import sha
+import zlib
+
+from errors import (
+    NotBlobError,
+    NotCommitError,
+    NotTreeError,
+    )
+
+BLOB_ID = "blob"
+TAG_ID = "tag"
+TREE_ID = "tree"
+COMMIT_ID = "commit"
+PARENT_ID = "parent"
+AUTHOR_ID = "author"
+COMMITTER_ID = "committer"
+OBJECT_ID = "object"
+TYPE_ID = "type"
+TAGGER_ID = "tagger"
+
+def _decompress(string):
+    """Fully zlib-decompress a string and return the result."""
+    dcomp = zlib.decompressobj()
+    dcomped = dcomp.decompress(string)
+    dcomped += dcomp.flush()
+    return dcomped
+
+
+def sha_to_hex(sha):
+    """Takes a string (20 raw sha bytes) and returns the hex of the sha within"""
+    hexsha = "".join(["%02x" % ord(c) for c in sha])
+    # NOTE(review): "%d" % hexsha formats a *string* with %d -- if this
+    # assert ever fires it raises TypeError instead of the intended
+    # message; should be "%d" % len(hexsha).
+    assert len(hexsha) == 40, "Incorrect length of sha1 string: %d" % hexsha
+    return hexsha
+
+
+def hex_to_sha(hex):
+    """Takes a hex sha (40 chars) and returns a binary sha (20 bytes)"""
+    # NOTE(review): "Incorrent" is a typo in the runtime assert message
+    # (left untouched here); `hex` also shadows the builtin.
+    assert len(hex) == 40, "Incorrent length of hexsha: %s" % hex
+    return ''.join([chr(int(hex[i:i+2], 16)) for i in xrange(0, len(hex), 2)])
+
+
+class ShaFile(object):
+    """A git SHA file (base class for Blob, Tree, Commit, Tag)."""
+  
+    @classmethod
+    def _parse_legacy_object(cls, map):
+        """Parse a legacy object, creating it and setting object._text"""
+        # Legacy loose objects are fully zlib-compressed:
+        # "<type> <size>\0<content>".
+        text = _decompress(map)
+        object = None
+        for posstype in type_map.keys():
+            if text.startswith(posstype):
+                object = type_map[posstype]()
+                text = text[len(posstype):]
+                break
+        assert object is not None, "%s is not a known object type" % text[:9]
+        assert text[0] == ' ', "%s is not a space" % text[0]
+        text = text[1:]
+        # Parse the decimal size field digit by digit.
+        size = 0
+        i = 0
+        while text[0] >= '0' and text[0] <= '9':
+            if i > 0 and size == 0:
+                assert False, "Size is not in canonical format"
+            size = (size * 10) + int(text[0])
+            text = text[1:]
+            i += 1
+        object._size = size
+        assert text[0] == "\0", "Size not followed by null"
+        text = text[1:]
+        object._text = text
+        return object
+
+    def as_legacy_object(self):
+        """Serialize in the legacy loose-object format (zlib-compressed)."""
+        return zlib.compress("%s %d\0%s" % (self._type, len(self._text), self._text))
+  
+    def as_raw_string(self):
+        """Return a (numeric type, raw contents) tuple for this object."""
+        return self._num_type, self._text
+  
+    @classmethod
+    def _parse_object(cls, map):
+        """Parse a new style object , creating it and setting object._text"""
+        used = 0
+        byte = ord(map[used])
+        used += 1
+        # Low nibble bits 4-6 of the first byte carry the object type.
+        num_type = (byte >> 4) & 7
+        try:
+            object = num_type_map[num_type]()
+        except KeyError:
+            raise AssertionError("Not a known type: %d" % num_type)
+        # Skip the remaining size varint bytes (high bit set = continue).
+        while (byte & 0x80) != 0:
+            byte = ord(map[used])
+            used += 1
+        raw = map[used:]
+        object._text = _decompress(raw)
+        return object
+  
+    @classmethod
+    def _parse_file(cls, map):
+        """Dispatch to legacy or new-style parsing based on the header."""
+        # A zlib stream starts with 0x78 and a checksum word divisible by 31.
+        word = (ord(map[0]) << 8) + ord(map[1])
+        if ord(map[0]) == 0x78 and (word % 31) == 0:
+            return cls._parse_legacy_object(map)
+        else:
+            return cls._parse_object(map)
+  
+    def __init__(self):
+        """Don't call this directly"""
+  
+    def _parse_text(self):
+        """For subclasses to do initialisation time parsing"""
+  
+    @classmethod
+    def from_file(cls, filename):
+        """Get the contents of a SHA file on disk"""
+        size = os.path.getsize(filename)
+        f = open(filename, 'rb')
+        try:
+            # mmap the file read-only and parse straight from the mapping.
+            map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
+            shafile = cls._parse_file(map)
+            shafile._parse_text()
+            return shafile
+        finally:
+            f.close()
+  
+    @classmethod
+    def from_raw_string(cls, type, string):
+        """Creates an object of the indicated type from the raw string given.
+    
+        Type is the numeric type of an object. String is the raw uncompressed
+        contents.
+        """
+        real_class = num_type_map[type]
+        obj = real_class()
+        obj._num_type = type
+        obj._text = string
+        obj._parse_text()
+        return obj
+  
+    def _header(self):
+        """Return the canonical "<type> <size>\\0" object header."""
+        return "%s %lu\0" % (self._type, len(self._text))
+  
+    def sha(self):
+        """The SHA1 object that is the name of this object."""
+        ressha = sha.new()
+        ressha.update(self._header())
+        ressha.update(self._text)
+        return ressha
+  
+    @property
+    def id(self):
+        """Hex sha1 of this object (its git name)."""
+        return self.sha().hexdigest()
+  
+    @property
+    def type(self):
+        """Numeric git object type."""
+        return self._num_type
+  
+    def __repr__(self):
+        return "<%s %s>" % (self.__class__.__name__, self.id)
+  
+    def __eq__(self, other):
+        """Return true if the sha of the two objects match.
+  
+        The __le__ etc methods aren't overriden as they make no sense,
+        certainly at this level.
+        """
+        return self.sha().digest() == other.sha().digest()
+
+
+class Blob(ShaFile):
+    """A Git Blob object."""
+
+    _type = BLOB_ID   # textual type tag ("blob")
+    _num_type = 3     # numeric pack type for blobs
+
+    @property
+    def data(self):
+        """The text contained within the blob object."""
+        return self._text
+
+    @classmethod
+    def from_file(cls, filename):
+        """Load a blob from a loose-object file.
+
+        :raise NotBlobError: if the file contains a non-blob object
+        """
+        blob = ShaFile.from_file(filename)
+        if blob._type != cls._type:
+            raise NotBlobError(filename)
+        return blob
+
+    @classmethod
+    def from_string(cls, string):
+        """Create a blob from a string."""
+        shafile = cls()
+        shafile._text = string
+        return shafile
+
+
+class Tag(ShaFile):
+    """A Git Tag object."""
+
+    _type = TAG_ID
+    _num_type = 4
+
+    @classmethod
+    def from_file(cls, filename):
+        """Load a tag from a loose-object file.
+
+        :raise NotBlobError: if the file contains a non-tag object
+        """
+        blob = ShaFile.from_file(filename)
+        if blob._type != cls._type:
+            # NOTE(review): raising NotBlobError for a non-*tag* looks like a
+            # copy-paste from Blob -- a tag-specific error seems intended.
+            raise NotBlobError(filename)
+        return blob
+
+    @classmethod
+    def from_string(cls, string):
+        """Create a tag from a string (raw tag object contents)."""
+        shafile = cls()
+        shafile._text = string
+        return shafile
+
+    def _parse_text(self):
+        """Grab the metadata attached to the tag.
+
+        Parses the header fields in order: object sha, object type, tag
+        name, tagger and timestamp, then the free-form message.
+        """
+        text = self._text
+        count = 0
+        # "object <sha>\n"
+        assert text.startswith(OBJECT_ID), "Invalid tag object, " \
+            "must start with %s" % OBJECT_ID
+        count += len(OBJECT_ID)
+        assert text[count] == ' ', "Invalid tag object, " \
+            "%s must be followed by space not %s" % (OBJECT_ID, text[count])
+        count += 1
+        self._object_sha = text[count:count+40]
+        count += 40
+        assert text[count] == '\n', "Invalid tag object, " \
+            "%s sha must be followed by newline" % OBJECT_ID
+        count += 1
+        # "type <objecttype>\n"
+        assert text[count:].startswith(TYPE_ID), "Invalid tag object, " \
+            "%s sha must be followed by %s" % (OBJECT_ID, TYPE_ID)
+        count += len(TYPE_ID)
+        assert text[count] == ' ', "Invalid tag object, " \
+            "%s must be followed by space not %s" % (TAG_ID, text[count])
+        count += 1
+        self._object_type = ""
+        while text[count] != '\n':
+            self._object_type += text[count]
+            count += 1
+        count += 1
+        assert self._object_type in (COMMIT_ID, BLOB_ID, TREE_ID, TAG_ID), "Invalid tag object, " \
+            "unexpected object type %s" % self._object_type
+        # Replace the textual type with its class (via type_map).
+        self._object_type = type_map[self._object_type]
+
+        # "tag <name>\n"
+        assert text[count:].startswith(TAG_ID), "Invalid tag object, " \
+            "object type must be followed by %s" % (TAG_ID)
+        count += len(TAG_ID)
+        assert text[count] == ' ', "Invalid tag object, " \
+            "%s must be followed by space not %s" % (TAG_ID, text[count])
+        count += 1
+        self._name = ""
+        while text[count] != '\n':
+            self._name += text[count]
+            count += 1
+        count += 1
+
+        # "tagger <name> <email> <time> <offset>\n"
+        assert text[count:].startswith(TAGGER_ID), "Invalid tag object, " \
+            "%s must be followed by %s" % (TAG_ID, TAGGER_ID)
+        count += len(TAGGER_ID)
+        assert text[count] == ' ', "Invalid tag object, " \
+            "%s must be followed by space not %s" % (TAGGER_ID, text[count])
+        count += 1
+        self._tagger = ""
+        # Tagger runs up to and including the closing '>' of the email.
+        while text[count] != '>':
+            assert text[count] != '\n', "Malformed tagger information"
+            self._tagger += text[count]
+            count += 1
+        self._tagger += text[count]
+        count += 1
+        assert text[count] == ' ', "Invalid tag object, " \
+            "tagger information must be followed by space not %s" % text[count]
+        count += 1
+        # NOTE(review): assumes a 10-digit epoch timestamp -- TODO confirm.
+        self._tag_time = int(text[count:count+10])
+        while text[count] != '\n':
+            count += 1
+        count += 1
+        # Blank line separates the headers from the message body.
+        assert text[count] == '\n', "There must be a new line after the headers"
+        count += 1
+        self._message = text[count:]
+
+    @property
+    def object(self):
+        """Returns the object pointed by this tag, represented as a tuple(type, sha)"""
+        return (self._object_type, self._object_sha)
+
+    @property
+    def name(self):
+        """Returns the name of this tag"""
+        return self._name
+
+    @property
+    def tagger(self):
+        """Returns the name of the person who created this tag"""
+        return self._tagger
+
+    @property
+    def tag_time(self):
+        """Returns the creation timestamp of the tag.
+
+        Returns it as the number of seconds since the epoch"""
+        return self._tag_time
+
+    @property
+    def message(self):
+        """Returns the message attached to this tag"""
+        return self._message
+
+
class Tree(ShaFile):
    """A Git tree object: a mapping of entry name -> (mode, hex sha)."""

    _type = TREE_ID
    _num_type = 2

    def __init__(self):
        # name -> (mode, hexsha)
        self._entries = {}

    @classmethod
    def from_file(cls, filename):
        """Load a tree from a loose-object file.

        Raises NotTreeError when the file holds some other object type.
        """
        tree = ShaFile.from_file(filename)
        if tree._type != cls._type:
            raise NotTreeError(filename)
        return tree

    def __getitem__(self, name):
        return self._entries[name]

    def __setitem__(self, name, value):
        # Entries are stored as (mode, hexsha) pairs.
        assert isinstance(value, tuple)
        assert len(value) == 2
        self._entries[name] = value

    def __delitem__(self, name):
        del self._entries[name]

    def add(self, mode, name, hexsha):
        """Add (or replace) the entry *name* with the given mode and hex sha."""
        self._entries[name] = mode, hexsha

    def entries(self):
        """Return a list of (mode, name, hexsha) tuples, in arbitrary order."""
        return [(mode, name, hexsha) for (name, (mode, hexsha)) in self._entries.iteritems()]

    def iteritems(self):
        """Yield (name, mode, hexsha) tuples in sorted name order."""
        # BUGFIX: was `self_entries[name][0]` (missing dot) -- a NameError the
        # first time this generator, or serialize() below, was consumed.
        for name in sorted(self._entries.keys()):
            yield name, self._entries[name][0], self._entries[name][1]

    def _parse_text(self):
        """Grab the entries in the tree.

        The raw format is a repeated "<octal mode> <name>\\0<20-byte sha>".
        """
        count = 0
        while count < len(self._text):
            # Octal mode, digit by digit, up to the first space.
            mode = 0
            chr = self._text[count]
            while chr != ' ':
                assert chr >= '0' and chr <= '7', "%s is not a valid mode char" % chr
                mode = (mode << 3) + (ord(chr) - ord('0'))
                count += 1
                chr = self._text[count]
            count += 1
            # Entry name up to the NUL terminator.
            chr = self._text[count]
            name = ''
            while chr != '\0':
                name += chr
                count += 1
                chr = self._text[count]
            count += 1
            # 20 raw sha bytes, stored hex-encoded in the entry table.
            sha = self._text[count:count+20]
            hexsha = sha_to_hex(sha)
            self.add(mode, name, hexsha)
            count = count + 20

    def serialize(self):
        """Rebuild the raw text from the entry table, in sorted name order."""
        self._text = ""
        for name, mode, hexsha in self.iteritems():
            self._text += "%04o %s\0%s" % (mode, name, hex_to_sha(hexsha))