Commits

Mikhail Korobov committed c96b31e

faster base64 decoding using libb64 (this makes BytesDAWG and RecordDAWG 30% faster)

Comments (0)

Files changed (14)

 include lib/COPYING
 
 recursive-include src *.cpp *.pxd *.pyx
-recursive-include lib/dawgdic *.h
+recursive-include lib *.c *.h
 This module is based on `dawgdic`_ C++ library by
 Susumu Yata & contributors.
 
+base64 decoder is based on Public Domain libb64_ by Chris Venter.
+
+.. _libb64: http://libb64.sourceforge.net/
+
 License
 =======
 
     import pstats
     import cProfile
     print('\n====== Profiling =======\n')
-    trie = create_dawg()
+    d = create_bytes_dawg()
     WORDS = words100k()
 
+    def check_getitem(trie, words):
+        for word in words:
+            trie[word]
+
+    cProfile.runctx("check_getitem(d, WORDS)", globals(), locals(), "Profile.prof")
+
 #    def check_prefixes(trie, words):
 #        for word in words:
 #            trie.keys(word)
-#    cProfile.runctx("check_prefixes(trie, NON_WORDS_1k)", globals(), locals(), "Profile.prof")
+#    cProfile.runctx("check_prefixes(d, NON_WORDS_1k)", globals(), locals(), "Profile.prof")
 #
-    cProfile.runctx("check_trie(trie, WORDS)", globals(), locals(), "Profile.prof")
+    #cProfile.runctx("check_trie(d, WORDS)", globals(), locals(), "Profile.prof")
 
     s = pstats.Stats("Profile.prof")
     s.strip_dirs().sort_stats("time").print_stats(20)
+libb64: Base64 Encoding/Decoding Routines
+======================================
+
+Authors:
+-------
+
+Chris Venter	chris.venter@gmail.com	http://rocketpod.blogspot.com
+Copyright-Only Dedication (based on United States law) 
+or Public Domain Certification
+
+The person or persons who have associated work with this document (the
+"Dedicator" or "Certifier") hereby either (a) certifies that, to the best of
+his knowledge, the work of authorship identified is in the public domain of the
+country from which the work is published, or (b) hereby dedicates whatever
+copyright the dedicators holds in the work of authorship identified below (the
+"Work") to the public domain. A certifier, moreover, dedicates any copyright
+interest he may have in the associated work, and for these purposes, is
+described as a "dedicator" below.
+
+A certifier has taken reasonable steps to verify the copyright status of this
+work. Certifier recognizes that his good faith efforts may not shield him from
+liability if in fact the work certified is not in the public domain.
+
+Dedicator makes this dedication for the benefit of the public at large and to
+the detriment of the Dedicator's heirs and successors. Dedicator intends this
+dedication to be an overt act of relinquishment in perpetuity of all present
+and future rights under copyright law, whether vested or contingent, in the
+Work. Dedicator understands that such relinquishment of all rights includes
+the relinquishment of all rights to enforce (by lawsuit or otherwise) those
+copyrights in the Work.
+
+Dedicator recognizes that, once placed in the public domain, the Work may be
+freely reproduced, distributed, transmitted, used, modified, built upon, or
+otherwise exploited by anyone for any purpose, commercial or non-commercial,
+and in any way, including by methods that have not yet been invented or
+conceived.

lib/b64/cdecode.c

+/*
+cdecoder.c - c source to a base64 decoding algorithm implementation
+
+This is part of the libb64 project, and has been placed in the public domain.
+For details, see http://sourceforge.net/projects/libb64
+*/
+
+#include <b64/cdecode.h>
+
+int base64_decode_value(char value_in)
+{
+	static const char decoding[] = {62,-1,-1,-1,63,52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-2,-1,-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51};
+	static const char decoding_size = sizeof(decoding);
+	value_in -= 43;
+	if (value_in < 0 || value_in > decoding_size) return -1;
+	return decoding[(int)value_in];
+}
+
+void base64_init_decodestate(base64_decodestate* state_in)
+{
+	state_in->step = step_a;
+	state_in->plainchar = 0;
+}
+
+int base64_decode_block(const char* code_in, const int length_in, char* plaintext_out, base64_decodestate* state_in)
+{
+	const char* codechar = code_in;
+	char* plainchar = plaintext_out;
+	char fragment;
+	
+	*plainchar = state_in->plainchar;
+	
+	switch (state_in->step)
+	{
+		while (1)
+		{
+	case step_a:
+			do {
+				if (codechar == code_in+length_in)
+				{
+					state_in->step = step_a;
+					state_in->plainchar = *plainchar;
+					return plainchar - plaintext_out;
+				}
+				fragment = (char)base64_decode_value(*codechar++);
+			} while (fragment < 0);
+			*plainchar    = (fragment & 0x03f) << 2;
+	case step_b:
+			do {
+				if (codechar == code_in+length_in)
+				{
+					state_in->step = step_b;
+					state_in->plainchar = *plainchar;
+					return plainchar - plaintext_out;
+				}
+				fragment = (char)base64_decode_value(*codechar++);
+			} while (fragment < 0);
+			*plainchar++ |= (fragment & 0x030) >> 4;
+			*plainchar    = (fragment & 0x00f) << 4;
+	case step_c:
+			do {
+				if (codechar == code_in+length_in)
+				{
+					state_in->step = step_c;
+					state_in->plainchar = *plainchar;
+					return plainchar - plaintext_out;
+				}
+				fragment = (char)base64_decode_value(*codechar++);
+			} while (fragment < 0);
+			*plainchar++ |= (fragment & 0x03c) >> 2;
+			*plainchar    = (fragment & 0x003) << 6;
+	case step_d:
+			do {
+				if (codechar == code_in+length_in)
+				{
+					state_in->step = step_d;
+					state_in->plainchar = *plainchar;
+					return plainchar - plaintext_out;
+				}
+				fragment = (char)base64_decode_value(*codechar++);
+			} while (fragment < 0);
+			*plainchar++   |= (fragment & 0x03f);
+		}
+	}
+	/* control should not reach here */
+	return plainchar - plaintext_out;
+}
+

lib/b64/cdecode.h

+/*
+cdecode.h - c header for a base64 decoding algorithm
+
+This is part of the libb64 project, and has been placed in the public domain.
+For details, see http://sourceforge.net/projects/libb64
+*/
+
+#ifndef BASE64_CDECODE_H
+#define BASE64_CDECODE_H
+
+typedef enum
+{
+	step_a, step_b, step_c, step_d
+} base64_decodestep;
+
+typedef struct
+{
+	base64_decodestep step;
+	char plainchar;
+} base64_decodestate;
+
+void base64_init_decodestate(base64_decodestate* state_in);
+
+int base64_decode_value(char value_in);
+
+int base64_decode_block(const char* code_in, const int length_in, char* plaintext_out, base64_decodestate* state_in);
+
+#endif /* BASE64_CDECODE_H */

lib/b64/cencode.c

+/*
+cencoder.c - c source to a base64 encoding algorithm implementation
+
+This is part of the libb64 project, and has been placed in the public domain.
+For details, see http://sourceforge.net/projects/libb64
+*/
+
+#include <b64/cencode.h>
+
+const int CHARS_PER_LINE = 72;
+
+void base64_init_encodestate(base64_encodestate* state_in)
+{
+	state_in->step = step_A;
+	state_in->result = 0;
+	state_in->stepcount = 0;
+}
+
+char base64_encode_value(char value_in)
+{
+	static const char* encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+	if (value_in > 63) return '=';
+	return encoding[(int)value_in];
+}
+
+int base64_encode_block(const char* plaintext_in, int length_in, char* code_out, base64_encodestate* state_in)
+{
+	const char* plainchar = plaintext_in;
+	const char* const plaintextend = plaintext_in + length_in;
+	char* codechar = code_out;
+	char result;
+	char fragment;
+	
+	result = state_in->result;
+	
+	switch (state_in->step)
+	{
+		while (1)
+		{
+	case step_A:
+			if (plainchar == plaintextend)
+			{
+				state_in->result = result;
+				state_in->step = step_A;
+				return codechar - code_out;
+			}
+			fragment = *plainchar++;
+			result = (fragment & 0x0fc) >> 2;
+			*codechar++ = base64_encode_value(result);
+			result = (fragment & 0x003) << 4;
+	case step_B:
+			if (plainchar == plaintextend)
+			{
+				state_in->result = result;
+				state_in->step = step_B;
+				return codechar - code_out;
+			}
+			fragment = *plainchar++;
+			result |= (fragment & 0x0f0) >> 4;
+			*codechar++ = base64_encode_value(result);
+			result = (fragment & 0x00f) << 2;
+	case step_C:
+			if (plainchar == plaintextend)
+			{
+				state_in->result = result;
+				state_in->step = step_C;
+				return codechar - code_out;
+			}
+			fragment = *plainchar++;
+			result |= (fragment & 0x0c0) >> 6;
+			*codechar++ = base64_encode_value(result);
+			result  = (fragment & 0x03f) >> 0;
+			*codechar++ = base64_encode_value(result);
+			
+			++(state_in->stepcount);
+			if (state_in->stepcount == CHARS_PER_LINE/4)
+			{
+				*codechar++ = '\n';
+				state_in->stepcount = 0;
+			}
+		}
+	}
+	/* control should not reach here */
+	return codechar - code_out;
+}
+
+int base64_encode_blockend(char* code_out, base64_encodestate* state_in)
+{
+	char* codechar = code_out;
+	
+	switch (state_in->step)
+	{
+	case step_B:
+		*codechar++ = base64_encode_value(state_in->result);
+		*codechar++ = '=';
+		*codechar++ = '=';
+		break;
+	case step_C:
+		*codechar++ = base64_encode_value(state_in->result);
+		*codechar++ = '=';
+		break;
+	case step_A:
+		break;
+	}
+	*codechar++ = '\n';
+	
+	return codechar - code_out;
+}
+

lib/b64/cencode.h

+/*
+cencode.h - c header for a base64 encoding algorithm
+
+This is part of the libb64 project, and has been placed in the public domain.
+For details, see http://sourceforge.net/projects/libb64
+*/
+
+#ifndef BASE64_CENCODE_H
+#define BASE64_CENCODE_H
+
+typedef enum
+{
+	step_A, step_B, step_C
+} base64_encodestep;
+
+typedef struct
+{
+	base64_encodestep step;
+	char result;
+	int stepcount;
+} base64_encodestate;
+
+void base64_init_encodestate(base64_encodestate* state_in);
+
+char base64_encode_value(char value_in);
+
+int base64_encode_block(const char* plaintext_in, int length_in, char* code_out, base64_encodestate* state_in);
+
+int base64_encode_blockend(char* code_out, base64_encodestate* state_in);
+
+#endif /* BASE64_CENCODE_H */
+// :mode=c++:
+/*
+decode.h - c++ wrapper for a base64 decoding algorithm
+
+This is part of the libb64 project, and has been placed in the public domain.
+For details, see http://sourceforge.net/projects/libb64
+*/
+#ifndef BASE64_DECODE_H
+#define BASE64_DECODE_H
+
+#include <iostream>
+
+namespace base64
+{
+	extern "C"
+	{
+		#include "cdecode.h"
+	}
+
+	struct decoder
+	{
+		base64_decodestate _state;
+		int _buffersize;
+
+		decoder(int buffersize_in = 4096)
+		: _buffersize(buffersize_in)
+		{}
+
+		int decode(char value_in)
+		{
+			return base64_decode_value(value_in);
+		}
+
+		int decode(const char* code_in, const int length_in, char* plaintext_out)
+		{
+			return base64_decode_block(code_in, length_in, plaintext_out, &_state);
+		}
+
+		void decode(std::istream& istream_in, std::ostream& ostream_in)
+		{
+			base64_init_decodestate(&_state);
+			//
+			const int N = _buffersize;
+			char* code = new char[N];
+			char* plaintext = new char[N];
+			int codelength;
+			int plainlength;
+
+			do
+			{
+				istream_in.read((char*)code, N);
+				codelength = istream_in.gcount();
+				plainlength = decode(code, codelength, plaintext);
+				ostream_in.write((const char*)plaintext, plainlength);
+			}
+			while (istream_in.good() && codelength > 0);
+			//
+			base64_init_decodestate(&_state);
+
+			delete [] code;
+			delete [] plaintext;
+		}
+
+		void init()
+		{
+			base64_init_decodestate(&_state);
+		}
+	};
+
+} // namespace base64
+
+
+
+#endif // BASE64_DECODE_H
+
+// :mode=c++:
+/*
+encode.h - c++ wrapper for a base64 encoding algorithm
+
+This is part of the libb64 project, and has been placed in the public domain.
+For details, see http://sourceforge.net/projects/libb64
+*/
+#ifndef BASE64_ENCODE_H
+#define BASE64_ENCODE_H
+
+#include <iostream>
+
+namespace base64
+{
+	extern "C" 
+	{
+		#include "cencode.h"
+	}
+
+	struct encoder
+	{
+		base64_encodestate _state;
+		int _buffersize;
+
+		encoder(int buffersize_in = BUFFERSIZE)
+		: _buffersize(buffersize_in)
+		{}
+
+		int encode(char value_in)
+		{
+			return base64_encode_value(value_in);
+		}
+
+		int encode(const char* code_in, const int length_in, char* plaintext_out)
+		{
+			return base64_encode_block(code_in, length_in, plaintext_out, &_state);
+		}
+
+		int encode_end(char* plaintext_out)
+		{
+			return base64_encode_blockend(plaintext_out, &_state);
+		}
+
+		void encode(std::istream& istream_in, std::ostream& ostream_in)
+		{
+			base64_init_encodestate(&_state);
+			//
+			const int N = _buffersize;
+			char* plaintext = new char[N];
+			char* code = new char[2*N];
+			int plainlength;
+			int codelength;
+
+			do
+			{
+				istream_in.read(plaintext, N);
+				plainlength = istream_in.gcount();
+				//
+				codelength = encode(plaintext, plainlength, code);
+				ostream_in.write(code, codelength);
+			}
+			while (istream_in.good() && plainlength > 0);
+
+			codelength = encode_end(code);
+			ostream_in.write(code, codelength);
+			//
+			base64_init_encodestate(&_state);
+
+			delete [] code;
+			delete [] plaintext;
+		}
+	};
+
+} // namespace base64
+
+#endif // BASE64_ENCODE_H
+
     ext_modules = [
         Extension(
             "dawg",
-            sources = glob.glob('src/*.cpp'),
+            sources = glob.glob('src/*.cpp') + glob.glob('lib/b64/*.c'),
             include_dirs=['lib'],
             language = "c++",
         )

src/b64_decode.pxd

+from iostream cimport istream, ostream
+
+cdef extern from "../lib/b64/decode.h" namespace "base64":
+
+    cdef cppclass decoder:
+        decoder()
+        decoder(int buffersize_in)
+
+        int decode(char* code_in, int length_in, char* plaintext_out)
+        void init()
+
+        void decode(istream istream_in, ostream ostream_in)
+# cython: profile=True
 from __future__ import unicode_literals
 from libcpp.string cimport string
 from iostream cimport stringstream, istream, ostream
 from _base_types cimport BaseType, SizeType
 cimport _guide_builder
 cimport _dictionary_builder
+cimport b64_decode
 
 import operator
 import collections
         cdef BaseType index
         cdef list res = []
         cdef bytes payload
-        cdef bytes decoded_payload
 
         if not self._follow_key(key, &index):
             return res
 
+        cdef char[32768] _buf
+        cdef int _len
+        cdef b64_decode.decoder dec
+
         cdef Completer* completer = new Completer(self.dct, self.guide)
         try:
             completer.Start(index)
             while completer.Next():
-                payload = completer.key()[:completer.length()]
-                decoded_payload = a2b_base64(payload)
-                res.append(decoded_payload)
+                dec.init()
+                _len = dec.decode(completer.key(), completer.length(), _buf)
+                payload = _buf[:_len]
+                res.append(payload)
         finally:
             del completer
 
         return res
 
+
     cpdef list items(self, unicode prefix=""):
         cdef bytes b_prefix = prefix.encode('utf8')
         cdef bytes key, raw_key, value, decoded_value