Commits

Woojong Koh  committed 4887db9

Import boyer_moore.h and README.md

  • Participants
  • Parent commits bcea59e

Comments (0)

Files changed (9)

+## How to compile demo programs ##
+1. Install [SCons](http://www.scons.org/)
+1. Run `scons`
+1. Look at `build` directory
+
+## Dependencies ##
+1. [Boost C++ Libraries](http://www.boost.org/)
+SConscript('tests/SConscript', variant_dir='build')

File rope/aho_corasick.h

 #ifndef AHO_CORASICK_H
 #define AHO_CORASICK_H
 
-#include <iostream>
 #include <string>
 #include <stdexcept>
 
 
 using namespace boost;
 
+namespace rope
+{
 template<class T = std::string>
 class aho_corasick
 {
 		goto_t goto_;
 		failure_t failure_;
 };
+}
 
 #undef MAP
 #undef PAIR_HASH

File rope/aho_corasick_impl.h

 #include <iostream>
 #include <boost/date_time/posix_time/posix_time.hpp>
 
+namespace rope
+{
+
 template<class T>
 void aho_corasick<T>::build() throw(std::length_error)
 {
 	return text;
 }
 
+}
+
 #endif

File rope/boyer_moore.h

+#ifndef BOYER_MOORE_H
+#define BOYER_MOORE_H
+
+#include <string>
+#include <map>
+#include <vector>
+#include <iterator>
+#include <iostream>
+#include <algorithm>
+#include <limits>
+#include <climits>
+
+using namespace std;
+
+#include <boost/date_time/posix_time/posix_time.hpp>
+
+#include "fast_map.h"
+//#include "loki/AssocVector.h"
+
+
+namespace rope
+{
+	template<class T = std::string>
+	class BM
+	{
+	public:
+		typedef T string_t;
+		typedef typename string_t::value_type str_value_t;
+		typedef typename string_t::size_type str_size_t;
+		typedef typename string_t::difference_type str_diff_t;
+		typedef typename string_t::const_iterator str_const_iter;
+		typedef typename string_t::const_reverse_iterator str_const_rev_iter;
+
+		class bcs_table
+		{
+			//static const int BCS_MAX = std::numeric_limits<unsigned char>::max() + 1;
+			static const int BCS_MAX = CHAR_MAX + 1;
+
+		public:
+			typedef unsigned int size_type;
+
+			bcs_table()
+				: bcs_(NULL)
+				  , min_(0)
+				  , max_(0)
+				  , value_size_(0)
+			{}
+
+			void build(const string_t& value)
+			{
+				value_size_ = value.size();
+				str_const_iter v_end = value.end();
+
+				min_ = *std::min_element(value.begin(), v_end);
+				max_ = *std::max_element(value.begin(), v_end);
+
+				str_value_t size = std::max(max_ - min_ + 1, 0);
+				//bcs_.resize(std::min<int>(size, BCS_MAX), value_size_);
+				delete [] bcs_;
+
+				//bcs_size_ = std::min<size_type>(size, BCS_MAX);
+				bcs_size_ = size;
+				bcs_ = new str_size_t[ bcs_size_ ];
+				std::fill(bcs_, bcs_ + bcs_size_, value_size_);
+
+				str_const_iter v_end_1 = v_end;
+				std::advance(v_end_1, -1);
+
+				for (str_const_iter it = value.begin(); it != v_end_1; ++it)
+				{
+					str_value_t idx = *it - min_;
+					//if (idx >= bcs_.size()) continue;
+					if (idx >= bcs_size_) continue;
+
+					bcs_[idx] = std::distance(it, v_end) - 1;
+				}
+			}
+
+			inline str_size_t operator[](str_value_t idx) const
+			{
+				if (idx > max_ || idx < min_) return value_size_;
+				return bcs_[ idx - min_ ];
+			}
+
+			size_type size() const { return bcs_size_; }
+
+		private:
+			//std::vector<str_size_t> bcs_;
+			str_size_t* bcs_;
+			size_type bcs_size_;
+
+			str_value_t min_;
+			str_value_t max_;
+			str_size_t value_size_;
+		};
+
+		//typedef map<str_value_t, str_size_t> MapBCS;
+		//typedef Loki::AssocVector<str_value_t, str_size_t> MapBCS;
+		//typedef vector<str_diff_t> MapBCS;
+		typedef bcs_table MapBCS;
+		typedef vector<str_size_t> VecGSS;
+
+		BM(const string_t& value)
+			: value_(value)
+		{
+			make_table();
+		}
+
+		void make_table()
+		{
+			const string_t& value = value_;
+
+			boost::posix_time::ptime t1(boost::posix_time::microsec_clock::local_time());
+
+			// bad character shift
+			/*
+			bcs_.clear();
+			for (str_const_rev_iter it = value_.rbegin() + 1; it != value.rend(); ++it)
+			{
+				typename MapBCS::iterator bcs_it = bcs_.find(*it);
+				if (bcs_it == bcs_.end())
+				{
+					const str_diff_t it_idx = std::distance(value.rbegin(), it);
+					bcs_it = bcs_.insert(bcs_it, typename MapBCS::value_type(*it, it_idx));
+					std::cout << "bcs " << *it << " " << it_idx << std::endl;
+				}
+			}
+			*/
+
+			/*
+			std::fill(bcs2_, bcs2_ + sizeof(bcs2_) / sizeof(*bcs2_), value_.size());
+			for (str_const_iter it = value_.begin(); it != value_.end() - 1; ++it)
+			{
+				if (*it >= BCS_MAX) continue;
+
+				const str_diff_t it_idx = std::distance(it, value.end()) - 1;
+				bcs2_[*it] = it_idx;
+			}
+			*/
+			/*
+			str_const_iter min_it = std::min_element(value_.begin(), value_.end());
+			str_const_iter max_it = std::max_element(value_.begin(), value_.end());
+			str_value_t size = *max_it - *min_it + 1;
+			bcs2_.resize(std::min<int>(size, BCS_MAX), value_.size());
+			min_ = *min_it;
+
+			for (str_const_iter it = value_.begin(); it != value_.end() - 1; ++it)
+			{
+				str_value_t idx = *it - min_;
+				if ( idx >= bcs2_.size()) continue;
+
+				const str_diff_t it_idx = std::distance(it, value.end()) - 1;
+				bcs2_[idx] = it_idx;
+			}
+			*/
+			bcs2_.build(value_);
+			std::cout << "bcs size: " << bcs2_.size() << std::endl;
+
+			/*
+			bcs_.resize(256, value_.size()); // resize & reset
+			for (str_const_iter it = value_.begin(); it != value_.end() - 1; ++it)
+			{
+				const str_diff_t it_idx = std::distance(it, value.end()) - 1;
+				bcs_[*it] = it_idx;
+			}
+			*/
+
+			boost::posix_time::ptime t2(boost::posix_time::microsec_clock::local_time());
+
+			// good suffix shift
+			gss_.resize(value_.size(), value_.size()); // resize & reset
+			for (str_const_rev_iter it = value_.rbegin(); it != value.rend(); ++it)
+			{
+				str_const_rev_iter temp = it;
+				std::advance(temp, 1);
+				for (str_const_rev_iter it2 = temp; it2 != value.rend(); ++it2)
+				{
+					if (*it != *it2 && std::equal(it.base(), value.end(), it2.base()))
+					{
+						const str_diff_t it_idx = std::distance(value.rbegin(), it);
+						gss_[it_idx] = std::distance(it, it2);
+						std::cout << "gss " << it_idx << " " << gss_[it_idx] << std::endl;
+						break;
+					}
+				}
+			}
+			boost::posix_time::ptime t3(boost::posix_time::microsec_clock::local_time());
+			std::cout << "bcs : " << t2 - t1 << std::endl;
+			std::cout << "gss : " << t3 - t2 << std::endl;
+			std::cout << "overall : " << t3 - t1 << std::endl;
+		}
+
+		template<class InputIterator>
+		InputIterator find_horspool(InputIterator first, InputIterator last) const
+		{
+			typedef reverse_iterator<InputIterator> DataRevIter;
+			typedef const reverse_iterator<InputIterator> DataConstRevIter;
+
+			typedef InputIterator DataIter;
+			typedef const InputIterator DataConstIter;
+			typedef typename iterator_traits<InputIterator>::value_type DataValType;
+
+			const string_t& value = value_;
+
+			const DataRevIter rbeg(last);
+			const DataRevIter rend(first + value_.size() - 1);
+
+			const str_const_rev_iter v_rbeg = value_.rbegin();
+			const str_const_rev_iter v_rend = value_.rend();
+			const str_size_t v_size = value_.size();
+
+			boost::posix_time::ptime t1(boost::posix_time::microsec_clock::local_time());
+
+			//const int bcs2_size = bcs2_.size();
+			//const int bcs2_size = BCS_MAX;
+			//typename vector<str_size_t>::const_iterator bcs2_beg = bcs2_.begin();
+
+			InputIterator r = last;
+			str_const_rev_iter v_r1 = v_rbeg;
+			for (DataRevIter rit = rend; rit >= rbeg + v_size;)
+			{
+				//if (*v_rbeg == *rit && std::equal(v_rbeg, v_rend, rit))
+				if (std::equal(v_rbeg, v_rend, rit))
+				{
+					r = rit.base() - v_size;
+					break;
+				}
+
+				//str_diff_t bcs = v_size;
+				/*
+				typename MapBCS::const_iterator bcs_it = bcs_.find(*rit);
+				if (bcs_it != bcs_.end())
+					bcs = bcs_it->second;
+				*/
+				//const DataValType d = *rit - min_;
+				//str_diff_t bcs = (d < 256 ? bcs2_beg_[ (unsigned char)d ] : v_size );
+				//rit -= (d < bcs2_size ? bcs2_[ (unsigned char)d ] : v_size );
+				rit -= bcs2_[*rit];
+
+				/*
+				const DataValType& d = *rit;
+				if (d < bcs2_size)
+					bcs = *(bcs2_beg + (unsigned char)d );
+					*/
+					//bcs = bcs2_[ (unsigned char)*rit ];
+
+				//rit -= bcs;
+			}
+			
+			boost::posix_time::ptime t2(boost::posix_time::microsec_clock::local_time());
+			std::cout << "find(horspool) : " << t2 - t1 << std::endl;
+			std::cout << "pos(horspool) : " << r - first << std::endl;
+
+			return r;
+		}
+
+		template<class InputIterator>
+		InputIterator find_bm_slow(InputIterator first, InputIterator last) const
+		{
+			typedef reverse_iterator<InputIterator> DataRevIter;
+			typedef const reverse_iterator<InputIterator> DataConstRevIter;
+
+			typedef InputIterator DataIter;
+			typedef const InputIterator DataConstIter;
+			typedef typename iterator_traits<InputIterator>::value_type DataValType;
+
+			const string_t& value = value_;
+
+			const DataRevIter rbeg(last);
+			const DataRevIter rend(first + value_.size() - 1);
+
+			const str_const_rev_iter v_rbeg = value_.rbegin();
+			const str_const_rev_iter v_rend = value_.rend();
+			const str_size_t v_size = value_.size();
+
+			//const int bcs2_size = bcs2_.size();
+			//const int bcs2_size = BCS_MAX;
+			//typename vector<str_size_t>::const_iterator bcs2_beg = bcs2_.begin();
+			//const typename vector<str_size_t>::value_type* bcs2_beg_ = &*bcs2_.begin();
+
+			InputIterator r = last;
+			boost::posix_time::ptime t1(boost::posix_time::microsec_clock::local_time());
+			for (DataRevIter rit = rend; rit >= rbeg + v_size;)
+			{
+				const pair<str_const_rev_iter, DataConstRevIter>&
+					result = mismatch(v_rbeg, v_rend, rit);
+
+				if (result.first == v_rend)
+				{
+					r = rit.base();
+					std::advance(r, -v_size);
+					break;
+				}
+
+				/*
+				typename MapBCS::const_iterator bcs_it = bcs_.find(*(result.second));
+				if (bcs_it != bcs_.end())
+					bcs = bcs_it->second;
+				*/
+				//const DataValType d = *(result.second) - min_;
+				//str_diff_t bcs = (d < 256 ? bcs2_beg_[ (unsigned char)d ] : v_size );
+				str_size_t bcs = bcs2_[ *result.second ];
+				/*
+				if (d < 256)
+					bcs = ;
+					//bcs = bcs2_[ (unsigned char)*rit ];
+				*/
+
+				const str_size_t mismatch_idx = std::distance(v_rbeg, result.first);
+				std::advance(rit, -std::max<str_diff_t>(mismatch_idx - bcs, gss_[mismatch_idx])); // MSVC runtime error
+			}
+
+			boost::posix_time::ptime t2(boost::posix_time::microsec_clock::local_time());
+			std::cout << "find(bm_slow) : " << t2 - t1 << std::endl;
+			std::cout << "pos(bm_slow) : " << r - first << std::endl;
+
+			return r;
+		}
+
+		template<class InputIterator>
+		InputIterator find_bf(InputIterator first, InputIterator last) const
+		{
+			boost::posix_time::ptime t0(boost::posix_time::microsec_clock::local_time());
+			InputIterator r = std::search(first, last, value_.begin(), value_.end());
+			boost::posix_time::ptime t1(boost::posix_time::microsec_clock::local_time());
+
+			std::cout << "find(brute force) : " << t1 - t0 << std::endl;
+			std::cout << "pos(brute force) : " << r - first << std::endl;
+			std::copy(r, r + 20, std::ostream_iterator<char>(std::cout, ""));
+			std::cout << std::endl;
+
+			return r;
+		}
+
+		template<class InputIterator>
+		InputIterator find(InputIterator first, InputIterator last) const
+		{
+			typedef reverse_iterator<InputIterator> DataRevIter;
+			typedef const reverse_iterator<InputIterator> DataConstRevIter;
+
+			typedef InputIterator DataIter;
+			typedef const InputIterator DataConstIter;
+
+			const string_t& value = value_;
+
+			const DataRevIter rbeg(last);
+			const DataRevIter rend(first + value_.size() - 1);
+
+			const str_const_rev_iter v_rbeg = value_.rbegin();
+			const str_const_rev_iter v_rend = value_.rend();
+			const str_size_t v_size = value_.size();
+
+			//const int bcs2_size = bcs2_.size();
+			//const int bcs2_size = BCS_MAX;
+			//typename vector<str_size_t>::const_iterator bcs2_beg = bcs2_.begin();
+
+			boost::posix_time::ptime t1(boost::posix_time::microsec_clock::local_time());
+			InputIterator r = last;
+			for (DataRevIter rit = rend; rit >= rbeg;)
+			{
+				str_const_rev_iter v_r1 = v_rbeg;
+				DataRevIter r1 = rit;
+
+				str_diff_t i;
+				for (i = 0; *v_r1 == *r1; ++i)
+				{
+					if (i == v_size - 1)
+					{
+						r = rit.base() - v_size;
+						goto end;
+					}
+					++v_r1;
+					++r1;
+				}
+
+				/*
+			   str_diff_t bcs = v_size;
+				typename MapBCS::const_iterator bcs_it = bcs_.find(*r1);
+				if (bcs_it != bcs_.end())
+					bcs = bcs_it->second;
+				*/
+
+				//bcs = bcs2_[ *r1 ];
+				//bcs = bcs2_[ (unsigned char)*rit ];
+
+				std::advance(rit, -max<str_diff_t>(i - bcs2_[*r1], gss_[i])); // MSVC runtime error
+			}
+end:
+
+			boost::posix_time::ptime t2(boost::posix_time::microsec_clock::local_time());
+			std::cout << "find(bm) : " << t2 - t1 << std::endl;
+			std::cout << "pos(bm) : " << r - first << std::endl;
+
+			return r;
+		}
+
+	private:
+		string_t value_;
+		MapBCS bcs_;
+		MapBCS bcs2_;
+		VecGSS gss_;
+		int min_;
+	};
+}
+
+#endif

File rope/fast_map.h

+#ifndef FAST_MAP_H
+#define FAST_MAP_H
+
+struct key_comp
+{
+	template<class T>
+		bool operator()(const T& lhs, const T& rhs)
+		{
+			return lhs.first < rhs.first;
+		}
+};
+
+template<class T, class U>
+class fast_map
+{
+public:
+	typedef T key_type;
+	typedef U data_type;
+	typedef std::pair<key_type, data_type> value_type;
+
+	typedef std::vector<value_type> Vec;
+	typedef typename Vec::iterator iterator;
+	typedef typename Vec::const_iterator const_iterator;
+
+	fast_map() {}
+
+	iterator insert(iterator it, const value_type& value)
+	{
+		/*
+		if (it != end() && it->first != value.first)
+			it->second = value.second;
+		else
+			if (key_comp()(value, *it))
+				it = values_.insert(it, value);
+		return it;
+		*/
+		return insert(value);
+	}
+	iterator insert(const value_type& value)
+	{
+		iterator it = lower_bound(value.first);
+		if (it != end() && it->first == value.first)
+			it->second = value.second;
+		else
+			it = values_.insert(it, value);
+		return it;
+	}
+
+	iterator lower_bound(const key_type& key)
+	{
+		return std::lower_bound(values_.begin(), values_.end(), value_type(key, data_type()), key_comp());
+	}
+
+	const_iterator lower_bound(const key_type& key) const
+	{
+		return std::lower_bound(values_.begin(), values_.end(), value_type(key, data_type()), key_comp());
+	}
+
+	iterator find(const key_type& key)
+	{
+		iterator it = lower_bound(key);
+		if (it != end() && it->first == key)
+			return it;
+		return end();
+	}
+
+	const_iterator find(const key_type& key) const
+	{
+		const_iterator it = lower_bound(key);
+		if (it != end() && it->first == key)
+			return it;
+		return end();
+	}
+
+	iterator end() { return values_.end(); }
+	const_iterator end() const { return values_.end(); }
+	void clear() { values_.clear(); }
+
+private:
+	Vec values_;
+};
+
+#endif

File tests/SConscript

+opts = Options()
+opts.Add(BoolOption('RELEASE', 'Set to build for release', 0))
+env = Environment(options = opts, CPPDEFINES={'RELEASE' : '${RELEASE}'})
+Help(opts.GenerateHelpText(env))
+
+if env['PLATFORM'] == 'win32':
+	boost_include_dir = 'D:/Program Files/boost/boost_1_34_1'
+	boost_lib_dir = 'D:/Program Files/boost/boost_1_34_1/stage/lib'
+	boost_lib_suffix = '-mgw34-1_34_1'
+
+	env.Append(CPPPATH = boost_include_dir)
+	env.Append(LIBPATH = boost_lib_dir)
+else:
+	boost_include_dir = '/usr/local/include/boost-1_35'
+	boost_lib_dir = ''
+	boost_lib_suffix = '-gcc42-mt'
+
+	env.Append(CPPPATH = boost_include_dir)
+	env.Append(LIBPATH = boost_lib_dir)
+
+#cc_and_link = ['-Wall', '-O3', '-I/usr/include/stlport', '-pthread']
+cc_and_link = ['-Wall', '-O3']
+env.Append(CXXFLAGS = cc_and_link, LINKFLAGS = cc_and_link) #+ ['-lstlport'])
+
+if env['RELEASE'] == False:
+	env.Append(CXXFLAGS = '')
+
+#env.Program(['ac_test.cpp'])
+env.Program(['bm_test.cpp'])

File tests/SConstruct

-opts = Options()
-opts.Add(BoolOption('RELEASE', 'Set to build for release', 0))
-env = Environment(options = opts, CPPDEFINES={'RELEASE' : '${RELEASE}'})
-Help(opts.GenerateHelpText(env))
-
-if env['PLATFORM'] == 'win32':
-	boost_include_dir = 'D:/Program Files/boost/boost_1_34_1'
-	boost_lib_dir = 'D:/Program Files/boost/boost_1_34_1/stage/lib'
-	boost_lib_suffix = '-mgw34-1_34_1'
-
-	env.Append(CPPPATH = boost_include_dir)
-	env.Append(LIBPATH = boost_lib_dir)
-else:
-	boost_include_dir = '/usr/local/include/boost-1_35'
-	boost_lib_dir = ''
-	boost_lib_suffix = '-gcc42-mt'
-
-	env.Append(CPPPATH = boost_include_dir)
-	env.Append(LIBPATH = boost_lib_dir)
-
-cc_and_link = ['-Wall', '-O2']
-env.Append(CXXFLAGS = cc_and_link, LINKFLAGS = cc_and_link)
-
-if env['RELEASE'] == False:
-	env.Append(CXXFLAGS = ' -g')
-
-env.Program(['ac_test.cpp'])

File tests/bm_test.cpp

+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <iterator>
+#include <vector>
+
+#include "../rope/boyer_moore.h"
+using namespace rope;
+
+#include "../rope/fast_map.h"
+
+int main()
+{
+	std::ifstream datafile("./bm.dat");
+	if (!datafile) {
+		std::cerr << "No ./bm.dat file" << std::endl;
+		return EXIT_FAILURE;
+	}
+
+	std::string test_str;
+	std::string line;
+	for (; std::getline(datafile, line);) {
+		test_str += line;
+	}
+
+	typedef fast_map<char, int> map_t;
+	map_t temp;
+	std::cerr << "1" << std::endl;
+	temp.insert(map_t::value_type('a', 1));
+	std::cerr << "2" << std::endl;
+	temp.insert(map_t::value_type('f', 10));
+	temp.insert(map_t::value_type('c', 15));
+
+	std::cerr << "3" << std::endl;
+	map_t::iterator it = temp.find('f');
+	std::cerr << "4" << std::endl;
+	if (it != temp.end())
+		std::cout << it->second << std::endl;
+
+	while (1)
+	{
+		std::string input;
+		std::getline(std::cin, input);
+		BM<> bm(input);
+		std::string::iterator it = bm.find_bf(test_str.begin(), test_str.end());
+		it = bm.find(test_str.begin(), test_str.end());
+		it = bm.find_horspool(test_str.begin(), test_str.end());
+		it = bm.find_bm_slow(test_str.begin(), test_str.end());
+		break;
+	}
+
+	std::vector<int> a;
+	a.push_back(1);
+	a.push_back(2);
+	std::cout << a[1] << std::endl;
+}
+