Commits

Ross Light committed 75e2b45

Clean up internals of catalog/search package

* Move text-processing functions into text.go
* Add foldString function
* Change isTokenizeRune to !isTokenSep
* Rename TOKEN to TERM in the query grammar
* Rename stripNonToken to stripTokenSep
* Refactor tokenize into runeFieldsFunc (so string-based callers can use strings.FieldsFunc; see the sketch below)
* Add license info for runeFieldsFunc, which is borrowed from the Go standard library
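
The string-caller point is easiest to show in isolation. A minimal sketch, re-declaring the separator predicate rather than importing it (catalog/search is internal, so the names below mirror the diff; they are not imports):

    package main

    import (
        "fmt"
        "strings"
        "unicode"
    )

    // isTokenSep mirrors the predicate added in catalog/search/text.go:
    // a token separator is any rune that is not a letter or a number.
    func isTokenSep(r rune) bool {
        return !unicode.IsOneOf([]*unicode.RangeTable{unicode.Letter, unicode.Number}, r)
    }

    func main() {
        // With the predicate inverted (isTokenSep instead of isTokenizeRune),
        // string-based callers can use the standard library directly.
        fmt.Println(strings.FieldsFunc("1. 2. 3. -4.", isTokenSep)) // [1 2 3 4]
    }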

Files changed (9)

 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+## Go
+
+Portions of GLaDOS are copied from the Go standard library.
+
+Copyright (c) 2012 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 ## Bootstrap
 
 GLaDOS uses [Bootstrap](http://twitter.github.com/bootstrap/), which is released under the

catalog/search/lex.go

 const (
 	invalidItem itemKind = iota
 	eofItem
-	tokenItem
+	termItem
 	orItem
 	notItem
 	tagItem
 	}
 
 	l.backup()
-	return lexToken
+	return lexTerm
 }
 
-func lexToken(l *queryLexer) stateFn {
+func lexTerm(l *queryLexer) stateFn {
 	for {
 		r := l.next()
 		if r == eof || unicode.IsSpace(r) || r == '(' || r == ')' {
 		l.pos = l.start + len(tagPrefix)
 		l.emit(tagItem)
 		l.pos = end
-		l.emit(tokenItem)
+		l.emit(termItem)
 	} else if l.input[l.start:l.pos] == orOperator {
 		l.emit(orItem)
 	} else {
-		l.emit(tokenItem)
+		l.emit(termItem)
 	}
 	return lexDefault
 }
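
For context, lexDefault and lexTerm follow the state-function lexer pattern: each state consumes input and returns the next state, and a small loop drives the machine. A stripped-down skeleton of that control flow (the names here are illustrative, not the package's):

    package main

    import "fmt"

    // stateFn is a lexer state; it does some work and returns the next state.
    type stateFn func(*lexer) stateFn

    type lexer struct {
        input string
        pos   int
    }

    // run drives the machine until a state returns nil.
    func (l *lexer) run(start stateFn) {
        for state := start; state != nil; {
            state = state(l)
        }
    }

    func lexAll(l *lexer) stateFn {
        if l.pos >= len(l.input) {
            return nil // end of input halts the machine
        }
        fmt.Printf("%c", l.input[l.pos])
        l.pos++
        return lexAll
    }

    func main() {
        (&lexer{input: "hello"}).run(lexAll) // prints "hello"
    }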

catalog/search/lex_test.go

 	}{
 		{"", []item{{eofItem, ""}}},
 		{" ", []item{{eofItem, ""}}},
-		{"hello", []item{{tokenItem, "hello"}, {eofItem, ""}}},
-		{"tag:hello", []item{{tagItem, "tag:"}, {tokenItem, "hello"}, {eofItem, ""}}},
-		{"-hello", []item{{notItem, "-"}, {tokenItem, "hello"}, {eofItem, ""}}},
-		{"hello world", []item{{tokenItem, "hello"}, {tokenItem, "world"}, {eofItem, ""}}},
-		{"hello OR world", []item{{tokenItem, "hello"}, {orItem, "OR"}, {tokenItem, "world"}, {eofItem, ""}}},
+		{"hello", []item{{termItem, "hello"}, {eofItem, ""}}},
+		{"tag:hello", []item{{tagItem, "tag:"}, {termItem, "hello"}, {eofItem, ""}}},
+		{"-hello", []item{{notItem, "-"}, {termItem, "hello"}, {eofItem, ""}}},
+		{"hello world", []item{{termItem, "hello"}, {termItem, "world"}, {eofItem, ""}}},
+		{"hello OR world", []item{{termItem, "hello"}, {orItem, "OR"}, {termItem, "world"}, {eofItem, ""}}},
 	}
 	for _, test := range tests {
 		items := lexQuery(test.Query)

catalog/search/query.go

 	pairs := make([]int, 0)
 	tokenStart := -1
 	for i, r := range runes {
-		if !isTokenizeRune(r) {
+		if isTokenSep(r) {
 			if tokenStart >= 0 {
 				tok := runes[tokenStart:i]
 				for _, term := range foldedTerms {
 
 func extractTerms(ast queryAST) []string {
 	switch ast := ast.(type) {
-	case token:
+	case term:
 		return []string{string(ast)}
 	case queryAnd:
 		var terms []string
 	default:
 		p.backup()
 		return nil
-	case tokenItem:
-		return token(item.value)
+	case termItem:
+		return term(item.value)
 	case tagItem:
 		item = p.next()
-		if item.kind != tokenItem {
+		if item.kind != termItem {
 			return nil
 		}
 		return tagAtom(item.value)
 	return "-(" + q.ast.String() + ")"
 }
 
-type token string
+type term string
 
-func (token) isQueryAST() {}
+func (term) isQueryAST() {}
 
-func (t token) String() string {
+func (t term) String() string {
 	return string(t)
 }
 
-func (t token) GoString() string {
-	return "search.token(" + strconv.Quote(string(t)) + ")"
+func (t term) GoString() string {
+	return "search.term(" + strconv.Quote(string(t)) + ")"
 }
 

catalog/search/query_test.go

 		Expect queryAST
 	}{
 		{"", nil},
-		{"hello", token("hello")},
+		{"hello", term("hello")},
 		{"tag:hello", tagAtom("hello")},
-		{"-hello", queryNot{token("hello")}},
-		{"hello world", queryAnd{token("hello"), token("world")}},
-		{"hello OR world", queryOr{token("hello"), token("world")}},
+		{"-hello", queryNot{term("hello")}},
+		{"hello world", queryAnd{term("hello"), term("world")}},
+		{"hello OR world", queryOr{term("hello"), term("world")}},
 	}
 	for _, test := range tests {
 		ast, err := parseQuery(test.Query)

catalog/search/search.go

 import (
 	"log"
 	"sort"
-	"unicode"
 
 	"bitbucket.org/zombiezen/glados/catalog"
 )
 		ts.searchOr(q, results)
 	case queryNot:
 		ts.searchNot(q, results)
-	case token:
+	case term:
 		ts.searchToken(q, results)
 	case tagAtom:
 		ts.searchTagAtom(q, results)
 }
 
 func (ts *textSearch) searchAnd(q queryAnd, results resultMap) {
-	// Because AND is an intersection, start with the first term and filter
+	// Because AND is an intersection, start with the first subquery and filter
 	// the set as we go.  If the number of results is ever zero, we're finished.
 
 	if len(q) == 0 {
 		return
 	}
 
-	// First term
+	// First subquery
 	ts.search(q[0], results)
 	if len(results) == 0 {
 		return
 		r.Relevance /= float32(len(q))
 	}
 
-	// Subsequent terms
+	// Subsequent subqueries
 	m := make(resultMap)
 	for _, subq := range q[1:] {
 		ts.search(subq, m)
 	}
 }
 
-func (ts *textSearch) searchToken(q token, results resultMap) {
+func (ts *textSearch) searchToken(q term, results resultMap) {
 	qr := fold([]rune(string(q)))
 	for _, ent := range ts.i[string(qr)] {
 		results.Get(ent.shortName).Relevance += ent.kind.Weight()
 	}
-	stripped := stripNonToken(qr)
+	stripped := stripTokenSep(qr)
 	if len(stripped) == len(qr) {
 		return
 	}
 }
 
 func (ts *textSearch) searchTagAtom(q tagAtom, results resultMap) {
-	for _, sn := range ts.tags[string(fold([]rune(string(q))))] {
+	for _, sn := range ts.tags[foldString(string(q))] {
 		results.Put(&Result{ShortName: sn, Relevance: 1.0})
 	}
 }
 
-func stripNonToken(r []rune) []rune {
-	// TODO(light): this function should have a better name
-
-	for i := len(r) - 1; i >= 0; i-- {
-		if !isTokenizeRune(r[i]) {
-			copy(r[i:], r[i+1:])
-			r = r[:len(r)-1]
-		}
-	}
-	return r
-}
-
 func (ts *textSearch) build(sn string) error {
 	p, err := ts.c.GetProject(sn)
 	if err != nil {
 	}
 }
 
-// fold changes every rune in s to its least equivalent folded case, according
-// to unicode.SimpleFold and returns s.
-func fold(s []rune) []rune {
-	for i, r := range s {
-		switch {
-		case r >= 'a' && r <= 'z':
-			// the only characters in ASCII that need folding are lowercase
-			s[i] = r - 'a' + 'A'
-		case r < 128:
-			// do nothing
-		default:
-			rr := unicode.SimpleFold(r)
-			for rr > r {
-				rr = unicode.SimpleFold(rr)
-			}
-			s[i] = rr
-		}
-	}
-	return s
-}
-
-// tokenize splits a slice of runes s around each instance of one or more
-// consecutive non-alphanumeric characters, returning an array of folded
-// substrings or an empty list if s contains only non-alphanumerics.
-func tokenize(s []rune) [][]rune {
-	// borrowed from strings.FieldsFunc in standard library
-
-	n := 0
-	inField := false
-	for _, rune := range s {
-		wasInField := inField
-		inField = isTokenizeRune(rune)
-		if inField && !wasInField {
-			n++
-		}
-	}
-
-	a := make([][]rune, n)
-	na := 0
-	fieldStart := -1
-	for i, rune := range s {
-		if !isTokenizeRune(rune) {
-			if fieldStart >= 0 {
-				a[na] = s[fieldStart:i]
-				na++
-				fieldStart = -1
-			}
-		} else if fieldStart == -1 {
-			fieldStart = i
-		}
-	}
-	if fieldStart >= 0 {
-		a[na] = s[fieldStart:]
-	}
-
-	return a
-}
-
-var tokenizeRanges = []*unicode.RangeTable{unicode.Letter, unicode.Number}
-
-func isTokenizeRune(r rune) bool {
-	return unicode.IsOneOf(tokenizeRanges, r)
-}
-
 type indexEntry struct {
 	shortName string
 	kind      entryKind
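
The comment in searchAnd above describes an intersection with early exit and relevance averaging. A self-contained sketch of just that shape; resultSet is a hypothetical stand-in for the package's resultMap, and each subquery is reduced to a precomputed result set:

    package main

    import "fmt"

    type resultSet map[string]float32 // short name -> relevance

    // intersect mimics searchAnd: seed with the first subquery's results,
    // then drop anything a later subquery does not also match. Relevance
    // is averaged by dividing each contribution by the subquery count.
    func intersect(subqueries []resultSet) resultSet {
        if len(subqueries) == 0 {
            return nil
        }
        results := make(resultSet)
        for sn, rel := range subqueries[0] {
            results[sn] = rel / float32(len(subqueries))
        }
        for _, m := range subqueries[1:] {
            for sn := range results {
                rel, ok := m[sn]
                if !ok {
                    delete(results, sn) // not matched here: out of the intersection
                    continue
                }
                results[sn] += rel / float32(len(subqueries))
            }
            if len(results) == 0 {
                return results // empty intersection: finished early
            }
        }
        return results
    }

    func main() {
        a := resultSet{"glados": 1, "portal": 0.5}
        b := resultSet{"glados": 0.8}
        fmt.Println(intersect([]resultSet{a, b})) // map[glados:0.9]
    }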

catalog/search/search_test.go

 import (
 	"strconv"
 	"testing"
-	"unicode"
 
 	"bitbucket.org/zombiezen/glados/catalog"
 )
 	searchBenchmark(b, "PYTHON OR GO OR PYTHON OR GO OR PYTHON")
 }
 
-func TestFold(t *testing.T) {
-	const asciiPunctuation = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~\x7f"
-	tests := []struct {
-		S      string
-		Folded string
-	}{
-		{"", ""},
-		{"abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZ"},
-		{"ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ"},
-		{asciiPunctuation, asciiPunctuation},
-	}
-	for _, test := range tests {
-		f := string(fold([]rune(test.S)))
-		if f != test.Folded {
-			t.Errorf("fold(%q) = %q; want %q", test.S, f, test.Folded)
-		}
-	}
-}
-
-func BenchmarkFoldASCII(b *testing.B) {
-	const n = 65536
-	b.StopTimer()
-	r := make([]rune, 0, n)
-	for i := 0; i < n; i++ {
-		r = append(r, rune(i%128))
-	}
-	buf := make([]rune, len(r))
-	b.StartTimer()
-
-	for i := 0; i < b.N; i++ {
-		copy(buf, r)
-		fold(buf)
-	}
-	b.SetBytes(n)
-}
-
-func BenchmarkFold(b *testing.B) {
-	const n = 65536
-	b.StopTimer()
-	r := make([]rune, 0, n)
-	for i := 0; i < n; i++ {
-		r = append(r, rune(i))
-	}
-	buf := make([]rune, len(r))
-	b.StartTimer()
-
-	for i := 0; i < b.N; i++ {
-		copy(buf, r)
-		fold(buf)
-	}
-	b.SetBytes(n)
-}
-
-func TestStripNonToken(t *testing.T) {
-	tests := []struct {
-		term string
-		s    string
-	}{
-		{"", ""},
-		{"A", "A"},
-		{"a", "a"},
-		{"a.", "a"},
-	}
-	for _, test := range tests {
-		s := string(stripNonToken([]rune(test.term)))
-		if s != test.s {
-			t.Errorf("stripNonToken(%q) = %q; want %q", test.term, s, test.s)
-		}
-	}
-}
-
-func TestTokenize(t *testing.T) {
-	tests := []struct {
-		s string
-		a []string
-	}{
-		{"", []string{}},
-		{" ", []string{}},
-		{" \t ", []string{}},
-		{" abc ", []string{"abc"}},
-		{"1 2 3 4", []string{"1", "2", "3", "4"}},
-		{"1. 2. 3. -4.", []string{"1", "2", "3", "4"}},
-	}
-	for _, test := range tests {
-		a := makeStringArray(tokenize([]rune(test.s)))
-		if !strarrEq(a, test.a) {
-			t.Errorf("tokenize(%q) = %v; want %v", test.s, a, test.a)
-		}
-	}
-}
-
-func makeStringArray(a [][]rune) []string {
-	s := make([]string, len(a))
-	for i := range a {
-		s[i] = string(a[i])
-	}
-	return s
-}
-
-func strarrEq(a, b []string) bool {
-	if len(a) != len(b) {
-		return false
-	}
-	for i := range a {
-		if a[i] != b[i] {
-			return false
-		}
-	}
-	return true
-}
-
-func TestIsTokenizeRune(t *testing.T) {
-	allowed := []*unicode.RangeTable{unicode.Letter, unicode.Number}
-	for r := rune(0); r <= unicode.MaxRune; r++ {
-		if mine, actual := isTokenizeRune(r), unicode.IsOneOf(allowed, r); mine != actual {
-			t.Errorf("isTokenizeRune(%q) = %t; want %t", r, mine, actual)
-		}
-	}
-}
-
-func BenchmarkIsTokenizeRune(b *testing.B) {
-	for i := 0; i < b.N; i++ {
-		isTokenizeRune(0x10ffff)
-	}
-}
-
 type mockCatalog map[string]*catalog.Project
 
 func (mc mockCatalog) List() ([]string, error) {

catalog/search/text.go

+package search
+
+import (
+	"strings"
+	"unicode"
+)
+
+// fold changes every rune in s to its least equivalent folded case, according
+// to unicode.SimpleFold and returns s.
+func fold(s []rune) []rune {
+	for i, r := range s {
+		switch {
+		case r >= 'a' && r <= 'z':
+			// the only characters in ASCII that need folding are lowercase
+			s[i] = r - 'a' + 'A'
+		case r < 128:
+			// do nothing
+		default:
+			rr := unicode.SimpleFold(r)
+			for rr > r {
+				rr = unicode.SimpleFold(rr)
+			}
+			s[i] = rr
+		}
+	}
+	return s
+}
+
+// foldString copies s into a new rune slice, folds it, and returns the
+// result as a string.
+func foldString(s string) string {
+	return string(fold([]rune(s)))
+}
+
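+// tokenize splits s around runs of token-separator runes, returning
+// subslices of s.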
+func tokenize(s []rune) [][]rune {
+	return runeFieldsFunc(s, isTokenSep)
+}
+
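+// tokenizeString is the string counterpart of tokenize, built on
+// strings.FieldsFunc.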
+func tokenizeString(s string) []string {
+	return strings.FieldsFunc(s, isTokenSep)
+}
+
+var tokenizeRanges = []*unicode.RangeTable{unicode.Letter, unicode.Number}
+
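+// isTokenSep reports whether r separates tokens: any rune that is not a
+// letter or a number.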
+func isTokenSep(r rune) bool {
+	return !unicode.IsOneOf(tokenizeRanges, r)
+}
+
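+// stripTokenSep removes every token-separator rune from r in place and
+// returns the shortened slice.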
+func stripTokenSep(r []rune) []rune {
+	for i := len(r) - 1; i >= 0; i-- {
+		if isTokenSep(r[i]) {
+			copy(r[i:], r[i+1:])
+			r = r[:len(r)-1]
+		}
+	}
+	return r
+}
+
+// runeFieldsFunc splits the rune slice s at each run of Unicode code points c
+// satisfying f(c) and returns an array of subslices of s. If all code points
+// in s satisfy f(c), or s is empty, an empty slice is returned.
+func runeFieldsFunc(s []rune, f func(rune) bool) [][]rune {
+	// borrowed from strings.FieldsFunc in standard library
+
+	n := 0
+	inField := false
+	for _, r := range s {
+		wasInField := inField
+		inField = !f(r)
+		if inField && !wasInField {
+			n++
+		}
+	}
+
+	a := make([][]rune, n)
+	na := 0
+	fieldStart := -1
+	for i, r := range s {
+		if f(r) {
+			if fieldStart >= 0 {
+				a[na] = s[fieldStart:i]
+				na++
+				fieldStart = -1
+			}
+		} else if fieldStart == -1 {
+			fieldStart = i
+		}
+	}
+	if fieldStart >= 0 {
+		a[na] = s[fieldStart:]
+	}
+
+	return a
+}
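
The "least equivalent folded case" in fold's doc comment means each rune maps to the smallest member of its unicode.SimpleFold orbit. A standalone illustration of the same per-rune rule (fold's ASCII cases are just fast paths for this):

    package main

    import (
        "fmt"
        "unicode"
    )

    // leastFold repeats fold's general branch: follow the SimpleFold orbit
    // until it wraps around, which lands on the orbit's smallest rune.
    func leastFold(r rune) rune {
        rr := unicode.SimpleFold(r)
        for rr > r {
            rr = unicode.SimpleFold(rr)
        }
        return rr
    }

    func main() {
        // The three Greek sigmas share one fold orbit, so all index identically.
        for _, r := range "Σσς" {
            fmt.Printf("%c -> %c\n", r, leastFold(r)) // each maps to Σ
        }
    }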

catalog/search/text_test.go

+package search
+
+import (
+	"testing"
+	"unicode"
+)
+
+func TestFold(t *testing.T) {
+	const asciiPunctuation = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~\x7f"
+	tests := []struct {
+		s string
+		f string
+	}{
+		{"", ""},
+		{"abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZ"},
+		{"ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ"},
+		{asciiPunctuation, asciiPunctuation},
+	}
+	for _, test := range tests {
+		f := foldString(test.s)
+		if f != test.f {
+			t.Errorf("foldString(%q) = %q; want %q", test.s, f, test.f)
+		}
+	}
+}
+
+func BenchmarkFoldASCII(b *testing.B) {
+	const n = 65536
+	b.StopTimer()
+	r := make([]rune, 0, n)
+	for i := 0; i < n; i++ {
+		r = append(r, rune(i%128))
+	}
+	buf := make([]rune, len(r))
+	b.StartTimer()
+
+	for i := 0; i < b.N; i++ {
+		copy(buf, r)
+		fold(buf)
+	}
+	b.SetBytes(n)
+}
+
+func BenchmarkFold(b *testing.B) {
+	const n = 65536
+	b.StopTimer()
+	r := make([]rune, 0, n)
+	for i := 0; i < n; i++ {
+		r = append(r, rune(i))
+	}
+	buf := make([]rune, len(r))
+	b.StartTimer()
+
+	for i := 0; i < b.N; i++ {
+		copy(buf, r)
+		fold(buf)
+	}
+	b.SetBytes(n)
+}
+
+func TestTokenize(t *testing.T) {
+	tests := []struct {
+		s string
+		a []string
+	}{
+		{"", []string{}},
+		{" ", []string{}},
+		{" \t ", []string{}},
+		{" abc ", []string{"abc"}},
+		{"1 2 3 4", []string{"1", "2", "3", "4"}},
+		{"1. 2. 3. -4.", []string{"1", "2", "3", "4"}},
+	}
+	for _, test := range tests {
+		a := makeStringArray(tokenize([]rune(test.s)))
+		if !strarrEq(a, test.a) {
+			t.Errorf("tokenize(%q) = %v; want %v", test.s, a, test.a)
+		}
+	}
+}
+
+func makeStringArray(a [][]rune) []string {
+	s := make([]string, len(a))
+	for i := range a {
+		s[i] = string(a[i])
+	}
+	return s
+}
+
+func strarrEq(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
+
+func TestIsTokenSep(t *testing.T) {
+	allowed := []*unicode.RangeTable{unicode.Letter, unicode.Number}
+	for r := rune(0); r <= unicode.MaxRune; r++ {
+		if mine, actual := isTokenSep(r), !unicode.IsOneOf(allowed, r); mine != actual {
+			t.Errorf("isTokenSep(%q) = %t; want %t", r, mine, actual)
+		}
+	}
+}
+
+func BenchmarkIsTokenSep(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		isTokenSep(0x10ffff)
+	}
+}
+
+func TestStripTokenSep(t *testing.T) {
+	tests := []struct {
+		r string
+		s string
+	}{
+		{"", ""},
+		{"A", "A"},
+		{"a", "a"},
+		{"a.", "a"},
+	}
+	for _, test := range tests {
+		s := string(stripTokenSep([]rune(test.r)))
+		if s != test.s {
+			t.Errorf("stripTokenSep(%q) = %q; want %q", test.r, s, test.s)
+		}
+	}
+}