Commits

Joris Kluivers committed 3a24b95

Initial commit

Comments (0)

Files changed (5)

+Copyright (c) 2012, Joris Kluivers
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the author nor the
+      names of any contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+Originally based on twitter-text-java:
+https://github.com/twitter/twitter-text-java
+
+Copyright 2011 Twitter, Inc.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this work except in compliance with the License.
+   You may obtain a copy of the License below, or at:
+
+       http://www.apache.org/licenses/LICENSE-2.0
Empty file added.
+package microtext
+
+import (
+	"bytes"
+	"fmt"
+	"html"
+)
+
+func LinkToURL(entity Entity, text string, buffer *bytes.Buffer) {
+	LinkToURLMaxLen(entity, text, 0, buffer)
+}
+
+// LinkToURLMaxLen appends an HTML anchor for entity to buffer, using
+// entity.URL as the href and entity.Text as the link text.
+//
+// When maxLen is at least 1 and entity.Text holds more than maxLen
+// characters, the link text is cut to its first maxLen characters and
+// followed by "...". The cut is made on a rune boundary so a multi-byte
+// UTF-8 character is never split. A maxLen below 1 disables truncation.
+//
+// Both the href and the link text are HTML-escaped, so characters such as
+// '&', '<' or '"' in an entity cannot break out of the generated markup.
+// The text parameter is not used.
+func LinkToURLMaxLen(entity Entity, text string, maxLen int, buffer *bytes.Buffer) {
+	label, suffix := entity.Text, ""
+	if maxLen >= 1 {
+		if short, cut := truncateRunes(label, maxLen); cut {
+			label, suffix = short, "..."
+		}
+	}
+
+	// Truncate first, escape second: escaping can lengthen the text and a
+	// cut must never land inside an escape sequence such as "&amp;".
+	fmt.Fprintf(buffer, "<a href=\"%s\">%s%s</a>",
+		html.EscapeString(entity.URL), html.EscapeString(label), suffix)
+}
+
+// truncateRunes returns the first n runes of s and true when s holds more
+// than n runes; otherwise it returns s unchanged and false.
+func truncateRunes(s string, n int) (string, bool) {
+	seen := 0
+	for i := range s {
+		if seen == n {
+			return s[:i], true
+		}
+		seen++
+	}
+	return s, false
+}
+
+// AutolinkEntities returns text with each entity's span replaced by an HTML
+// anchor. It is AutolinkEntitiesMaxLen called with a maxLen of 0.
+func AutolinkEntities(text string, entities []Entity) string {
+	linked := AutolinkEntitiesMaxLen(text, entities, 0)
+	return linked
+}
+
+// AutolinkEntitiesMaxLen returns text with the span [Start, End) of each
+// entity replaced by the anchor that LinkToURLMaxLen writes for it; maxLen
+// is passed through to LinkToURLMaxLen unchanged.
+//
+// Entities are processed in slice order and Start/End are used as byte
+// offsets into text. An entity whose span is inverted, runs past the end of
+// text, or starts before the end of the previously linked entity is skipped
+// instead of causing a slice-bounds panic.
+func AutolinkEntitiesMaxLen(text string, entities []Entity, maxLen int) string {
+	var out bytes.Buffer
+	cursor := 0 // offset in text of the first byte not yet copied to out
+
+	for _, entity := range entities {
+		// cursor is never negative, so the first test also rejects a
+		// negative Start.
+		if entity.Start < cursor || entity.End < entity.Start || entity.End > len(text) {
+			continue
+		}
+
+		out.WriteString(text[cursor:entity.Start])
+		LinkToURLMaxLen(entity, text, maxLen, &out)
+		cursor = entity.End
+	}
+
+	// Copy whatever follows the last linked entity.
+	out.WriteString(text[cursor:])
+	return out.String()
+}
+
+// Autolink returns text with its URLs turned into HTML anchors. It is
+// AutolinkMaxLen called with a maxLen of 0.
+func Autolink(text string) string {
+	linked := AutolinkMaxLen(text, 0)
+	return linked
+}
+
+// AutolinkMaxLen runs Extract over text and hands the resulting entities,
+// together with maxLen, to AutolinkEntitiesMaxLen, returning its result.
+func AutolinkMaxLen(text string, maxLen int) string {
+	return AutolinkEntitiesMaxLen(text, Extract(text), maxLen)
+}
+package microtext
+
+import (
+	"regexp"
+	"log"
+	"strings"
+)
+
+// Regular-expression fragments from which VALID_URL_PATTERN is assembled.
+// Every value is a plain string; nothing in this block is compiled.
+var (
+	// LATIN_ACCENTS_CHARS is the body of a character class (ranges and single
+	// code points, without the enclosing brackets) for accented Latin letters.
+	LATIN_ACCENTS_CHARS = "\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff" + // Latin-1
+	                      "\u0100-\u024f" + // Latin Extended A and B
+	                      "\u0253\u0254\u0256\u0257\u0259\u025b\u0263\u0268\u026f\u0272\u0289\u028b" + // IPA Extensions
+	                      "\u02bb" + // Hawaiian
+	                      "\u0300-\u036f" + // Combining diacritics
+	                      "\u1e00-\u1eff" // Latin Extended Additional (mostly for Vietnamese)
+	// VALID_CHARS is a character-class body: lowercase ASCII letters, digits
+	// and LATIN_ACCENTS_CHARS. Uppercase ASCII is still matched because
+	// VALID_URL_PATTERN wraps everything in a case-insensitive group.
+	VALID_CHARS = "a-z0-9" + LATIN_ACCENTS_CHARS
+	// VALID_CHARS_SET is VALID_CHARS wrapped in brackets: one valid character.
+	VALID_CHARS_SET = "[" + VALID_CHARS + "]"
+	// VALID_SUBDOMAIN matches one label followed by a dot. The label starts
+	// and ends with a valid character; in between '-' and '_' are also allowed.
+	VALID_SUBDOMAIN = "(?:(?:" + VALID_CHARS_SET + "[" + VALID_CHARS + "\\-_]*)?" + VALID_CHARS_SET + "\\.)"
+	// VALID_DOMAIN_NAME is like VALID_SUBDOMAIN, except that '-' is the only
+	// extra character allowed inside the label ('_' is not).
+	VALID_DOMAIN_NAME = "(?:(?:" + VALID_CHARS_SET + "[" + VALID_CHARS + "\\-]*)?" + VALID_CHARS_SET + "\\.)"
+	// VALID_GTLD is an alternation of the generic top-level domains accepted.
+	VALID_GTLD = "(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)"
+	// VALID_CCTLD is an alternation of the two-letter country-code top-level
+	// domains accepted.
+	VALID_CCTLD = "(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" +
+	      "bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" +
+	      "er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" +
+	      "hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" +
+	      "lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" +
+	      "nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" +
+	      "sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" +
+	      "va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)"
+	
+	// VALID_DOMAIN is an alternation of four domain forms (see the inline
+	// comments). The alternatives are joined at the top level of the
+	// expression, so it has to be wrapped in a group before anything is
+	// appended to it; VALID_URL_PATTERN does that. It contains capturing groups.
+	VALID_DOMAIN = "(?:" +
+		VALID_SUBDOMAIN + "+" + VALID_DOMAIN_NAME + // www.twitter.com, foo.co.jp
+		"(?:" + VALID_GTLD + "|" + VALID_CCTLD + ")" +
+	")" +
+	"|(?:" +	// domain + gTLD
+		VALID_DOMAIN_NAME + VALID_GTLD +
+	")"+
+	"|(?:" + "(https?\\://)" +
+		"(?:" +
+			"(?:" + VALID_DOMAIN_NAME + "(" + VALID_GTLD + "|" + VALID_CCTLD + "))" + // protocol + domain + ccTLD
+			"|(?:" +
+				VALID_CHARS_SET + "+\\." + // protocol + domain + tld
+				"(?:" + VALID_GTLD + "|" + VALID_CCTLD + ")" +
+			")" +
+		")" +
+	")" + 
+	"|(" + VALID_DOMAIN_NAME + VALID_CCTLD + "(/))" // domain + ccTLD + "/"
+	
+	// VALID_PORT_NUMBER matches one or more ASCII digits.
+	VALID_PORT_NUMBER = "[0-9]+"
+	
+	// VALID_GENERAL_PATH_CHARS is a character class for a single character
+	// allowed in a URL path.
+	VALID_GENERAL_PATH_CHARS = "[a-z0-9!\\*';:=\\+,.\\$/%#\\[\\]\\-_~\\|&" + "]"
+	// BALANCED_PARENS matches "(" + one or more path characters + ")".
+	BALANCED_PARENS = "\\(" + VALID_GENERAL_PATH_CHARS + "+\\)"
+	
+	// VALID_PATH matches either: any number of path characters, then any
+	// number of BALANCED_PARENS groups each followed by optional path
+	// characters, then one final path character; or one or more path
+	// characters followed by "/".
+	VALID_PATH = "(?:" +
+		"(?:" +
+			VALID_GENERAL_PATH_CHARS + "*" +
+			"(" + BALANCED_PARENS + VALID_GENERAL_PATH_CHARS + "*)*" +
+			VALID_GENERAL_PATH_CHARS +
+		")|(?:" + VALID_GENERAL_PATH_CHARS + "+/)" +
+	")"
+	
+	// VALID_URL_QUERY_CHARS is a character class for a single character
+	// allowed inside a query string.
+	VALID_URL_QUERY_CHARS = "[a-z0-9!?\\*'\\(\\);:&=\\+\\$/%#\\[\\]\\-_\\.,~\\|]"
+	// VALID_URL_QUERY_ENDING_CHARS is the narrower character class that the
+	// last character of a query string must belong to.
+	VALID_URL_QUERY_ENDING_CHARS = "[a-z0-9_&=#/]"
+	
+	// VALID_URL_PATTERN is the complete, case-insensitive URL expression:
+	// an optional http(s) scheme, a VALID_DOMAIN, an optional ":port", an
+	// optional path starting with "/", and an optional query starting with
+	// "?" and ending in a VALID_URL_QUERY_ENDING_CHARS character.
+	VALID_URL_PATTERN = 
+	"(?i:" +
+		"(https?\\://)?" +
+		"(" + VALID_DOMAIN + ")" +
+		"(?::(" + VALID_PORT_NUMBER + "))?" + 
+		"(/" +
+			VALID_PATH + "*" + 
+		")?" +
+		"(\\?" + VALID_URL_QUERY_CHARS + "*" +
+			VALID_URL_QUERY_ENDING_CHARS +
+		")?" +
+	")"
+)
+
+// Entity describes one span of a text that is to be rendered as a link.
+type Entity struct {
+	// Type names the kind of entity; Extract sets it to "url".
+	Type string
+	// Text is the link text; Extract sets it to the matched substring.
+	Text string
+	// URL is the link target; Extract sets it to the matched substring,
+	// prefixed with "http://" when it finds no http/https scheme on it.
+	URL string
+	// Start is the byte offset in the source text where the entity begins.
+	Start int
+	// End is the byte offset just past the end of the entity.
+	End int
+}
+
+func Extract(text string) []Entity {
+	log.Printf("Pattern: %s\n", VALID_URL_PATTERN)
+
+	reg := regexp.MustCompile(VALID_URL_PATTERN)
+
+	indices := reg.FindAllStringIndex(text, -1)
+	debug := reg.FindAllStringSubmatch(text, -1)
+	
+	log.Printf("%v", debug)
+	
+	log.Printf("Number of indices found: %d", len(indices))
+	
+	if len(indices) < 1 {
+		return nil
+	}
+	
+	entities := make([]Entity, len(indices))
+	
+	for i:=0; i<len(indices); i++ {
+		entities[i] = Entity{
+			Type: "url",
+			Start: indices[i][0], 
+			End: indices[i][1],
+			Text: text[indices[i][0]: indices[i][1]],
+			URL: text[indices[i][0]: indices[i][1]],
+		}
+		
+		hasPrefix := strings.HasPrefix(entities[i].URL, "http://") || strings.HasPrefix(entities[i].URL, "https://")
+		if !hasPrefix {
+			entities[i].URL = "http://" + entities[i].URL
+		}
+	}
+
+	return entities
+}