Commits

Carl Friedrich Bolz committed 53b7cc7 Merge

merge

Comments (0)

Files changed (9)

pypy/interpreter/baseobjspace.py

     def newlist_str(self, list_s):
         return self.newlist([self.wrap(s) for s in list_s])
 
+    def newlist_unicode(self, list_u):
+        return self.newlist([self.wrap(u) for u in list_u])
+
     def newlist_hint(self, sizehint):
         from pypy.objspace.std.listobject import make_empty_list_with_size
         return make_empty_list_with_size(self, sizehint)

pypy/objspace/std/listobject.py

         storage = strategy.erase(list_s)
         return W_ListObject.from_storage_and_strategy(space, storage, strategy)
 
+    @staticmethod
+    def newlist_unicode(space, list_u):
+        strategy = space.fromcache(UnicodeListStrategy)
+        storage = strategy.erase(list_u)
+        return W_ListObject.from_storage_and_strategy(space, storage, strategy)
+
     def __repr__(self):
         """ representation for debugging purposes """
         return "%s(%s, %s)" % (self.__class__.__name__, self.strategy,

pypy/objspace/std/objspace.py

     def newlist_str(self, list_s):
         return W_ListObject.newlist_str(self, list_s)
 
+    def newlist_unicode(self, list_u):
+        return W_ListObject.newlist_unicode(self, list_u)
+
     def newdict(self, module=False, instance=False, kwargs=False,
                 strdict=False):
         return W_DictMultiObject.allocate_and_init_instance(

pypy/objspace/std/stringobject.py

 from pypy.objspace.std.register_all import register_all
 from pypy.objspace.std.sliceobject import W_SliceObject, normalize_simple_slice
 from pypy.objspace.std.stringtype import (
-    joined2, sliced, stringendswith, stringstartswith, wrapchar, wrapstr)
+    joined2, sliced, wrapchar, wrapstr)
 from rpython.rlib import jit
 from rpython.rlib.objectmodel import (
     compute_hash, compute_unique_id, specialize)
 from rpython.rlib.rarithmetic import ovfcheck
-from rpython.rlib.rstring import StringBuilder, split
+from rpython.rlib.rstring import (StringBuilder, split, rsplit, replace,
+    endswith, startswith)
 
 
 class W_AbstractStringObject(W_Object):
     bylen = len(by)
     if bylen == 0:
         raise OperationError(space.w_ValueError, space.wrap("empty separator"))
-
-    if bylen == 1 and maxsplit < 0:
-        res = []
-        start = 0
-        # fast path: uses str.rfind(character) and str.count(character)
-        by = by[0]    # annotator hack: string -> char
-        count = value.count(by)
-        res = [None] * (count + 1)
-        end = len(value)
-        while count >= 0:
-            assert end >= 0
-            prev = value.rfind(by, 0, end)
-            start = prev + 1
-            assert start >= 0
-            res[count] = value[start:end]
-            count -= 1
-            end = prev
-    else:
-        res = split(value, by, maxsplit)
-
+    res = split(value, by, maxsplit)
     return space.newlist_str(res)
 
 def str_rsplit__String_None_ANY(space, w_self, w_none, w_maxsplit=-1):
     maxsplit = space.int_w(w_maxsplit)
-    res_w = []
+    res = []
     value = w_self._value
     i = len(value)-1
     while True:
         # the word is value[j+1:i+1]
         j1 = j + 1
         assert j1 >= 0
-        res_w.append(sliced(space, value, j1, i+1, w_self))
+        res.append(value[j1:i+1])
 
         # continue to look from the character before the space before the word
         i = j - 1
 
-    res_w.reverse()
-    return space.newlist(res_w)
+    res.reverse()
+    return space.newlist_str(res)
 
-def make_rsplit_with_delim(funcname, sliced):
-    from rpython.tool.sourcetools import func_with_new_name
-
-    def fn(space, w_self, w_by, w_maxsplit=-1):
-        maxsplit = space.int_w(w_maxsplit)
-        res_w = []
-        value = w_self._value
-        end = len(value)
-        by = w_by._value
-        bylen = len(by)
-        if bylen == 0:
-            raise OperationError(space.w_ValueError, space.wrap("empty separator"))
-
-        while maxsplit != 0:
-            next = value.rfind(by, 0, end)
-            if next < 0:
-                break
-            res_w.append(sliced(space, value, next+bylen, end, w_self))
-            end = next
-            maxsplit -= 1   # NB. if it's already < 0, it stays < 0
-
-        res_w.append(sliced(space, value, 0, end, w_self))
-        res_w.reverse()
-        return space.newlist(res_w)
-
-    return func_with_new_name(fn, funcname)
-
-str_rsplit__String_String_ANY = make_rsplit_with_delim('str_rsplit__String_String_ANY',
-                                                       sliced)
+def str_rsplit__String_String_ANY(space, w_self, w_by, w_maxsplit=-1):
+    maxsplit = space.int_w(w_maxsplit)
+    value = w_self._value
+    by = w_by._value
+    if not by:
+        raise OperationError(space.w_ValueError, space.wrap("empty separator"))
+    return space.newlist_str(rsplit(value, by, maxsplit))
 
 def str_join__String_ANY(space, w_self, w_list):
     l = space.listview_str(w_list)
 
     return space.wrap(res)
 
-def _string_replace(space, input, sub, by, maxsplit):
-    if maxsplit == 0:
-        return space.wrap(input)
-
-    if not sub:
-        upper = len(input)
-        if maxsplit > 0 and maxsplit < upper + 2:
-            upper = maxsplit - 1
-            assert upper >= 0
-
-        try:
-            result_size = ovfcheck(upper * len(by))
-            result_size = ovfcheck(result_size + upper)
-            result_size = ovfcheck(result_size + len(by))
-            remaining_size = len(input) - upper
-            result_size = ovfcheck(result_size + remaining_size)
-        except OverflowError:
-            raise OperationError(space.w_OverflowError,
-                space.wrap("replace string is too long")
-            )
-        builder = StringBuilder(result_size)
-        for i in range(upper):
-            builder.append(by)
-            builder.append(input[i])
-        builder.append(by)
-        builder.append_slice(input, upper, len(input))
-    else:
-        # First compute the exact result size
-        count = input.count(sub)
-        if count > maxsplit and maxsplit > 0:
-            count = maxsplit
-        diff_len = len(by) - len(sub)
-        try:
-            result_size = ovfcheck(diff_len * count)
-            result_size = ovfcheck(result_size + len(input))
-        except OverflowError:
-            raise OperationError(space.w_OverflowError,
-                space.wrap("replace string is too long")
-            )
-
-        builder = StringBuilder(result_size)
-        start = 0
-        sublen = len(sub)
-
-        while maxsplit != 0:
-            next = input.find(sub, start)
-            if next < 0:
-                break
-            builder.append_slice(input, start, next)
-            builder.append(by)
-            start = next + sublen
-            maxsplit -= 1   # NB. if it's already < 0, it stays < 0
-
-        builder.append_slice(input, start, len(input))
-
-    return space.wrap(builder.build())
-
 
 def str_replace__String_ANY_ANY_ANY(space, w_self, w_sub, w_by, w_maxsplit):
-    return _string_replace(space, w_self._value, space.buffer_w(w_sub).as_str(),
-                           space.buffer_w(w_by).as_str(),
-                           space.int_w(w_maxsplit))
+    sub = space.buffer_w(w_sub).as_str()
+    by = space.buffer_w(w_by).as_str()
+    maxsplit = space.int_w(w_maxsplit)
+    try:
+        res = replace(w_self._value, sub, by, maxsplit)
+    except OverflowError:
+        raise OperationError(space.w_OverflowError,
+            space.wrap("replace string is too long")
+        )
+    return space.wrap(res)
 
 def str_replace__String_String_String_ANY(space, w_self, w_sub, w_by, w_maxsplit=-1):
-    input = w_self._value
     sub = w_sub._value
     by = w_by._value
     maxsplit = space.int_w(w_maxsplit)
-    return _string_replace(space, input, sub, by, maxsplit)
+    try:
+        res = replace(w_self._value, sub, by, maxsplit)
+    except OverflowError:
+        raise OperationError(space.w_OverflowError,
+            space.wrap("replace string is too long")
+        )
+    return space.wrap(res)
 
 def _strip(space, w_self, w_chars, left, right):
     "internal function called by str_xstrip methods"
 def str_endswith__String_String_ANY_ANY(space, w_self, w_suffix, w_start, w_end):
     (u_self, start, end) = _convert_idx_params(space, w_self, w_start,
                                                w_end, True)
-    return space.newbool(stringendswith(u_self, w_suffix._value, start, end))
+    return space.newbool(endswith(u_self, w_suffix._value, start, end))
 
 def str_endswith__String_ANY_ANY_ANY(space, w_self, w_suffixes, w_start, w_end):
     if not space.isinstance_w(w_suffixes, space.w_tuple):
             return space.call_method(w_u, "endswith", w_suffixes, w_start,
                                      w_end)
         suffix = space.str_w(w_suffix)
-        if stringendswith(u_self, suffix, start, end):
+        if endswith(u_self, suffix, start, end):
             return space.w_True
     return space.w_False
 
 def str_startswith__String_String_ANY_ANY(space, w_self, w_prefix, w_start, w_end):
     (u_self, start, end) = _convert_idx_params(space, w_self, w_start,
                                                w_end, True)
-    return space.newbool(stringstartswith(u_self, w_prefix._value, start, end))
+    return space.newbool(startswith(u_self, w_prefix._value, start, end))
 
 def str_startswith__String_ANY_ANY_ANY(space, w_self, w_prefixes, w_start, w_end):
     if not space.isinstance_w(w_prefixes, space.w_tuple):
             return space.call_method(w_u, "startswith", w_prefixes, w_start,
                                      w_end)
         prefix = space.str_w(w_prefix)
-        if stringstartswith(u_self, prefix, start, end):
+        if startswith(u_self, prefix, start, end):
             return space.w_True
     return space.w_False
 
 def str_splitlines__String_ANY(space, w_self, w_keepends):
     u_keepends = space.int_w(w_keepends)  # truth value, but type checked
     data = w_self._value
-    selflen = len(data)
-    strs_w = []
-    i = j = 0
-    while i < selflen:
-        # Find a line and append it
-        while i < selflen and data[i] != '\n' and data[i] != '\r':
-            i += 1
-        # Skip the line break reading CRLF as one line break
-        eol = i
-        i += 1
-        if i < selflen and data[i-1] == '\r' and data[i] == '\n':
-            i += 1
-        if u_keepends:
-            eol = i
-        strs_w.append(sliced(space, data, j, eol, w_self))
-        j = i
-
-    if j < selflen:
-        strs_w.append(sliced(space, data, j, len(data), w_self))
-    return space.newlist(strs_w)
+    return space.newlist_str(data.splitlines(u_keepends))
 
 def str_zfill__String_ANY(space, w_self, w_width):
     input = w_self._value

pypy/objspace/std/stringtype.py

 str_typedef.registermethods(globals())
 
 
-# ____________________________________________________________
-
-# Helpers for several string implementations
-
-@specialize.argtype(0)
-@jit.elidable
-def stringendswith(u_self, suffix, start, end):
-    begin = end - len(suffix)
-    if begin < start:
-        return False
-    for i in range(len(suffix)):
-        if u_self[begin+i] != suffix[i]:
-            return False
-    return True
-
-@specialize.argtype(0)
-@jit.elidable
-def stringstartswith(u_self, prefix, start, end):
-    stop = start + len(prefix)
-    if stop > end:
-        return False
-    for i in range(len(prefix)):
-        if u_self[start+i] != prefix[i]:
-            return False
-    return True

pypy/objspace/std/test/test_liststrategies.py

         try:
             w_l = space.call_method(w_s, "split")
             w_l2 = space.call_method(w_s, "split", space.wrap(" "))
+            w_l3 = space.call_method(w_s, "rsplit")
+            w_l4 = space.call_method(w_s, "rsplit", space.wrap(" "))
         finally:
             del space.newlist
         assert space.listview_str(w_l) == ["a", "b", "c"]
         assert space.listview_str(w_l2) == ["a", "b", "c"]
+        assert space.listview_str(w_l3) == ["a", "b", "c"]
+        assert space.listview_str(w_l4) == ["a", "b", "c"]
+
+    def test_unicode_uses_newlist_unicode(self):
+        space = self.space
+        w_u = space.wrap(u"a b c")
+        space.newlist = None
+        try:
+            w_l = space.call_method(w_u, "split")
+            w_l2 = space.call_method(w_u, "split", space.wrap(" "))
+            w_l3 = space.call_method(w_u, "rsplit")
+            w_l4 = space.call_method(w_u, "rsplit", space.wrap(" "))
+        finally:
+            del space.newlist
+        assert space.listview_unicode(w_l) == [u"a", u"b", u"c"]
+        assert space.listview_unicode(w_l2) == [u"a", u"b", u"c"]
+        assert space.listview_unicode(w_l3) == [u"a", u"b", u"c"]
+        assert space.listview_unicode(w_l4) == [u"a", u"b", u"c"]
 
     def test_pop_without_argument_is_fast(self):
         space = self.space

pypy/objspace/std/unicodeobject.py

 from pypy.objspace.std.multimethod import FailedToImplement
 from pypy.objspace.std.noneobject import W_NoneObject
 from pypy.objspace.std.sliceobject import W_SliceObject, normalize_simple_slice
-from pypy.objspace.std.stringobject import (
-    W_StringObject, make_rsplit_with_delim)
-from pypy.objspace.std.stringtype import stringendswith, stringstartswith
+from pypy.objspace.std.stringobject import W_StringObject
 from pypy.objspace.std.register_all import register_all
 from rpython.rlib import jit
 from rpython.rlib.rarithmetic import ovfcheck
 from rpython.rlib.objectmodel import (
     compute_hash, compute_unique_id, specialize)
-from rpython.rlib.rstring import UnicodeBuilder
+from rpython.rlib.rstring import (UnicodeBuilder, split, rsplit, replace,
+    startswith, endswith)
 from rpython.rlib.runicode import make_unicode_escape_function
 from rpython.tool.sourcetools import func_with_new_name
 
 def unicode_endswith__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
     self, start, end = _convert_idx_params(space, w_self,
                                                    w_start, w_end, True)
-    return space.newbool(stringendswith(self, w_substr._value, start, end))
+    return space.newbool(endswith(self, w_substr._value, start, end))
 
 def unicode_startswith__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
     self, start, end = _convert_idx_params(space, w_self, w_start, w_end, True)
     # XXX this stuff can be waaay better for ootypebased backends if
     #     we re-use more of our rpython machinery (ie implement startswith
     #     with additional parameters as rpython)
-    return space.newbool(stringstartswith(self, w_substr._value, start, end))
+    return space.newbool(startswith(self, w_substr._value, start, end))
 
 def unicode_startswith__Unicode_ANY_ANY_ANY(space, w_unistr, w_prefixes,
                                               w_start, w_end):
                                              w_start, w_end, True)
     for w_prefix in space.fixedview(w_prefixes):
         prefix = space.unicode_w(w_prefix)
-        if stringstartswith(unistr, prefix, start, end):
+        if startswith(unistr, prefix, start, end):
             return space.w_True
     return space.w_False
 
                                              w_start, w_end, True)
     for w_suffix in space.fixedview(w_suffixes):
         suffix = space.unicode_w(w_suffix)
-        if stringendswith(unistr, suffix, start, end):
+        if endswith(unistr, suffix, start, end):
             return space.w_True
     return space.w_False
 
             if (self[pos] == u'\r' and pos + 1 < end and
                 self[pos + 1] == u'\n'):
                 # Count CRLF as one linebreak
-                lines.append(W_UnicodeObject(self[start:pos + keepends * 2]))
+                lines.append(self[start:pos + keepends * 2])
                 pos += 1
             else:
-                lines.append(W_UnicodeObject(self[start:pos + keepends]))
+                lines.append(self[start:pos + keepends])
             pos += 1
             start = pos
         else:
             pos += 1
     if not unicodedb.islinebreak(ord(self[end - 1])):
-        lines.append(W_UnicodeObject(self[start:]))
-    return space.newlist(lines)
+        lines.append(self[start:])
+    return space.newlist_unicode(lines)
 
 def unicode_find__Unicode_Unicode_ANY_ANY(space, w_self, w_substr, w_start, w_end):
     self, start, end = _convert_idx_params(space, w_self, w_start, w_end)
 
 def unicode_split__Unicode_None_ANY(space, w_self, w_none, w_maxsplit):
     maxsplit = space.int_w(w_maxsplit)
-    res_w = []
+    res = []
     value = w_self._value
     length = len(value)
     i = 0
             maxsplit -= 1   # NB. if it's already < 0, it stays < 0
 
         # the word is value[i:j]
-        res_w.append(W_UnicodeObject(value[i:j]))
+        res.append(value[i:j])
 
         # continue to look from the character following the space after the word
         i = j + 1
 
-    return space.newlist(res_w)
+    return space.newlist_unicode(res)
 
 def unicode_split__Unicode_Unicode_ANY(space, w_self, w_delim, w_maxsplit):
     self = w_self._value
     if delim_len == 0:
         raise OperationError(space.w_ValueError,
                              space.wrap('empty separator'))
-    parts = _split_with(self, delim, maxsplit)
-    return space.newlist([W_UnicodeObject(part) for part in parts])
+    parts = split(self, delim, maxsplit)
+    return space.newlist_unicode(parts)
 
 
 def unicode_rsplit__Unicode_None_ANY(space, w_self, w_none, w_maxsplit):
     maxsplit = space.int_w(w_maxsplit)
-    res_w = []
+    res = []
     value = w_self._value
     i = len(value)-1
     while True:
         # the word is value[j+1:i+1]
         j1 = j + 1
         assert j1 >= 0
-        res_w.append(W_UnicodeObject(value[j1:i+1]))
+        res.append(value[j1:i+1])
 
         # continue to look from the character before the space before the word
         i = j - 1
 
-    res_w.reverse()
-    return space.newlist(res_w)
+    res.reverse()
+    return space.newlist_unicode(res)
 
-def sliced(space, s, start, stop, orig_obj):
-    assert start >= 0
-    assert stop >= 0
-    if start == 0 and stop == len(s) and space.is_w(space.type(orig_obj), space.w_unicode):
-        return orig_obj
-    return space.wrap( s[start:stop])
-
-unicode_rsplit__Unicode_Unicode_ANY = make_rsplit_with_delim('unicode_rsplit__Unicode_Unicode_ANY',
-                                                             sliced)
-
-def _split_into_chars(self, maxsplit):
-    if maxsplit == 0:
-        return [self]
-    index = 0
-    end = len(self)
-    parts = [u'']
-    maxsplit -= 1
-    while maxsplit != 0:
-        if index >= end:
-            break
-        parts.append(self[index])
-        index += 1
-        maxsplit -= 1
-    parts.append(self[index:])
-    return parts
-
-def _split_with(self, with_, maxsplit=-1):
-    parts = []
-    start = 0
-    end = len(self)
-    length = len(with_)
-    while maxsplit != 0:
-        index = self.find(with_, start, end)
-        if index < 0:
-            break
-        parts.append(self[start:index])
-        start = index + length
-        maxsplit -= 1
-    parts.append(self[start:])
-    return parts
+def unicode_rsplit__Unicode_Unicode_ANY(space, w_self, w_by, w_maxsplit=-1):
+    maxsplit = space.int_w(w_maxsplit)
+    value = w_self._value
+    by = w_by._value
+    if not by:
+        raise OperationError(space.w_ValueError, space.wrap("empty separator"))
+    return space.newlist_unicode(rsplit(value, by, maxsplit))
 
 def unicode_replace__Unicode_Unicode_Unicode_ANY(space, w_self, w_old,
                                                  w_new, w_maxsplit):
-    return _unicode_replace(space, w_self, w_old._value, w_new._value,
-                            w_maxsplit)
+    maxsplit = space.int_w(w_maxsplit)
+    try:
+        return W_UnicodeObject(
+                replace(w_self._value, w_old._value, w_new._value, maxsplit))
+    except OverflowError:
+        raise OperationError(
+            space.w_OverflowError,
+            space.wrap("replace string is too long"))
 
 def unicode_replace__Unicode_ANY_ANY_ANY(space, w_self, w_old, w_new,
                                          w_maxsplit):
         new = unicode(space.bufferstr_w(w_new))
     else:
         new = space.unicode_w(w_new)
-    return _unicode_replace(space, w_self, old, new, w_maxsplit)
-
-def _unicode_replace(space, w_self, old, new, w_maxsplit):
-    if len(old):
-        parts = _split_with(w_self._value, old, space.int_w(w_maxsplit))
-    else:
-        self = w_self._value
-        maxsplit = space.int_w(w_maxsplit)
-        parts = _split_into_chars(self, maxsplit)
-
+    maxsplit = space.int_w(w_maxsplit)
     try:
-        one = ovfcheck(len(parts) * len(new))
-        ovfcheck(one + len(w_self._value))
+        return W_UnicodeObject(replace(w_self._value, old, new, maxsplit))
     except OverflowError:
         raise OperationError(
             space.w_OverflowError,
             space.wrap("replace string is too long"))
 
-    return W_UnicodeObject(new.join(parts))
-
-
 def unicode_encode__Unicode_ANY_ANY(space, w_unistr,
                                     w_encoding=None,
                                     w_errors=None):
 def unicode_expandtabs__Unicode_ANY(space, w_self, w_tabsize):
     self = w_self._value
     tabsize  = space.int_w(w_tabsize)
-    parts = _split_with(self, u'\t')
+    parts = self.split(u'\t')
     result = [parts[0]]
     prevsize = 0
     for ch in parts[0]:

rpython/rlib/rstring.py

 """ String builder interface and string functions
 """
+import sys
 
 from rpython.annotator.model import (SomeObject, SomeString, s_None, SomeChar,
     SomeInteger, SomeUnicodeCodePoint, SomeUnicodeString, SomePtr, SomePBC)
-from rpython.rlib.objectmodel import newlist_hint
+from rpython.rlib.objectmodel import newlist_hint, specialize
 from rpython.rlib.rarithmetic import ovfcheck
 from rpython.rtyper.extregistry import ExtRegistryEntry
 from rpython.tool.pairtype import pairtype
+from rpython.rlib import jit
 
 
 # -------------- public API for string functions -----------------------
+
+@specialize.argtype(0)
 def split(value, by, maxsplit=-1):
+    if isinstance(value, str):
+        assert isinstance(by, str)
+    else:
+        assert isinstance(by, unicode)
     bylen = len(by)
     if bylen == 0:
         raise ValueError("empty separator")
 
+    start = 0
+    if bylen == 1:
+        # fast path: uses str.find(character) and str.count(character)
+        by = by[0]    # annotator hack: string -> char
+        count = value.count(by)
+        if 0 <= maxsplit < count:
+            count = maxsplit
+        res = newlist_hint(count + 1)
+        while count > 0:
+            next = value.find(by, start)
+            assert next >= 0 # cannot fail due to the value.count above
+            res.append(value[start:next])
+            start = next + bylen
+            count -= 1
+        res.append(value[start:len(value)])
+        return res
+
     if maxsplit > 0:
         res = newlist_hint(min(maxsplit + 1, len(value)))
     else:
         res = []
-    start = 0
+
     while maxsplit != 0:
         next = value.find(by, start)
         if next < 0:
     return res
 
 
+@specialize.argtype(0)
 def rsplit(value, by, maxsplit=-1):
+    if isinstance(value, str):
+        assert isinstance(by, str)
+    else:
+        assert isinstance(by, unicode)
     if maxsplit > 0:
         res = newlist_hint(min(maxsplit + 1, len(value)))
     else:
     res.reverse()
     return res
 
+
+@specialize.argtype(0)
+def replace(input, sub, by, maxsplit=-1):
+    if isinstance(input, str):
+        assert isinstance(sub, str)
+        assert isinstance(by, str)
+        Builder = StringBuilder
+    else:
+        assert isinstance(sub, unicode)
+        assert isinstance(by, unicode)
+        Builder = UnicodeBuilder
+    if maxsplit == 0:
+        return input
+
+    if not sub:
+        upper = len(input)
+        if maxsplit > 0 and maxsplit < upper + 2:
+            upper = maxsplit - 1
+            assert upper >= 0
+
+        try:
+            result_size = ovfcheck(upper * len(by))
+            result_size = ovfcheck(result_size + upper)
+            result_size = ovfcheck(result_size + len(by))
+            remaining_size = len(input) - upper
+            result_size = ovfcheck(result_size + remaining_size)
+        except OverflowError:
+            raise
+        builder = Builder(result_size)
+        for i in range(upper):
+            builder.append(by)
+            builder.append(input[i])
+        builder.append(by)
+        builder.append_slice(input, upper, len(input))
+    else:
+        # First compute the exact result size
+        count = input.count(sub)
+        if count > maxsplit and maxsplit > 0:
+            count = maxsplit
+        diff_len = len(by) - len(sub)
+        try:
+            result_size = ovfcheck(diff_len * count)
+            result_size = ovfcheck(result_size + len(input))
+        except OverflowError:
+            raise
+
+        builder = Builder(result_size)
+        start = 0
+        sublen = len(sub)
+
+        while maxsplit != 0:
+            next = input.find(sub, start)
+            if next < 0:
+                break
+            builder.append_slice(input, start, next)
+            builder.append(by)
+            start = next + sublen
+            maxsplit -= 1   # NB. if it's already < 0, it stays < 0
+
+        builder.append_slice(input, start, len(input))
+
+    return builder.build()
+
+def _normalize_start_end(length, start, end):
+    if start < 0:
+        start += length
+        if start < 0:
+            start = 0
+    if end < 0:
+        end += length
+        if end < 0:
+            end = 0
+    elif end > length:
+        end = length
+    return start, end
+
+@specialize.argtype(0)
+@jit.elidable
+def startswith(u_self, prefix, start=0, end=sys.maxint):
+    length = len(u_self)
+    start, end = _normalize_start_end(length, start, end)
+    stop = start + len(prefix)
+    if stop > end:
+        return False
+    for i in range(len(prefix)):
+        if u_self[start+i] != prefix[i]:
+            return False
+    return True
+
+@specialize.argtype(0)
+@jit.elidable
+def endswith(u_self, suffix, start=0, end=sys.maxint):
+    length = len(u_self)
+    start, end = _normalize_start_end(length, start, end)
+    begin = end - len(suffix)
+    if begin < start:
+        return False
+    for i in range(len(suffix)):
+        if u_self[begin+i] != suffix[i]:
+            return False
+    return True
+
+
 # -------------- public API ---------------------------------
 
 INIT_SIZE = 100 # XXX tweak
 
     def specialize_call(self, hop):
         hop.exception_cannot_occur()
+
+

rpython/rlib/test/test_rstring.py

 import sys, py
 
 from rpython.rlib.rstring import StringBuilder, UnicodeBuilder, split, rsplit
+from rpython.rlib.rstring import replace, startswith, endswith
+from rpython.rtyper.test.tool import BaseRtypingTest, LLRtypeMixin
 
 def test_split():
     assert split("", 'x') == ['']
     assert split('a|b|c|d', '|') == ['a', 'b', 'c', 'd']
     assert split('a|b|c|d', '|', 2) == ['a', 'b', 'c|d']
     assert split('a//b//c//d', '//') == ['a', 'b', 'c', 'd']
+    assert split('a//b//c//d', '//', 2) == ['a', 'b', 'c//d']
     assert split('endcase test', 'test') == ['endcase ', '']
     py.test.raises(ValueError, split, 'abc', '')
 
+def test_split_unicode():
+    assert split(u"", u'x') == [u'']
+    assert split(u"a", u"a", 1) == [u'', u'']
+    assert split(u" ", u" ", 1) == [u'', u'']
+    assert split(u"aa", u"a", 2) == [u'', u'', u'']
+    assert split(u'a|b|c|d', u'|') == [u'a', u'b', u'c', u'd']
+    assert split(u'a|b|c|d', u'|', 2) == [u'a', u'b', u'c|d']
+    assert split(u'a//b//c//d', u'//') == [u'a', u'b', u'c', u'd']
+    assert split(u'endcase test', u'test') == [u'endcase ', u'']
+    py.test.raises(ValueError, split, u'abc', u'')
+
 def test_rsplit():
     assert rsplit("a", "a", 1) == ['', '']
     assert rsplit(" ", " ", 1) == ['', '']
     assert rsplit('endcase test', 'test') == ['endcase ', '']
     py.test.raises(ValueError, rsplit, "abc", '')
 
+def test_rsplit_unicode():
+    assert rsplit(u"a", u"a", 1) == [u'', u'']
+    assert rsplit(u" ", u" ", 1) == [u'', u'']
+    assert rsplit(u"aa", u"a", 2) == [u'', u'', u'']
+    assert rsplit(u'a|b|c|d', u'|') == [u'a', u'b', u'c', u'd']
+    assert rsplit(u'a|b|c|d', u'|', 2) == [u'a|b', u'c', u'd']
+    assert rsplit(u'a//b//c//d', u'//') == [u'a', u'b', u'c', u'd']
+    assert rsplit(u'endcase test', u'test') == [u'endcase ', u'']
+    py.test.raises(ValueError, rsplit, u"abc", u'')
+
+def test_string_replace():
+    assert replace('one!two!three!', '!', '@', 1) == 'one@two!three!'
+    assert replace('one!two!three!', '!', '') == 'onetwothree'
+    assert replace('one!two!three!', '!', '@', 2) == 'one@two@three!'
+    assert replace('one!two!three!', '!', '@', 3) == 'one@two@three@'
+    assert replace('one!two!three!', '!', '@', 4) == 'one@two@three@'
+    assert replace('one!two!three!', '!', '@', 0) == 'one!two!three!'
+    assert replace('one!two!three!', '!', '@') == 'one@two@three@'
+    assert replace('one!two!three!', 'x', '@') == 'one!two!three!'
+    assert replace('one!two!three!', 'x', '@', 2) == 'one!two!three!'
+    assert replace('abc', '', '-') == '-a-b-c-'
+    assert replace('abc', '', '-', 3) == '-a-b-c'
+    assert replace('abc', '', '-', 0) == 'abc'
+    assert replace('', '', '') == ''
+    assert replace('', '', 'a') == 'a'
+    assert replace('abc', 'ab', '--', 0) == 'abc'
+    assert replace('abc', 'xy', '--') == 'abc'
+    assert replace('123', '123', '') == ''
+    assert replace('123123', '123', '') == ''
+    assert replace('123x123', '123', '') == 'x'
+
+def test_string_replace_overflow():
+    if sys.maxint > 2**31-1:
+        py.test.skip("Wrong platform")
+    s = "a" * (2**16)
+    with py.test.raises(OverflowError):
+        replace(s, "", s)
+    with py.test.raises(OverflowError):
+        replace(s, "a", s)
+    with py.test.raises(OverflowError):
+        replace(s, "a", s, len(s) - 10)
+
+def test_unicode_replace():
+    assert replace(u'one!two!three!', u'!', u'@', 1) == u'one@two!three!'
+    assert replace(u'one!two!three!', u'!', u'') == u'onetwothree'
+    assert replace(u'one!two!three!', u'!', u'@', 2) == u'one@two@three!'
+    assert replace(u'one!two!three!', u'!', u'@', 3) == u'one@two@three@'
+    assert replace(u'one!two!three!', u'!', u'@', 4) == u'one@two@three@'
+    assert replace(u'one!two!three!', u'!', u'@', 0) == u'one!two!three!'
+    assert replace(u'one!two!three!', u'!', u'@') == u'one@two@three@'
+    assert replace(u'one!two!three!', u'x', u'@') == u'one!two!three!'
+    assert replace(u'one!two!three!', u'x', u'@', 2) == u'one!two!three!'
+    assert replace(u'abc', u'', u'-') == u'-a-b-c-'
+    assert replace(u'abc', u'', u'-', 3) == u'-a-b-c'
+    assert replace(u'abc', u'', u'-', 0) == u'abc'
+    assert replace(u'', u'', u'') == u''
+    assert replace(u'', u'', u'a') == u'a'
+    assert replace(u'abc', u'ab', u'--', 0) == u'abc'
+    assert replace(u'abc', u'xy', u'--') == u'abc'
+    assert replace(u'123', u'123', u'') == u''
+    assert replace(u'123123', u'123', u'') == u''
+    assert replace(u'123x123', u'123', u'') == u'x'
+
+def test_unicode_replace_overflow():
+    if sys.maxint > 2**31-1:
+        py.test.skip("Wrong platform")
+    s = u"a" * (2**16)
+    with py.test.raises(OverflowError):
+        replace(s, u"", s)
+    with py.test.raises(OverflowError):
+        replace(s, u"a", s)
+    with py.test.raises(OverflowError):
+        replace(s, u"a", s, len(s) - 10)
+
+def test_startswith():
+    assert startswith('ab', 'ab') is True
+    assert startswith('ab', 'a') is True
+    assert startswith('ab', '') is True
+    assert startswith('x', 'a') is False
+    assert startswith('x', 'x') is True
+    assert startswith('', '') is True
+    assert startswith('', 'a') is False
+    assert startswith('x', 'xx') is False
+    assert startswith('y', 'xx') is False
+    assert startswith('ab', 'a', 0) is True
+    assert startswith('ab', 'a', 1) is False
+    assert startswith('ab', 'b', 1) is True
+    assert startswith('abc', 'bc', 1, 2) is False
+    assert startswith('abc', 'c', -1, 4) is True
+
+def test_endswith():
+    assert endswith('ab', 'ab') is True
+    assert endswith('ab', 'b') is True
+    assert endswith('ab', '') is True
+    assert endswith('x', 'a') is False
+    assert endswith('x', 'x') is True
+    assert endswith('', '') is True
+    assert endswith('', 'a') is False
+    assert endswith('x', 'xx') is False
+    assert endswith('y', 'xx') is False
+    assert endswith('abc', 'ab', 0, 2) is True
+    assert endswith('abc', 'bc', 1) is True
+    assert endswith('abc', 'bc', 2) is False
+    assert endswith('abc', 'b', -3, -1) is True
+
 def test_string_builder():
     s = StringBuilder()
     s.append("a")
     s.append_multiple_char(u'd', 4)
     assert s.build() == 'aabcbdddd'
     assert isinstance(s.build(), unicode)
-        
+
+
+class TestTranslates(LLRtypeMixin, BaseRtypingTest):
+    def test_split_rsplit(self):
+        def fn():
+            res = True
+            res = res and split('a//b//c//d', '//') == ['a', 'b', 'c', 'd']
+            res = res and split('a//b//c//d', '//', 2) == ['a', 'b', 'c//d']
+            res = res and split(u'a//b//c//d', u'//') == [u'a', u'b', u'c', u'd']
+            res = res and split(u'endcase test', u'test') == [u'endcase ', u'']
+            res = res and rsplit('a|b|c|d', '|', 2) == ['a|b', 'c', 'd']
+            res = res and rsplit('a//b//c//d', '//') == ['a', 'b', 'c', 'd']
+            res = res and rsplit(u'a|b|c|d', u'|') == [u'a', u'b', u'c', u'd']
+            res = res and rsplit(u'a|b|c|d', u'|', 2) == [u'a|b', u'c', u'd']
+            res = res and rsplit(u'a//b//c//d', u'//') == [u'a', u'b', u'c', u'd']
+            return res
+        res = self.interpret(fn, [])
+        assert res
+
+    def test_replace(self):
+        def fn():
+            res = True
+            res = res and replace('abc', 'ab', '--', 0) == 'abc'
+            res = res and replace('abc', 'xy', '--') == 'abc'
+            res = res and replace('abc', 'ab', '--', 0) == 'abc'
+            res = res and replace('abc', 'xy', '--') == 'abc'
+            return res
+        res = self.interpret(fn, [])
+        assert res