Bryan O'Sullivan avatar Bryan O'Sullivan committed ef82d79

Implement support for case folding.

Comments (0)

Files changed (8)

     , reverse
 
     -- * Case conversion
+    -- $case
+    , toCaseFold
+    , toLower
     , toUpper
-    , toLower
 
     -- * Folds
     , foldl
 reverse t = S.reverse (stream t)
 {-# INLINE reverse #-}
 
--- | /O(n)/ Convert a string to upper case, using simple case
--- conversion.  The result string may be longer than the input string.
--- For instance, the German eszett (U+00DF) maps to the two-letter
--- sequence SS.
-toUpper :: Text -> Text
-toUpper t = unstream (S.toUpper (stream t))
-{-# INLINE toUpper #-}
+-- ----------------------------------------------------------------------------
+-- ** Case conversions (folds)
+
+-- $case
+--
+-- With Unicode text, it is incorrect to use combinators like @map
+-- toUpper@ to case convert each character of a string individually.
+-- Instead, use the whole-string case conversion functions from this
+-- module.  For correctness in different writing systems, these
+-- functions may map one input character to two or three output
+-- characters.
+
+-- | /O(n)/ Convert a string to folded case.  This function is mainly
+-- useful for performing caseless (or case insensitive) string
+-- comparisons.
+--
+-- A string @x@ is a caseless match for a string @y@ if and only if:
+--
+-- @toCaseFold x == toCaseFold y@
+--
+-- The result string may be longer than the input string, and may
+-- differ from applying 'toLower' to the input string.  For instance,
+-- the Armenian small ligature men now (U+FB13) is case folded to the
+-- bigram men now (U+0574 U+0576), while the micro sign (U+00B5) is
+-- case folded to the Greek small letter mu (U+03BC) instead of
+-- itself.
+toCaseFold :: Text -> Text
+toCaseFold t = unstream (S.toCaseFold (stream t))
+{-# INLINE [0] toCaseFold #-}
 
 -- | /O(n)/ Convert a string to lower case, using simple case
 -- conversion.  The result string may be longer than the input string.
 toLower t = unstream (S.toLower (stream t))
 {-# INLINE toLower #-}
 
+-- | /O(n)/ Convert a string to upper case, using simple case
+-- conversion.  The result string may be longer than the input string.
+-- For instance, the German eszett (U+00DF) maps to the two-letter
+-- sequence SS.
+--
+-- Because one input character can expand to several output
+-- characters, the result cannot in general be obtained by mapping a
+-- @Char -> Char@ function over the string.
+toUpper :: Text -> Text
+toUpper t = unstream (S.toUpper (stream t))
+{-# INLINE toUpper #-}
+
 -- | /O(n)/ The 'transpose' function transposes the rows and columns
 -- of its 'Text' argument.  Note that this function uses 'pack',
 -- 'unpack', and the list version of transpose, and is thus not very

Data/Text/Fusion/CaseMapping.hs

 -- LATIN CAPITAL LETTER I WITH DOT ABOVE
 lowerMapping '\x0130' s = Yield '\x0069' (s :!: '\x0307' :!: '\x0000')
 lowerMapping c s = Yield (toLower c) (s :!: '\0' :!: '\0')
+foldMapping :: forall s. Char -> s -> Step (PairS (PairS s Char) Char) Char
+{-# INLINE foldMapping #-}
+-- MICRO SIGN
+foldMapping '\x00b5' s = Yield '\x03bc' (s :!: '\x0000' :!: '\x0000')
+-- LATIN SMALL LETTER SHARP S
+foldMapping '\x00df' s = Yield '\x0073' (s :!: '\x0073' :!: '\x0000')
+-- LATIN CAPITAL LETTER I WITH DOT ABOVE
+foldMapping '\x0130' s = Yield '\x0069' (s :!: '\x0307' :!: '\x0000')
+-- LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+foldMapping '\x0149' s = Yield '\x02bc' (s :!: '\x006e' :!: '\x0000')
+-- LATIN SMALL LETTER LONG S
+foldMapping '\x017f' s = Yield '\x0073' (s :!: '\x0000' :!: '\x0000')
+-- LATIN SMALL LETTER J WITH CARON
+foldMapping '\x01f0' s = Yield '\x006a' (s :!: '\x030c' :!: '\x0000')
+-- COMBINING GREEK YPOGEGRAMMENI
+foldMapping '\x0345' s = Yield '\x03b9' (s :!: '\x0000' :!: '\x0000')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+foldMapping '\x0390' s = Yield '\x03b9' (s :!: '\x0308' :!: '\x0301')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+foldMapping '\x03b0' s = Yield '\x03c5' (s :!: '\x0308' :!: '\x0301')
+-- GREEK SMALL LETTER FINAL SIGMA
+foldMapping '\x03c2' s = Yield '\x03c3' (s :!: '\x0000' :!: '\x0000')
+-- GREEK BETA SYMBOL
+foldMapping '\x03d0' s = Yield '\x03b2' (s :!: '\x0000' :!: '\x0000')
+-- GREEK THETA SYMBOL
+foldMapping '\x03d1' s = Yield '\x03b8' (s :!: '\x0000' :!: '\x0000')
+-- GREEK PHI SYMBOL
+foldMapping '\x03d5' s = Yield '\x03c6' (s :!: '\x0000' :!: '\x0000')
+-- GREEK PI SYMBOL
+foldMapping '\x03d6' s = Yield '\x03c0' (s :!: '\x0000' :!: '\x0000')
+-- GREEK KAPPA SYMBOL
+foldMapping '\x03f0' s = Yield '\x03ba' (s :!: '\x0000' :!: '\x0000')
+-- GREEK RHO SYMBOL
+foldMapping '\x03f1' s = Yield '\x03c1' (s :!: '\x0000' :!: '\x0000')
+-- GREEK LUNATE EPSILON SYMBOL
+foldMapping '\x03f5' s = Yield '\x03b5' (s :!: '\x0000' :!: '\x0000')
+-- ARMENIAN SMALL LIGATURE ECH YIWN
+foldMapping '\x0587' s = Yield '\x0565' (s :!: '\x0582' :!: '\x0000')
+-- LATIN SMALL LETTER H WITH LINE BELOW
+foldMapping '\x1e96' s = Yield '\x0068' (s :!: '\x0331' :!: '\x0000')
+-- LATIN SMALL LETTER T WITH DIAERESIS
+foldMapping '\x1e97' s = Yield '\x0074' (s :!: '\x0308' :!: '\x0000')
+-- LATIN SMALL LETTER W WITH RING ABOVE
+foldMapping '\x1e98' s = Yield '\x0077' (s :!: '\x030a' :!: '\x0000')
+-- LATIN SMALL LETTER Y WITH RING ABOVE
+foldMapping '\x1e99' s = Yield '\x0079' (s :!: '\x030a' :!: '\x0000')
+-- LATIN SMALL LETTER A WITH RIGHT HALF RING
+foldMapping '\x1e9a' s = Yield '\x0061' (s :!: '\x02be' :!: '\x0000')
+-- LATIN SMALL LETTER LONG S WITH DOT ABOVE
+foldMapping '\x1e9b' s = Yield '\x1e61' (s :!: '\x0000' :!: '\x0000')
+-- LATIN CAPITAL LETTER SHARP S
+foldMapping '\x1e9e' s = Yield '\x0073' (s :!: '\x0073' :!: '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH PSILI
+foldMapping '\x1f50' s = Yield '\x03c5' (s :!: '\x0313' :!: '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
+foldMapping '\x1f52' s = Yield '\x03c5' (s :!: '\x0313' :!: '\x0300')
+-- GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
+foldMapping '\x1f54' s = Yield '\x03c5' (s :!: '\x0313' :!: '\x0301')
+-- GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
+foldMapping '\x1f56' s = Yield '\x03c5' (s :!: '\x0313' :!: '\x0342')
+-- GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
+foldMapping '\x1f80' s = Yield '\x1f00' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
+foldMapping '\x1f81' s = Yield '\x1f01' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+foldMapping '\x1f82' s = Yield '\x1f02' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+foldMapping '\x1f83' s = Yield '\x1f03' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+foldMapping '\x1f84' s = Yield '\x1f04' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+foldMapping '\x1f85' s = Yield '\x1f05' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1f86' s = Yield '\x1f06' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1f87' s = Yield '\x1f07' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
+foldMapping '\x1f88' s = Yield '\x1f00' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
+foldMapping '\x1f89' s = Yield '\x1f01' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+foldMapping '\x1f8a' s = Yield '\x1f02' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+foldMapping '\x1f8b' s = Yield '\x1f03' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+foldMapping '\x1f8c' s = Yield '\x1f04' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+foldMapping '\x1f8d' s = Yield '\x1f05' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+foldMapping '\x1f8e' s = Yield '\x1f06' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+foldMapping '\x1f8f' s = Yield '\x1f07' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
+foldMapping '\x1f90' s = Yield '\x1f20' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
+foldMapping '\x1f91' s = Yield '\x1f21' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+foldMapping '\x1f92' s = Yield '\x1f22' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+foldMapping '\x1f93' s = Yield '\x1f23' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+foldMapping '\x1f94' s = Yield '\x1f24' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+foldMapping '\x1f95' s = Yield '\x1f25' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1f96' s = Yield '\x1f26' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1f97' s = Yield '\x1f27' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
+foldMapping '\x1f98' s = Yield '\x1f20' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
+foldMapping '\x1f99' s = Yield '\x1f21' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+foldMapping '\x1f9a' s = Yield '\x1f22' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+foldMapping '\x1f9b' s = Yield '\x1f23' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+foldMapping '\x1f9c' s = Yield '\x1f24' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+foldMapping '\x1f9d' s = Yield '\x1f25' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+foldMapping '\x1f9e' s = Yield '\x1f26' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+foldMapping '\x1f9f' s = Yield '\x1f27' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
+foldMapping '\x1fa0' s = Yield '\x1f60' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
+foldMapping '\x1fa1' s = Yield '\x1f61' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+foldMapping '\x1fa2' s = Yield '\x1f62' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+foldMapping '\x1fa3' s = Yield '\x1f63' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+foldMapping '\x1fa4' s = Yield '\x1f64' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+foldMapping '\x1fa5' s = Yield '\x1f65' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1fa6' s = Yield '\x1f66' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1fa7' s = Yield '\x1f67' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
+foldMapping '\x1fa8' s = Yield '\x1f60' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
+foldMapping '\x1fa9' s = Yield '\x1f61' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+foldMapping '\x1faa' s = Yield '\x1f62' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+foldMapping '\x1fab' s = Yield '\x1f63' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+foldMapping '\x1fac' s = Yield '\x1f64' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+foldMapping '\x1fad' s = Yield '\x1f65' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+foldMapping '\x1fae' s = Yield '\x1f66' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+foldMapping '\x1faf' s = Yield '\x1f67' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
+foldMapping '\x1fb2' s = Yield '\x1f70' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
+foldMapping '\x1fb3' s = Yield '\x03b1' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
+foldMapping '\x1fb4' s = Yield '\x03ac' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PERISPOMENI
+foldMapping '\x1fb6' s = Yield '\x03b1' (s :!: '\x0342' :!: '\x0000')
+-- GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1fb7' s = Yield '\x03b1' (s :!: '\x0342' :!: '\x03b9')
+-- GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+foldMapping '\x1fbc' s = Yield '\x03b1' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK PROSGEGRAMMENI
+foldMapping '\x1fbe' s = Yield '\x03b9' (s :!: '\x0000' :!: '\x0000')
+-- GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
+foldMapping '\x1fc2' s = Yield '\x1f74' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
+foldMapping '\x1fc3' s = Yield '\x03b7' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
+foldMapping '\x1fc4' s = Yield '\x03ae' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER ETA WITH PERISPOMENI
+foldMapping '\x1fc6' s = Yield '\x03b7' (s :!: '\x0342' :!: '\x0000')
+-- GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1fc7' s = Yield '\x03b7' (s :!: '\x0342' :!: '\x03b9')
+-- GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+foldMapping '\x1fcc' s = Yield '\x03b7' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
+foldMapping '\x1fd2' s = Yield '\x03b9' (s :!: '\x0308' :!: '\x0300')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+foldMapping '\x1fd3' s = Yield '\x03b9' (s :!: '\x0308' :!: '\x0301')
+-- GREEK SMALL LETTER IOTA WITH PERISPOMENI
+foldMapping '\x1fd6' s = Yield '\x03b9' (s :!: '\x0342' :!: '\x0000')
+-- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
+foldMapping '\x1fd7' s = Yield '\x03b9' (s :!: '\x0308' :!: '\x0342')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
+foldMapping '\x1fe2' s = Yield '\x03c5' (s :!: '\x0308' :!: '\x0300')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+foldMapping '\x1fe3' s = Yield '\x03c5' (s :!: '\x0308' :!: '\x0301')
+-- GREEK SMALL LETTER RHO WITH PSILI
+foldMapping '\x1fe4' s = Yield '\x03c1' (s :!: '\x0313' :!: '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH PERISPOMENI
+foldMapping '\x1fe6' s = Yield '\x03c5' (s :!: '\x0342' :!: '\x0000')
+-- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
+foldMapping '\x1fe7' s = Yield '\x03c5' (s :!: '\x0308' :!: '\x0342')
+-- GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
+foldMapping '\x1ff2' s = Yield '\x1f7c' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
+foldMapping '\x1ff3' s = Yield '\x03c9' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
+foldMapping '\x1ff4' s = Yield '\x03ce' (s :!: '\x03b9' :!: '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PERISPOMENI
+foldMapping '\x1ff6' s = Yield '\x03c9' (s :!: '\x0342' :!: '\x0000')
+-- GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
+foldMapping '\x1ff7' s = Yield '\x03c9' (s :!: '\x0342' :!: '\x03b9')
+-- GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+foldMapping '\x1ffc' s = Yield '\x03c9' (s :!: '\x03b9' :!: '\x0000')
+-- LATIN SMALL LIGATURE FF
+foldMapping '\xfb00' s = Yield '\x0066' (s :!: '\x0066' :!: '\x0000')
+-- LATIN SMALL LIGATURE FI
+foldMapping '\xfb01' s = Yield '\x0066' (s :!: '\x0069' :!: '\x0000')
+-- LATIN SMALL LIGATURE FL
+foldMapping '\xfb02' s = Yield '\x0066' (s :!: '\x006c' :!: '\x0000')
+-- LATIN SMALL LIGATURE FFI
+foldMapping '\xfb03' s = Yield '\x0066' (s :!: '\x0066' :!: '\x0069')
+-- LATIN SMALL LIGATURE FFL
+foldMapping '\xfb04' s = Yield '\x0066' (s :!: '\x0066' :!: '\x006c')
+-- LATIN SMALL LIGATURE LONG S T
+foldMapping '\xfb05' s = Yield '\x0073' (s :!: '\x0074' :!: '\x0000')
+-- LATIN SMALL LIGATURE ST
+foldMapping '\xfb06' s = Yield '\x0073' (s :!: '\x0074' :!: '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN NOW
+foldMapping '\xfb13' s = Yield '\x0574' (s :!: '\x0576' :!: '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN ECH
+foldMapping '\xfb14' s = Yield '\x0574' (s :!: '\x0565' :!: '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN INI
+foldMapping '\xfb15' s = Yield '\x0574' (s :!: '\x056b' :!: '\x0000')
+-- ARMENIAN SMALL LIGATURE VEW NOW
+foldMapping '\xfb16' s = Yield '\x057e' (s :!: '\x0576' :!: '\x0000')
+-- ARMENIAN SMALL LIGATURE MEN XEH
+foldMapping '\xfb17' s = Yield '\x0574' (s :!: '\x056d' :!: '\x0000')
+foldMapping c s = Yield (toLower c) (s :!: '\0' :!: '\0')

Data/Text/Fusion/Common.hs

     , intersperse
 
     -- ** Case conversion
+    -- $case
+    , toCaseFold
+    , toLower
     , toUpper
-    , toLower
 
     -- * Folds
     , foldl
 import qualified Data.List as L
 import qualified Prelude as P
 import Data.Text.Fusion.Internal
-import Data.Text.Fusion.CaseMapping (lowerMapping, upperMapping)
+import Data.Text.Fusion.CaseMapping (foldMapping, lowerMapping, upperMapping)
 
 singleton :: Char -> Stream Char
 singleton c = Stream next False 1 -- HINT maybe too low
 -- ----------------------------------------------------------------------------
 -- ** Case conversions (folds)
 
+-- $case
+--
+-- With Unicode text, it is incorrect to use combinators like @map
+-- toUpper@ to case convert each character of a string individually.
+-- Instead, use the whole-string case conversion functions from this
+-- module.  For correctness in different writing systems, these
+-- functions may map one input character to two or three output
+-- characters.
+
 caseConvert :: (forall s. Char -> s -> Step (PairS (PairS s Char) Char) Char)
             -> Stream Char -> Stream Char
 caseConvert remap (Stream next0 s0 len) = Stream next (s0 :!: '\0' :!: '\0') len
           Yield c s' -> remap c s'
     next (s :!: a :!: b) = Yield a (s :!: b :!: '\0')
 
+-- | /O(n)/ Convert a string to folded case.  This function is mainly
+-- useful for performing caseless (or case insensitive) string
+-- comparisons.
+--
+-- A string @x@ is a caseless match for a string @y@ if and only if:
+--
+-- @toCaseFold x == toCaseFold y@
+--
+-- The result string may be longer than the input string, and may
+-- differ from applying 'toLower' to the input string.  For instance,
+-- the Armenian small ligature men now (U+FB13) is case folded to the
+-- bigram men now (U+0574 U+0576), while the micro sign (U+00B5) is
+-- case folded to the Greek small letter mu (U+03BC) instead of
+-- itself.
+toCaseFold :: Stream Char -> Stream Char
+toCaseFold = caseConvert foldMapping
+{-# INLINE [0] toCaseFold #-}
+
 -- | /O(n)/ Convert a string to upper case, using simple case
 -- conversion.  The result string may be longer than the input string.
 -- For instance, the German eszett (U+00DF) maps to the two-letter

Data/Text/Lazy.hs

     , reverse
 
     -- ** Case conversion
+    -- $case
+    , toCaseFold
+    , toLower
     , toUpper
-    , toLower
 
     -- * Folds
     , foldl
   where rev a Empty        = a
         rev a (Chunk t ts) = rev (Chunk (T.reverse t) a) ts
 
--- | /O(n)/ Convert a string to upper case, using simple case
--- conversion.  The result string may be longer than the input string.
--- For instance, the German eszett (U+00DF) maps to the two-letter
--- sequence SS.
-toUpper :: Text -> Text
-toUpper t = unstream (S.toUpper (stream t))
-{-# INLINE toUpper #-}
+-- ----------------------------------------------------------------------------
+-- ** Case conversions (folds)
+
+-- $case
+--
+-- With Unicode text, it is incorrect to use combinators like @map
+-- toUpper@ to case convert each character of a string individually.
+-- Instead, use the whole-string case conversion functions from this
+-- module.  For correctness in different writing systems, these
+-- functions may map one input character to two or three output
+-- characters.
+
+-- | /O(n)/ Convert a string to folded case.  This function is mainly
+-- useful for performing caseless (or case insensitive) string
+-- comparisons.
+--
+-- A string @x@ is a caseless match for a string @y@ if and only if:
+--
+-- @toCaseFold x == toCaseFold y@
+--
+-- The result string may be longer than the input string, and may
+-- differ from applying 'toLower' to the input string.  For instance,
+-- the Armenian small ligature men now (U+FB13) is case folded to the
+-- bigram men now (U+0574 U+0576), while the micro sign (U+00B5) is
+-- case folded to the Greek small letter mu (U+03BC) instead of
+-- itself.
+toCaseFold :: Text -> Text
+toCaseFold t = unstream (S.toCaseFold (stream t))
+{-# INLINE [0] toCaseFold #-}
 
 -- | /O(n)/ Convert a string to lower case, using simple case
 -- conversion.  The result string may be longer than the input string.
 toLower t = unstream (S.toLower (stream t))
 {-# INLINE toLower #-}
 
+-- | /O(n)/ Convert a string to upper case, using simple case
+-- conversion.  The result string may be longer than the input string.
+-- For instance, the German eszett (U+00DF) maps to the two-letter
+-- sequence SS.
+--
+-- Because one input character can expand to several output
+-- characters, the result cannot in general be obtained by mapping a
+-- @Char -> Char@ function over the string.
+toUpper :: Text -> Text
+toUpper t = unstream (S.toUpper (stream t))
+{-# INLINE toUpper #-}
+
 -- | /O(n)/ 'foldl', applied to a binary operator, a starting value
 -- (typically the left-identity of the operator), and a 'Text',
 -- reduces the 'Text' using the binary operator, from left to right.

scripts/Arsec.hs

+module Arsec
+    (
+      comment
+    , semi
+    , showC
+    , unichar
+    , unichars
+    , module Control.Applicative
+    , module Control.Monad
+    , module Data.Char
+    , module Text.ParserCombinators.Parsec.Char
+    , module Text.ParserCombinators.Parsec.Combinator
+    , module Text.ParserCombinators.Parsec.Error
+    , module Text.ParserCombinators.Parsec.Prim
+    ) where
+
+import Control.Monad
+import Control.Applicative
+import Data.Char
+import Numeric
+import Text.ParserCombinators.Parsec.Char hiding (lower, upper)
+import Text.ParserCombinators.Parsec.Combinator hiding (optional)
+import Text.ParserCombinators.Parsec.Error
+import Text.ParserCombinators.Parsec.Prim hiding ((<|>), many)
+
+-- Applicative interface for Parsec's 'GenParser', defined in terms of
+-- its existing Monad operations.
+-- NOTE(review): presumably the Parsec version these scripts target
+-- does not ship these instances itself -- confirm before upgrading
+-- Parsec, as a version that does would make them duplicates.
+instance Applicative (GenParser s a) where
+    pure = return
+    (<*>) = ap
+    
+-- Alternative interface for 'GenParser', defined in terms of its
+-- MonadPlus operations ('mzero' for failure, 'mplus' for choice).
+instance Alternative (GenParser s a) where
+    empty = mzero
+    (<|>) = mplus
+
+-- | Parse a code point written as one or more hexadecimal digits and
+-- return the corresponding character.
+--
+-- 'many1' guarantees that at least one hex digit was consumed, so
+-- 'readHex' always produces a result and the 'head' cannot fail.
+-- Note that 'chr' still raises an error for a value above U+10FFFF.
+unichar :: Parser Char
+unichar = chr . fst . head . readHex <$> many1 hexDigit
+
+-- | Parse zero or more code points (see 'unichar'), each optionally
+-- followed by whitespace, up to and including the terminating
+-- semicolon consumed by 'semi'.
+unichars :: Parser [Char]
+unichars = manyTill (unichar <* spaces) semi
+
+-- | Consume a field-separating @;@ together with any whitespace that
+-- follows it.
+semi :: Parser ()
+semi = char ';' *> spaces *> pure ()
+
+-- | Parse one ignorable line: either a @#@ comment running to the end
+-- of the line, or a bare newline.
+--
+-- For a comment the result is its text, without the leading @#@ and
+-- without the newline; for a bare newline the result is @\"\\n\"@.
+comment :: Parser String
+comment = (char '#' *> manyTill anyToken (char '\n')) <|> string "\n"
+
+-- | Render a character as the source text of a Haskell character
+-- literal using a hexadecimal escape, so that U+0041 is rendered as
+-- the six characters @\'\\x0041\'@.
+--
+-- The hex digits are left-padded with zeros to four places.  Code
+-- points needing more than four digits are emitted unpadded, because
+-- 'replicate' with a negative count yields the empty list.
+showC :: Char -> String
+showC c = "'\\x" ++ d ++ "'"
+    where h = showHex (ord c) ""
+          d = replicate (4 - length h) '0' ++ h

scripts/CaseFolding.hs

+-- This script processes the following source file:
+--
+--   http://unicode.org/Public/UNIDATA/CaseFolding.txt
+
+module CaseFolding
+    (
+      Fold(..)
+    , parseCF
+    , mapCF
+    ) where
+
+import Arsec
+
+-- | One parsed entry (line) of @CaseFolding.txt@.
+data Fold = Fold {
+      code :: Char
+    -- ^ The character being folded (first field of the line).
+    , status :: Char
+    -- ^ The status field; the parser accepts one of @C@, @F@, @S@ or @T@.
+    , mapping :: [Char]
+    -- ^ The characters that 'code' folds to (third field of the line).
+    , name :: String
+    -- ^ The text after @\"# \"@ up to the end of the line, used as
+    -- the comment above each generated clause.
+    } deriving (Eq, Ord, Show)
+
+-- | Parse a whole @CaseFolding.txt@ file: skip any leading comment or
+-- blank lines, then read entries, each of which may be followed by
+-- further comment or blank lines.
+--
+-- An entry has the form @code; status; mapping; # name@, terminated by
+-- a newline.
+entries :: Parser [Fold]
+entries = many comment *> many (entry <* many comment)
+  where
+    entry = Fold <$> unichar <* semi
+                 <*> oneOf "CFST" <* semi
+                 <*> unichars
+                 <*> (string "# " *> manyTill anyToken (char '\n'))
+
+-- | Read the file at the given path and parse it as case folding
+-- data.  The path is also passed to 'parse' as the source name used
+-- in error messages.
+parseCF :: FilePath -> IO (Either ParseError [Fold])
+parseCF name = parse entries name <$> readFile name
+
+-- | Generate the Haskell source lines that define @foldMapping@.
+--
+-- The output consists of the type signature and @INLINE@ pragma, one
+-- commented clause per selected entry, and a final catch-all clause
+-- that falls back to 'toLower'.
+--
+-- An entry is selected when its status is @C@ or @F@ and its mapping
+-- is not simply @[toLower code]@, since the catch-all clause already
+-- handles that case.
+mapCF :: [Fold] -> [String]
+mapCF ms = typ ++ (map nice . filter p $ ms) ++ [last]
+  where
+    typ = ["foldMapping :: forall s. Char -> s -> Step (PairS (PairS s Char) Char) Char"
+           ,"{-# INLINE foldMapping #-}"]
+    last = "foldMapping c s = Yield (toLower c) (s :!: '\\0' :!: '\\0')"
+    nice c = "-- " ++ name c ++ "\n" ++
+             "foldMapping " ++ showC (code c) ++ " s = Yield " ++ x ++ " (s :!: " ++ y ++ " :!: " ++ z ++ ")"
+       -- Every clause carries exactly three characters: shorter
+       -- mappings are padded with '\0', and anything beyond the third
+       -- character is dropped by 'take'.
+       where [x,y,z] = (map showC . take 3) (mapping c ++ repeat '\0')
+    p f = status f `elem` "CF" &&
+          mapping f /= [toLower (code f)]

scripts/CaseMapping.hs

+import System.Environment
+import System.IO
+
+import Arsec
+import CaseFolding
+import SpecialCasing
+
+-- Regenerate the case mapping tables module.
+--
+-- Usage: CaseMapping [OUTPUT]
+--
+-- With no argument the module is written to
+-- ../Data/Text/Fusion/CaseMapping.hs.  The inputs SpecialCasing.txt
+-- and CaseFolding.txt are read from the current directory.
+--
+-- Any usage or parse error aborts the program with an IO error before
+-- the output file is opened, so a failed run never leaves a truncated
+-- module behind.
+main :: IO ()
+main = do
+  args <- getArgs
+  oname <- case args of
+             []  -> return "../Data/Text/Fusion/CaseMapping.hs"
+             [o] -> return o
+             _   -> ioError (userError "usage: CaseMapping [OUTPUT]")
+  -- Parse both inputs up front; opening the output in WriteMode
+  -- truncates it, so that must not happen until the data is known good.
+  scs <- parseSC "SpecialCasing.txt" >>= either (ioError . userError . show) return
+  cfs <- parseCF "CaseFolding.txt" >>= either (ioError . userError . show) return
+  -- withFile closes the handle even if a write fails part-way through.
+  withFile oname WriteMode $ \h -> do
+    mapM_ (hPutStrLn h) ["{-# LANGUAGE Rank2Types #-}"
+                        ,"-- AUTOMATICALLY GENERATED - DO NOT EDIT"
+                        ,"-- Generated by scripts/CaseMapping.hs"
+                        ,"module Data.Text.Fusion.CaseMapping where"
+                        ,"import Data.Char"
+                        ,"import Data.Text.Fusion.Internal"
+                        ,""]
+    mapM_ (hPutStrLn h) (mapSC "upper" upper toUpper scs)
+    mapM_ (hPutStrLn h) (mapSC "lower" lower toLower scs)
+    mapM_ (hPutStrLn h) (mapCF cfs)

scripts/SpecialCasing.hs

-import Control.Applicative
-import Control.Monad
-import Data.Char
-import Numeric
-import System.Environment
-import System.IO
-import Text.ParserCombinators.Parsec hiding (many, optional, upper, lower, (<|>))
-import Text.ParserCombinators.Parsec.Combinator
+-- This script processes the following source file:
+--
+--   http://unicode.org/Public/UNIDATA/SpecialCasing.txt
 
-instance Applicative (GenParser s a) where
-    pure = return
-    (<*>) = ap
-    
-instance Alternative (GenParser s a) where
-    empty = mzero
-    (<|>) = mplus
+module SpecialCasing
+    (
+      Case(..)
+    , parseSC
+    , mapSC
+    ) where
+
+import Arsec
 
 data Case = Case {
       code :: Char
                  <*> unichars
                  <*> manyTill anyToken (string "# ")
                  <*> manyTill anyToken (char '\n')
-    unichar = chr . fst . head . readHex <$> replicateM 4 (satisfy isHexDigit)
-    unichars = manyTill (unichar <* spaces) semi
-    semi = char ';' *> spaces *> pure ()
-    comment = (char '#' *> manyTill anyToken (char '\n')) <|> string "\n"
 
-parseFile :: FilePath -> IO (Either ParseError [Case])
-parseFile name = parse entries name <$> readFile name
+-- | Read the file at the given path and parse it as special casing
+-- data.  The path is also passed to 'parse' as the source name used
+-- in error messages.
+parseSC :: FilePath -> IO (Either ParseError [Case])
+parseSC name = parse entries name <$> readFile name
 
-mapFunc which access twiddle ms = typ ++ (map nice . filter p $ ms) ++ [last]
+mapSC :: String -> (Case -> String) -> (Char -> Char) -> [Case] -> [String]
+mapSC which access twiddle ms = typ ++ (map nice . filter p $ ms) ++ [last]
   where
     typ = [which ++ "Mapping :: forall s. Char -> s -> Step (PairS (PairS s Char) Char) Char"
            ,"{-# INLINE " ++ which ++ "Mapping #-}"]
     p c = [k] /= a && a /= [twiddle k] && null (conditions c)
         where a = access c
               k = code c
-    showC c = "'\\x" ++ d ++ "'"
-        where h = showHex (ord c) ""
-              d = replicate (4 - length h) '0' ++ h
 
 ucFirst (c:cs) = toUpper c : cs
 ucFirst [] = []
-
-main = do
-  args <- getArgs
-  let (iname, oname) = case args of
-                         [] -> ("SpecialCasing.txt", "CaseMapping.hs")
-                         [i] -> (i, "CaseMapping.hs")
-                         [i,o] -> (i,o)
-  p <- parseFile iname
-  ms <- case p of
-          Left err -> print err >> return undefined
-          Right ms -> return ms
-  h <- openFile oname WriteMode
-  mapM_ (hPutStrLn h) ["{-# LANGUAGE Rank2Types #-}"
-                      ,"-- AUTOMATICALLY GENERATED - DO NOT EDIT"
-                      ,"-- Generated by scripts/SpecialCasing.hs"
-                      ,"module Data.Text.Fusion.CaseMapping where"
-                      ,"import Data.Char"
-                      ,"import Data.Text.Fusion.Internal"
-                      ,""]
-  mapM_ (hPutStrLn h) (mapFunc "upper" upper toUpper ms)
-  mapM_ (hPutStrLn h) (mapFunc "lower" lower toLower ms)
-  hClose h
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.