phone2word / phone2word.hs

{-# LANGUAGE BangPatterns #-}
{-

Attempt to convert numbers like 01523 568304 into words/numbers, using the map
used on most phone keypads.

-}

import Char (isDigit, toUpper)
import Control.Monad (guard)
import Data.List (partition, nub, sort, foldl')
import Maybe (catMaybes, isNothing, isJust)
import Prelude hiding (sum)
import System.Environment (getArgs)

import qualified Data.ByteString.Char8 as B
import qualified Data.Map as Map

--
-- Definition of digits <-> letters
--

letters = [ ('0', "")
          , ('1', "")
          , ('2', "ABC")
          , ('3', "DEF")
          , ('4', "GHI")
          , ('5', "JKL")
          , ('6', "MNO")
          , ('7', "PQRS")
          , ('8', "TUV")
          , ('9', "WXYZ")
          ]

numbersMap :: Map.Map Char Char
numbersMap = Map.fromList [ (l,c)  | (c,word) <- letters, l <- word]

getNumber :: Char -> Maybe Char
getNumber char =
    if isDigit char
    then Just char
    else Map.lookup (toUpper char) numbersMap

--
-- Dictionary storage
--

{-

We convert all our words to digit form, and store in a simple Data.Map,
which proves to be sufficiently faster (and only about 70% slower
than a custom tree datastructure).
-}

type WordTree = Map.Map B.ByteString [B.ByteString]

--
-- Building the tree
--

buildWordTree :: [B.ByteString] -> WordTree
buildWordTree words =
    let l = catMaybes $ do
          w <- words
          let digits = convWord w
          return $ case digits of
                     Nothing -> Nothing
                     Just ds -> Just (ds,[w])
    in Map.fromAscListWith (\a1 a2 -> addWordToList (head a1) a2) (sort l)

-- Convert word to digit form if possible
convWord :: B.ByteString -> Maybe B.ByteString
convWord word = let conv = bMap getNumber word
                in if any isNothing conv
                   then Nothing
                   else Just $ B.pack $ catMaybes conv

bMap :: (Char -> a) -> B.ByteString -> [a]
bMap f b = if B.null b
           then []
           else f (B.head b):bMap f (B.tail b)

addWordToList w ws =
    -- eliminate duplicates
    if toUpperBS w `elem` map toUpperBS ws
    then ws
    else w:ws

toUpperBS = B.map toUpper

--
-- Search the tree
--

-- | Find words that correspond exactly to the digits supplied
findWords :: B.ByteString -- ^ the string of digits
          -> WordTree
          -> [B.ByteString]
findWords chars tree = case Map.lookup chars tree of
                         Just x -> x
                         Nothing -> []

--
-- Search for combos
--

{-

Model for understanding the problem:

The string of n digits is a river n steps wide.  Words that can be found in the
digits are like logs which are placed in the river at certain positions away
from the bank.

  0 7 6 6 3 9 6 7 3 7
+---------------------+
|   S O M E           |
|           W O R D   |
|           W O R D S |
|       M E W         |
|       N E W         |
+---------------------+

The aim is to get from one side of the river to the other by stepping on the
logs, and the full solution will find all possible routes, using all the logs.

If there are gaps which stop us crossing, or stop certain logs being used, we
can add strings of the original digits as logs.  For example, in the above case,
we need to add four 'digit logs' to allow all the word logs to be used:

  0 7 6 6 3 9 6 7 3 7
+---------------------+
|   S O M E           |
|           W O R D   |
|           W O R D S |
|       M E W         |
|       N E W         |
| 0                   |
|                   7 |
| 0 7 6               |
|             6 7 3 7 |
+---------------------+

This allows us to generate:

 0-some-words
 0-some-word-7
 076-new-6737
 076-mew-6737

So the algorithm is:

 - find all the 'word logs'
   - necessarily an O(n^2) algorithm, assuming we can check a
     string of digits in constant time
 - add in all necessary 'digit logs'
   - depends on the number and spacing of word logs
 - find all routes across the river
   - potentially O(k^n) solutions, it depends on input.

Note, in finding all routes across the river, we must never use more than one
'digit log' consecutively.  This restriction is needed because the algorithm for
adding 'digit logs' can mean we get digit logs that can be laid end to end.

-}

-- Info about a single interesting 'combination' is stored as a list of
-- ByteStrings, so we can display with dashes easily.
type Combo = [B.ByteString]

showCombo :: Combo -> B.ByteString
showCombo = B.intercalate (B.pack "-")

substring :: Int -> Int -> B.ByteString -> B.ByteString
substring start len word = B.take len $ B.drop start word

hasWord :: B.ByteString -> Bool
hasWord = any (not . isDigit) . B.unpack

-- WordMap - stores the info about words that can be found
-- at each (position,length).
type WordMap = [((Int,Int),[B.ByteString])]

findCombos :: B.ByteString -> WordTree -> [Combo]
findCombos digits tree =
    let wordMap = makeWordMap digits tree
        allMap = addDigitWords digits wordMap
    in findRoutes allMap

   where
     makeWordMap :: B.ByteString -> WordTree -> WordMap
     makeWordMap digits tree =
         do
           pos <- [0..(totalLength - 1)] -- starting position
           len <- [1..totalLength - pos] -- length
           let words = findWords (substring pos len digits) tree
           guard (not $ null words)
           return ((pos, len), words)

     addDigitWords :: B.ByteString -> WordMap -> WordMap
     addDigitWords digits wordMap =
     {-

     Adding the 'digit words'

      - for each 'end of word' (including the left bank):

        - find all the words that appear to the right - W
        - note the shortest of the nearest of these words - n
        - for each word w in W
          - if the start of w comes before the end of n, then add a digit word to fill
            in the interval up to w

      - in doing the above, ensure that duplicate digit words are not added.
      -}
         let wordPos = nub $ map fst $ wordMap
             endPositions = 0 : findEndPositions wordPos
         in (++) wordMap $ nub $ do
           pos <- endPositions
           let rightWords = wordsToRightOf pos wordPos
           let (n_pos,n_len) = head $ sort rightWords -- shortest of nearest words
           (w_pos,w_len) <- rightWords
           -- don't add 'digit word' if start position of current word
           -- is outside the range covered by (n_pos,n_len):
           guard (w_pos < n_pos + n_len)
           let gapWidth = w_pos - pos
           -- if we have a gap of zero, there is nothing to add:
           guard (gapWidth > 0)
           let digitWord = substring pos gapWidth digits
           return ((pos, gapWidth), [digitWord])

       where
         wordsToRightOf pos wordPos = [ (p,len) | (p,len) <- wordPos, p >= pos ]
                                      ++
                                      -- need to make sure there a word which represents the
                                      -- 'right bank', so that we can bridge a gap from last
                                      -- word to the end of the string if needed.
                                      [rightBank]
         findEndPositions wordPos = nub [ pos + len | (pos,len) <- wordPos ]
         rightBank = (totalLength, 1)

     findRoutes :: WordMap -> [Combo]
     findRoutes wordMap =
         let combos = findRoutes' wordMap 0 totalLength
         in
           -- it's possible for boring entries to slip through in the case
           -- where there are no words found at all
           case combos of
             [[seg]] | not $ hasWord seg -> []
             _ -> combos

     findRoutes' :: WordMap -> Int -> Int -> [Combo]
     findRoutes' words start end =
         do
           -- find words that start as position 'start':
           ((p,len),ws) <- filter (\((p,_),_) -> p == start) words
           w <- ws
           if p + len == end
              then return [w] -- last segment
              else do
                c <- findRoutes' words (start + len) end
                -- don't add multiple 'digit words':
                guard (hasWord w || (hasWord $ head c))
                return (w:c)

     --
     -- Shared definitions between different parts of algorithm
     --

     totalLength = B.length digits

--
-- Main
--

sum = foldl' (+) 0

getWords :: String -> IO [B.ByteString]
getWords dictFile = B.readFile dictFile >>= (return . B.lines)

countNodes t = Map.size t

countWords t = sum $ map length $ Map.elems t

main = do
  dictFile:_ <- getArgs
  ws <- getWords dictFile
  let wordTree = buildWordTree ws
  putStr "Words: "
  print (countWords wordTree)
  putStr "Nodes: "
  print (countNodes wordTree)
  mainLoop wordTree

mainLoop wordTree = do
  putStrLn "Enter phone number: "
  number' <- getLine
  putStrLn "Matches: "
  let number = filter isDigit number'
  let combos = findCombos (B.pack number) wordTree
  case length combos of
    0 -> putStrLn " [None]"
    _ -> mapM_ B.putStrLn $ map showCombo combos
  putStrLn ""

  mainLoop wordTree
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.