phone2word / phone2word.hs

{-# LANGUAGE BangPatterns #-}
{-
Attempt to convert numbers like 01523 568304 into
words/numbers, using the map used on most mobile
phone keypads.

-}

import Char (isDigit, toUpper)
import Control.Monad (guard)
import Data.List (partition, nub, sort, foldl')
import Maybe (fromJust)
import Prelude hiding (sum)
import System.Environment (getArgs)

import qualified Data.ByteString.Char8 as B
import qualified Data.Map as Map

--
-- Definitions
--

letters = [ ('0', "")
          , ('1', "")
          , ('2', "ABC")
          , ('3', "DEF")
          , ('4', "GHI")
          , ('5', "JKL")
          , ('6', "MNO")
          , ('7', "PQRS")
          , ('8', "TUV")
          , ('9', "WXYZ")
          ]

--
-- General utilities
--

lettersMap :: Map.Map Char [Char]
lettersMap = Map.fromList letters

numbersMap :: Map.Map Char Char
numbersMap = Map.fromList [ (l,c)  | (c,word) <- letters, l <- word]

getLetters :: Char -> [Char]
getLetters digit = fromJust $ Map.lookup digit lettersMap

getNumber :: Char -> Maybe Char
getNumber char =
    if isDigit char
    then Just char
    else Map.lookup (toUpper char) numbersMap

data WordTree = Node { nodeSubForest :: !(Map.Map Char WordTree) -- children
                     , nodeWords :: ![B.ByteString]              -- words that correspond to this node
                     } deriving (Eq, Read, Show)


--
-- Building the tree
--

emptyNode = Node { nodeSubForest = Map.empty
                 , nodeWords = []
                 }

buildWordTree :: [B.ByteString] -> WordTree
buildWordTree words =
    foldl' (\t w -> addWord w w t) emptyNode words

addWord :: B.ByteString -- ^ remaining characters
        -> B.ByteString -- ^ whole word
        -> WordTree     -- ^ input tree
        -> WordTree
addWord !chars !word !tree =
    if B.null chars
       then tree { nodeWords = addWordToList word (nodeWords tree) }
       else let c = B.head chars
                cs = B.tail chars
                md = getNumber c
            in case md of
                 Nothing -> tree -- no corresponding digit to letter, can't add word
                 Just d ->
                     let matching = Map.lookup d $ nodeSubForest tree
                         node = case matching of
                                  -- if node does not exist, need to create it
                                  Nothing -> emptyNode
                                  Just n -> n
                         node' = addWord cs word node
                     in tree { nodeSubForest = Map.insert d node' $ nodeSubForest tree }

addWordToList w ws =
    -- eliminate duplicates
    if toUpperBS w `elem` map toUpperBS ws
    then ws
    else w:ws

toUpperBS = B.map toUpper

--
-- Search the tree
--

-- | Find words that correspond exactly to the digits supplied
findWords :: B.ByteString -- ^ the string of digits
          -> WordTree
          -> [B.ByteString]
findWords chars tree =
    if B.null chars
       then nodeWords tree
       else let d = B.head chars
                ds = B.tail chars
                nd = Map.lookup d $ nodeSubForest tree
            in case nd of
                 Nothing -> []
                 Just n -> findWords ds n

--
-- Search for combos
--

{-

Model for understanding the problem:

The string of n digits is a river n steps wide.  Words that can be found in the
digits are like logs which are placed in the river at certain positions away
from the bank.

  0 7 6 6 3 9 6 7 3 7
+---------------------+
|   S O M E           |
|           W O R D   |
|           W O R D S |
|       M E W         |
|       N E W         |
+---------------------+

The aim is to get from one side of the river to the other by stepping on the
logs, and the full solution will find all possible routes, using all the logs.

If there are gaps which stop us crossing, or stop certain logs being used, we
can add strings of the original digits as logs.  For example, in the above case,
we need to add four 'digit logs' to allow all the word logs to be used:

  0 7 6 6 3 9 6 7 3 7
+---------------------+
|   S O M E           |
|           W O R D   |
|           W O R D S |
|       M E W         |
|       N E W         |
| 0                   |
|                   7 |
| 0 7 6               |
|             6 7 3 7 |
+---------------------+

This allows us to generate:

 0-some-words
 0-some-word-7
 076-new-6737
 076-mew-6737

So the algorithm is:

 - find all the 'word logs'
   - necessarily an O(n^2) algorithm, assuming we can check a
     string of digits in constant time
 - add in all necessary 'digit logs'
   - depends on the number and spacing of word logs
 - find all routes across the river
   - potentially O(k^n) solutions, it depends on input.

Note, in finding all routes across the river, we must never use more than one
'digit log' consecutively.  This restriction is needed because the algorithm for
adding 'digit logs' can mean we get digit logs that can be laid end to end.

-}

-- Info about a single interesting 'combination' is stored as a list of
-- ByteStrings, so we can display with dashes easily.
type Combo = [B.ByteString]

showCombo :: Combo -> B.ByteString
showCombo = B.intercalate (B.pack "-")

substring :: Int -> Int -> B.ByteString -> B.ByteString
substring start len word = B.take len $ B.drop start word

hasWord :: B.ByteString -> Bool
hasWord = any (not . isDigit) . B.unpack

-- WordMap - stores the info about words that can be found
-- at each (position,length).
type WordMap = [((Int,Int),[B.ByteString])]

findCombos :: B.ByteString -> WordTree -> [Combo]
findCombos digits tree =
    let wordMap = makeWordMap digits tree
        allMap = addDigitWords digits wordMap
    in findRoutes allMap

   where
     makeWordMap :: B.ByteString -> WordTree -> WordMap
     makeWordMap digits tree =
         do
           pos <- [0..(totalLength - 1)] -- starting position
           len <- [1..totalLength - pos] -- length
           let words = findWords (substring pos len digits) tree
           guard (not $ null words)
           return ((pos, len), words)

     addDigitWords :: B.ByteString -> WordMap -> WordMap
     addDigitWords digits wordMap =
     {-

     Adding the 'digit words'

      - for each 'end of word' (including the left bank):

        - find all the words that appear to the right - W
        - note the shortest of the nearest of these words - n
        - for each word w in W
          - if the start of w comes before the end of n, then add a digit word to fill
            in the interval up to w

      - in doing the above, ensure that duplicate digit words are not added.
      -}
         let wordPos = nub $ map fst $ wordMap
             endPositions = 0 : findEndPositions wordPos
         in (++) wordMap $ nub $ do
           pos <- endPositions
           let rightWords = wordsToRightOf pos wordPos
           let (n_pos,n_len) = head $ sort rightWords -- shortest of nearest words
           (w_pos,w_len) <- rightWords
           -- don't add 'digit word' if start position of current word
           -- is outside the range covered by (n_pos,n_len):
           guard (w_pos < n_pos + n_len)
           let gapWidth = w_pos - pos
           -- if we have a gap of zero, there is nothing to add:
           guard (gapWidth > 0)
           let digitWord = substring pos gapWidth digits
           return ((pos, gapWidth), [digitWord])

       where
         wordsToRightOf pos wordPos = [ (p,len) | (p,len) <- wordPos, p >= pos ]
                                      ++
                                      -- need to make sure there a word which represents the
                                      -- 'right bank', so that we can bridge a gap from last
                                      -- word to the end of the string if needed.
                                      [rightBank]
         findEndPositions wordPos = nub [ pos + len | (pos,len) <- wordPos ]
         rightBank = (totalLength, 1)

     findRoutes :: WordMap -> [Combo]
     findRoutes wordMap =
         let combos = findRoutes' wordMap 0 totalLength
         in
           -- it's possible for boring entries to slip through in the case
           -- where there are no words found at all
           case combos of
             [[seg]] | not $ hasWord seg -> []
             _ -> combos

     findRoutes' :: WordMap -> Int -> Int -> [Combo]
     findRoutes' words start end =
         do
           -- find words that start as position 'start':
           ((p,len),ws) <- filter (\((p,_),_) -> p == start) words
           w <- ws
           if p + len == end
              then return [w] -- last segment
              else do
                c <- findRoutes' words (start + len) end
                -- don't add multiple 'digit words':
                guard (hasWord w || (hasWord $ head c))
                return (w:c)

     --
     -- Shared definitions between different parts of algorithm
     --

     totalLength = B.length digits

--
-- Main
--

sum = foldl' (+) 0

getWords :: String -> IO [B.ByteString]
getWords dictFile = B.readFile dictFile >>= (return . B.lines)

countNodes t = 1 + (sum $ map countNodes (Map.elems $ nodeSubForest t))

countWords t = length (nodeWords t) + (sum $ map countWords (Map.elems $ nodeSubForest t))

main = do
  dictFile:_ <- getArgs
  ws <- getWords dictFile
  let wordTree = buildWordTree ws
  putStr "Words: "
  print (countWords wordTree)
  putStr "Nodes: "
  print (countNodes wordTree)
  mainLoop wordTree

mainLoop wordTree = do
  putStrLn "Enter phone number: "
  number' <- getLine
  putStrLn "Matches: "
  let number = filter isDigit number'
  let combos = findCombos (B.pack number) wordTree
  case length combos of
    0 -> putStrLn " [None]"
    _ -> mapM_ B.putStrLn $ map showCombo combos
  putStrLn ""

  mainLoop wordTree
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.