Source

Toy C#-ish compiler / CSharpLex.hs

Full commit
module CSharpLex where

import Data.Char
import Control.Monad
import ParseLib.Abstract

data Token = POpen    | PClose        -- parentheses     ()
           | SOpen    | SClose        -- square brackets []
           | COpen    | CClose        -- curly braces    {}
           | ToLambda | LCall
           | Arrow    | Derive
           | Comma    | Semicolon
           | Dot      | ScopeRes
           | KeyVar   | KeyStatic
           | KeyIf    | KeyElse
           | KeyWhile | KeyReturn
           | KeyClass | KeyVoid
           | KeyDelete| KeyNew
           | KeyFor   | KeyNull
           | StdType   String         -- the 8 standard types
           | Operator  String         -- the 15 operators
           | Id        String         -- identifiers
           | ConstInt  Int
           | ConstBool Bool
           | ConstChar Char
           deriving (Eq, Show, Ord)

lexDigit = satisfy isDigit
-- Characters that may be used as the second or later character in an
-- identifier.
lexLegalIdentChar = satisfy isAlphaNum
                <|> lexDigit
                <|> symbol '_'

keyword :: String -> Parser Char String
keyword []                    = succeed ""
keyword xs@(x:_) | isLetter x = do
                                  ys <- greedy lexLegalIdentChar
                                  guard $ xs == ys
                                  return ys
                 | otherwise  = token xs

greedyChoice :: [Parser s a] -> Parser s a
greedyChoice = foldr (<<|>) empty

terminals :: [(Token, String)]
terminals =
    [( POpen     , "("      )
    ,( PClose    , ")"      )
    ,( SOpen     , "["      )
    ,( SClose    , "]"      )
    ,( COpen     , "{"      )
    ,( CClose    , "}"      )
    ,( Comma     , ","      )
    ,( Semicolon , ";"      )
    ,( Dot       , "."      )
    ,( ScopeRes  , "::"     )
    ,( Derive    , ":"      )
    ,( ToLambda  , "@"      )
    ,( LCall     , "%"      )
    ,( Arrow     , "->"     )
    ,( KeyIf     , "if"     )
    ,( KeyElse   , "else"   )
    ,( KeyWhile  , "while"  )
    ,( KeyFor    , "for"    )
    ,( KeyReturn , "return" )
    ,( KeyStatic , "static" )
    ,( KeyClass  , "class"  )
    ,( KeyVoid   , "void"   )
    ,( KeyNew    , "new"    )
    ,( KeyDelete , "delete" )
    ,( KeyNull   , "null"   )
    ,( KeyVar    , "var"   )
    ]

-- I don't like being able to get the data from comments; the choice of what
-- data exactly to give is frustrating.
comment :: Parser Char String
comment = [] <$ token "/*" <* c'
      <|> [] <$ token "//" <* many (satisfy (/='\n'))
    where c' = greedy (satisfy (/='*')) <* symbol '*' <* ([] <$ symbol '/' <<|> c')

-- We ignore everything.
lexWhiteSpace :: Parser Char String
lexWhiteSpace = space <* many (comment <* space)
    where space = greedy $ satisfy isSpace

lexId :: Parser Char Token
lexId =  (\x xs -> Id (x:xs))
          <$> satisfy isAlpha
          <*> greedy lexLegalIdentChar

lexConstBool :: Parser Char Token
lexConstBool = ConstBool False <$ keyword "false"
           <|> ConstBool True <$ keyword "true"

lexConstInt :: Parser Char Token
lexConstInt = (ConstInt . read) <$> greedy1 lexDigit

lexConstChar :: Parser Char Token
lexConstChar = ConstChar <$ symbol '\'' <*> anySymbol <* symbol '\''

lexEnum :: (String -> Token) -> [String] -> Parser Char Token
lexEnum f xs = f <$> choice (map keyword xs)

lexTerminal :: Parser Char Token
lexTerminal = choice $ map (\(t,s) -> t <$ keyword s) terminals

-- I've cleaned out the types I'm not going to implement.  (By the way,
-- implementing char literals on a system that can't print them is a tiny bit
-- silly.)
stdTypes :: [String]
stdTypes = ["int", "bool", "char"]

operators :: [String]
operators = ["+", "-", "*", "/", "%", "&&", "||",
             "^", "<=", "<", ">=", ">", "==",
             "!=", "="]


lexToken :: Parser Char Token
lexToken = greedyChoice
             [ lexTerminal
             , lexEnum StdType stdTypes
             , lexEnum Operator operators
             , lexConstBool
             , lexConstChar
             , lexConstInt
             , lexId
             ]

lexicalScanner :: Parser Char [Token]
lexicalScanner = lexWhiteSpace *> greedy (lexToken <* lexWhiteSpace) <* eof


sStdType :: Parser Token Token
sStdType = satisfy isStdType
       where isStdType (StdType _) = True
             isStdType _           = False

sId :: Parser Token Token
sId = satisfy isId
       where isId (Id _) = True
             isId _      = False

sConst :: Parser Token Token
sConst = satisfy isConst
       where isConst (ConstInt  _) = True
             isConst (ConstBool _) = True
             isConst (ConstChar _) = True
             isConst KeyNull       = True
             isConst _             = False

sOperator :: [String] -> Parser Token Token
sOperator xs = satisfy isOperator
       where isOperator (Operator x) = x `elem` xs
             isOperator _            = False


sSemi :: Parser Token Token
sSemi = symbol Semicolon

sDerive :: Parser Token Token
sDerive = symbol Derive

sScopeRes :: Parser Token Token
sScopeRes = symbol ScopeRes