Commits

Sergey Astanin  committed f0a4120

snus, utility to create and update library index; DB schema

  • Participants
  • Parent commits 40adc2d

Comments (0)

Files changed (4)

+module Book (Book(..), readBookInfo) where
+
+import Data.ByteString.Lazy.Internal (smallChunkSize)
+import Data.List (intercalate)
+
+import FB2
+
+-- TODO: consider structured author
+data Book = Book {
+    authors :: [String]
+  , title :: String
+  , genres :: [String]
+  , date :: String
+  , lang :: String
+  , archive :: String
+  , path :: String
+  , size :: Int
+  } deriving (Eq)
+
+instance Show Book where
+  show b = unlines $
+    [ (intercalate ", " $ authors b) ++ " /"
+    , "  " ++ title b
+    , "  " ++ date b
+    , "  " ++ (intercalate " " $ genres b)
+    , "  lang : " ++ lang b
+    , "  in   : " ++ (archive b) ++ " : " ++ (path b)
+    ]
+
+unknownBook :: Book
+unknownBook = Book {
+    authors = []
+  , title = ""
+  , genres = []
+  , date = ""
+  , lang = ""
+  , archive = ""
+  , path = ""
+  , size = 0
+  }
+
+readBookInfo :: String -> Book
+readBookInfo bytes =
+  let -- in FB2 all meta data is usually in the beginning,
+      -- we don't need to parse everything, smallChunkSize is just a guess (4k)
+      -- which seems to work on most of the real-world files
+      doc = convParseXml $ take smallChunkSize bytes
+      t = getTitle doc
+      as = getAuthors doc
+      g = getGenres doc
+      l = getLang doc
+      d = getDate doc
+      sz = length bytes
+  in  unknownBook { authors = as, title = t, genres = g
+                  , lang = l, date = d, size = sz }
+
+{-# LANGUAGE FlexibleContexts #-}
+module DB
+  ( initDB, insertBook
+  , Connection(), commit, rollback, withTransaction, disconnect
+  , catchSql, SqlError(..)) where
+
+import System.FilePath
+import System.Directory
+
+import Database.HDBC
+import Database.HDBC.Sqlite3
+import Data.Convertible.Base (Convertible)
+
+import Book
+
+dbdir :: IO FilePath
+dbdir = getAppUserDataDirectory "snusmumrik"
+
+dbfile :: IO FilePath
+dbfile = do
+  dir <- dbdir
+  return $ dir </> "library.sqlite"
+
+initDB :: IO Connection
+initDB = do
+  dir <- dbdir
+  createDirectoryIfMissing True dir
+  file <- dbfile
+  db <- connectSqlite3 file
+  createTables db
+  return db
+
+-- TODO: normalize DB schema, e.g. many authors, genres, langs per book
+
+createTables :: Connection -> IO ()
+createTables db = do
+  run db "create table if not exists \
+         \  books ( \
+         \  id      INTEGER PRIMARY KEY AUTOINCREMENT, \
+         \  title   TEXT, \
+         \  lang    TEXT, \
+         \  date    TEXT, \
+         \  archive TEXT, \
+         \  path    TEXT, \
+         \  size    INTEGER, \
+         \  unique (archive, path) on conflict ignore )" []
+  run db "create table if not exists \
+         \ authors ( \
+         \  id      INTEGER PRIMARY KEY AUTOINCREMENT, \
+         \  name    TEXT   unique on conflict ignore)" []
+  run db "create table if not exists \
+         \ bookauthors ( \
+         \  author_id  INTEGER REFERENCES authors, \
+         \  book_id    INTEGER REFERENCES books)" []
+  run db "create table if not exists \
+         \ genres ( \
+         \  id      INTEGER PRIMARY KEY AUTOINCREMENT, \
+         \  genre   TEXT unique on conflict ignore)" []
+  run db "create table if not exists \
+         \ bookgenres ( \
+         \  genre_id   INTEGER REFERENCES genres, \
+         \  book_id    INTEGER REFERENCES books)" []
+  run db "create index if not exists \
+         \ authorix on authors ( name )" []
+  run db "create index if not exists \
+         \ titleix on books ( title )" []
+  run db "create index if not exists \
+         \ genreix on genres ( genre )" []
+  run db "create index if not exists \
+         \ locationix on books ( archive, path )" []
+  return ()
+
+insertBook :: Connection -> Book -> IO ()
+insertBook db book = do
+  -- insert book
+  run db "insert into books \
+         \  ( title, lang, date, size, archive, path ) \
+         \  values ( ?, ?, ?, ?, ?, ?)" bookValues
+  -- or update
+  -- TODO: implement DB update
+  -- insert authors
+  insertAuthors <- prepare db "insert into authors ( name ) values ( ? )"
+  executeMany insertAuthors $ map (:[]) $ authorNames
+  -- authors to books correspondence
+  insertBookAuthors <- prepare db $ "insert into bookauthors \
+         \ ( author_id, book_id ) \
+         \ select authors.id, books.id \
+         \   from authors, books \
+         \   where books.archive = ? and books.path = ? \
+         \     and authors.name = ?"
+  executeMany insertBookAuthors bookAuthors
+  -- insert genres
+  insertGenre <- prepare db "insert into genres ( genre ) values ( ? )"
+  executeMany insertGenre $ map (:[]) genreNames
+  -- genres to books correspondence
+  insertBookGenres <- prepare db $ "insert into bookgenres \
+         \ ( genre_id, book_id ) \
+         \ select g.id, b.id \
+         \   from genres g, books b \
+         \   where b.archive = ? and b.path = ? \
+         \     and g.genre = ?"
+  executeMany insertBookGenres bookGenres
+  return ()
+  where bookValues = (map fToSql [ title, lang, date ])
+                     ++ [fToSql size]
+                     ++ (map fToSql [ archive, path ])
+        fToSql :: (Convertible a SqlValue) => (Book -> a) -> SqlValue
+        fToSql f = toSql $ f book
+        -- archive, path must be last:
+        bookLoc = reverse . take 2 . reverse $ bookValues
+        authorNames :: [SqlValue]
+        authorNames = map toSql $ authors book
+        bookAuthors :: [[SqlValue]]
+        bookAuthors = map (\a -> bookLoc ++ [a]) authorNames
+        genreNames :: [SqlValue]
+        genreNames = map toSql $ genres book
+        bookGenres :: [[SqlValue]]
+        bookGenres = map (\g -> bookLoc ++ [g]) genreNames
+module FB2
+   ( readByteFile, convParseXml
+   , getAuthors, getTitle, getGenres
+   , getLang, getDate) where
+
+import qualified Data.ByteString.Lazy.Char8 as Lazy
+import Data.ByteString.Lazy.UTF8 (toString)
+
+import Data.Char (isSpace)
+import Data.List (intercalate)
+import Data.Maybe (maybeToList,listToMaybe,fromMaybe)
+
+import Text.XML.Light
+import Codec.Text.IConv (convertFuzzy,Fuzzy(..))
+
+{-
+import System.Environment (getArgs)
+import Control.Monad (mapM_, forM_)
+import Data.ByteString.Lazy.Internal (smallChunkSize)
+main = getArgs >>= mapM_ showInfo
+
+showInfo fname = do
+  raw <- readByteFile fname
+  let doc = convParseXml $ take smallChunkSize raw -- metainfo is in the beginning
+  let t = getTitle doc
+  let a = getAuthors doc
+  let g = getGenres doc
+  let l = getLang doc
+  let d = getDate doc
+  putStrLn $ intercalate ", " a
+  putStrLn $ " " ++ t
+  putStrLn $ " " ++ d
+  putStrLn $ " G: " ++ (intercalate ", " g)
+  putStrLn $ " L: " ++ l
+-}
+
+-- | Read file as a sequence of bytes.
+readByteFile :: String -> IO String
+readByteFile f = return . Lazy.unpack =<< Lazy.readFile f
+
+-- | Convert XML according to document encoding and parse it.
+convParseXml :: String -> [Element]
+convParseXml s =
+  let doc =  onlyElems . parseXML $ s
+      elm1 = head doc
+      convertFrom :: String -> String -> String
+      convertFrom enc = toString . convertFuzzy Transliterate enc "UTF-8" . Lazy.pack
+  in
+    if (qName . elName $ elm1) /= "?xml"
+      then doc
+      else case findAttr (unqual "encoding") elm1 of
+        Nothing -> doc
+        Just enc -> onlyElems . parseXML . convertFrom enc $ s
+
+--
+-- FB2 XML machinery
+--
+
+fbURL :: Maybe String
+fbURL = Just "http://www.gribuser.ru/xml/fictionbook/2.0"
+fbname :: String -> QName
+fbname name = QName name fbURL Nothing
+
+type Filter = [Element] -> [Element]
+
+-- | All descendent sub-elements with a given name.
+subElems :: String -> Filter
+subElems name elms = concatMap (findElements (fbname name)) elms
+
+-- | All text contents of given elements.
+txt :: [Element] -> [String]
+txt elms = map cdData . concatMap (onlyText . elContent) $ elms
+
+-- | Attribute value.
+attr :: String -> Element -> Maybe String
+attr name elm = findAttr (fbname name) elm
+
+getPath :: [String] -> Filter
+getPath names elms =
+  let follow = foldl (\f n -> \els-> subElems n $ f els) id' names :: Filter
+      id' = id :: Filter
+  in  follow elms
+
+-- | Text contents or attrubute values located by given path.
+-- If the last name in the path starts with @, lookup attribute value.
+getValues :: [String] -> [Element] -> [String]
+getValues [] _ = []
+getValues path elms =
+  let tip = last path
+      path' = init path -- if the last item in path is attribute name
+  in  case tip of
+      '@':name -> concatMap maybeToList . map (attr name) . getPath path' $ elms
+      _        -> txt . getPath path $ elms
+
+--
+-- FB2 meta information readers
+--
+
+metaPath :: [String]
+metaPath = [ "FictionBook", "description", "title-info" ]
+authorPath :: [String]
+authorPath = metaPath ++ [ "author" ]
+booktitlePath :: [String]
+booktitlePath = metaPath ++ [ "book-title" ]
+genrePath :: [String]
+genrePath = metaPath ++ [ "genre" ]
+langPath :: [String]
+langPath = metaPath ++ [ "lang" ]
+datePath :: [String]
+datePath = metaPath ++ [ "date" ]
+
+getTitle :: [Element] -> String
+getTitle doc = strip $ firstOrEmpty $ getValues booktitlePath doc
+
+getAuthors :: [Element] -> [String]
+getAuthors doc = map getAuthor . getPath authorPath $ doc
+  where getAuthor :: Element -> String
+        getAuthor e =
+          let ln = map strip $ getValues ["last-name"] [e]
+              fn = map strip $ getValues ["first-name"] [e]
+              mn = map strip $ getValues ["middle-name"] [e]
+          in  intercalate " " $ ln ++ fn ++ mn
+
+getGenres :: [Element] -> [String]
+getGenres doc = map strip $ getValues genrePath doc
+
+getLang :: [Element] -> String
+getLang doc = strip $ firstOrEmpty $ getValues langPath doc
+
+getDate :: [Element] -> String
+getDate doc = strip $ firstOrEmpty $ getValues datePath doc
+
+--
+-- Utilities
+--
+
+strip :: String -> String
+strip = dropWhile isSpace . reverse . dropWhile isSpace . reverse
+
+firstOrEmpty :: [String] -> String
+firstOrEmpty = fromMaybe "" . listToMaybe
+module Main where
+
+import Control.Monad (forM_)
+import System.Directory (canonicalizePath)
+import System.Environment (getArgs)
+import IO (stderr, hPutStrLn)
+
+import qualified System.IO.UTF8 as U
+
+import Codec.Archive.LibZip
+
+import DB
+import Book
+
+-- title-info of the FB2 file is expected to be found within first 8 kbytes.
+fbHeadSize :: Int
+fbHeadSize = 8*1024
+
+main :: IO ()
+main = do
+  db <- initDB
+  args <- getArgs
+  forM_ args $ \arg -> do
+    catchZipError $ do
+      zipfile <- canonicalizePath arg
+      withZip zipfile [] $ \z -> do
+        files <- getFiles z []
+        forM_ files $ \file -> do
+          -- TODO: implement lazy reading in LibZip
+          txt <- readZipFileHead' z file [] fbHeadSize
+          sz <- getFileSize z file []
+          let info = let i = readBookInfo $ toString txt
+                     in  i { archive = zipfile, path = file, size = sz }
+          catchSql (insert file db info)
+            $ \e -> putErr $ "DB error: " ++ file ++ ": " ++ (seErrorMsg e)
+    $ \e -> putErr $ "Archive error: " ++ arg ++ ": " ++ (show e)
+  where
+    insert :: String -> Connection -> Book -> IO ()
+    insert fname db info = do
+      withTransaction db $ \db' -> insertBook db' info
+      U.putStrLn $ fname ++ ":" ++ (concat $ take 2 $ lines $ show info)
+
+putErr :: String -> IO ()
+putErr msg = hPutStrLn stderr msg
+
+toString :: [Word8] -> String
+toString = map w2c
+  where w2c = toEnum . fromEnum