-- | A GEDCOM to XML converter written using Parsec as a
-- solution for rubyquiz 6 (<http://rubyquiz.com/quiz6.html>).
--
-- Example GEDCOM document at
-- <http://cpansearch.perl.org/src/PJCJ/Gedcom-1.16/royal.ged>
--
-- Copyright 2012 Abhinav Sarkar \<abhinav\@abhinavsarkar.net\>

{-# LANGUAGE NoMonomorphismRestriction, RecordWildCards, FlexibleContexts #-}

module GedcomParser (Elem(..), Doc, document, documentToXml, main) where
import Text.Parsec hiding (spaces, Line)
import System.IO
-- a line in a GEDCOM document, as produced by the line parser
-- before lines are nested into elements
data Line = Line
  { lineLevel :: Int          -- nesting level the line was parsed at
  , lineTag   :: String       -- the line's tag
  , lineValue :: Maybe String -- text after the tag, if any
  , lineId    :: Maybe String -- identifier written between '@' signs, if any
  }
-- | An element in a GEDCOM document: one line together with all
-- the elements nested below it
data Elem = Elem
  { elemTag      :: String       -- ^ tag of the element
  , elemValue    :: Maybe String -- ^ value of the element, if any
  , elemId       :: Maybe String -- ^ identifier of the element, if any
  , elemChildren :: [Elem]       -- ^ elements nested one level deeper
  }
  deriving (Show)
-- | A GEDCOM document: the list of its top-level elements
type Doc = [Elem]
-- builds the indentation prefix for the given nesting depth: one
-- indentation unit per level, nothing for a depth of zero or less
indent :: Int -> String
indent depth = concatMap (const " ") [1 .. depth]
-- collapses an empty value into no value at all; any other
-- value (or an already absent one) is returned unchanged
trimValue :: Maybe String -> Maybe String
trimValue (Just "") = Nothing
trimValue other = other
-- turns an optional value into plain text, using the empty
-- string when there is no value
normalizeValue :: Maybe String -> String
normalizeValue Nothing = ""
normalizeValue (Just text) = text
-- parses a possibly empty run of blanks (space or tab characters)
-- and returns the characters consumed; replaces the Parsec parser
-- of the same name, which is hidden in the import list
spaces :: Stream s m Char => ParsecT s u m String
spaces = many blank
  where
    blank = char ' ' <|> tab
-- parses a possibly empty run of spaces, tabs and newlines
-- and returns the characters consumed
whitespaces :: Stream s m Char => ParsecT s u m String
whitespaces = many blankOrNewline
  where
    blankOrNewline = char ' ' <|> tab <|> newline
-- parses a single line of a GEDCOM document at the given level:
--
--   <level> [@<id>@] <TAG> [<value>]
--
-- The blanks separating the parts are skipped, so that a line such
-- as "0 @I1@ INDI" or "1 NAME Some Name" is accepted. The value
-- runs up to the end of the line; an empty value is reported as
-- 'Nothing'. The line may be terminated by a newline or by the end
-- of the input, so a final line without a trailing newline parses.
line :: Stream s m Char => Int -> ParsecT s u m Line
line level = do
  _ <- string (show level)
  _ <- spaces
  -- named 'ident' rather than 'id' to avoid shadowing Prelude's 'id'
  ident <- optionMaybe $ between (char '@') (char '@') (many1 alphaNum)
  _ <- spaces
  tag <- many1 upper
  _ <- spaces
  value <- fmap trimValue $ optionMaybe $ manyTill anyChar lineEnd
  return $ Line level tag value ident
  where
    -- a line ends at a newline or at the end of the input
    lineEnd = (newline >> return ()) <|> eof
-- parses an element: one line at the given level followed by any
-- number of elements at the next deeper level, which become its
-- children. Fails with an "invalid level" message when no line of
-- the given level can be parsed.
element :: Stream s m Char => Int -> ParsecT s u m Elem
element level = optionMaybe (line level) >>= maybe invalidLevel withChildren
  where
    invalidLevel = fail ("invalid level " ++ show level)
    withChildren parsed = do
      kids <- many (element (level + 1))
      return Elem
        { elemTag = lineTag parsed
        , elemValue = lineValue parsed
        , elemId = lineId parsed
        , elemChildren = kids
        }
-- | Parser to parse a GEDCOM document from a 'String'
--
-- Parses zero or more level 0 elements, each followed by optional
-- whitespace. The parser does not itself demand the end of the
-- input, so it can be combined with further parsers.
document :: Stream s m Char => ParsecT s u m Doc
document = element 0 `endBy` whitespaces
-- normalizes an element by merging values of CONC and CONT
-- elements with parent element value
--
-- The CONC and CONT children are removed from the element (along
-- with anything nested below them) and their values are appended,
-- in order, to the element's own value. Following the GEDCOM
-- standard, a CONT value starts on a new line while a CONC value is
-- joined directly, without any separator. The remaining children
-- are normalized recursively.
normalizeElem :: Elem -> Elem
normalizeElem parent =
  let conChildren = filter concOrCont $ elemChildren parent
      -- CONT continues on a new line; CONC splices the fragment on as-is
      separator el = if elemTag el == "CONT" then "\n" else ""
      -- concatMap keeps the append linear in the total text length
      text = concatMap (\el -> separator el ++ normalizeValue (elemValue el))
                       conChildren
      nonConChildren = filter (not . concOrCont) $ elemChildren parent
  in parent { elemValue = trimValue $
                Just (normalizeValue (elemValue parent) ++ text),
              elemChildren = map normalizeElem nonConChildren }

-- tells whether an element is a CONC or CONT element
concOrCont :: Elem -> Bool
concOrCont el = elemTag el `elem` ["CONC", "CONT"]
-- normalizes a document by normalizing each of its top-level
-- elements in turn
normalizeDoc :: Doc -> Doc
normalizeDoc doc = [normalizeElem el | el <- doc]
-- converts an element to XML, indented by the given depth
--
-- The XML element is named after the tag and carries the id, if
-- any, as an @id@ attribute wrapped in '@' signs. An element
-- without children is written as an empty tag when it has no value
-- and as a tag enclosing the value otherwise. An element with
-- children carries its value, if any, as a @value@ attribute and
-- encloses its children, one per line, at the next indentation
-- depth.
--
-- Ids and values are escaped, so that text containing markup
-- characters still yields well-formed XML.
elemToXml :: Int -> Elem -> String
elemToXml indentation Elem{..} =
  indent indentation
    ++ "<" ++ elemTag
    ++ maybe "" (\i -> " id=\"@" ++ escapeXml i ++ "@\"") elemId
    ++ case elemChildren of
      [] -> case normalizeValue elemValue of
        "" -> " />"
        text -> ">" ++ escapeXml text ++ "</" ++ elemTag ++ ">"
      _ -> maybe "" (\v -> " value=\"" ++ escapeXml v ++ "\"") elemValue ++ ">\n"
        ++ unlines (map (elemToXml (indentation + 1)) elemChildren)
        ++ indent indentation ++ "</" ++ elemTag ++ ">"
  where
    -- replaces the characters that are special in XML text and in
    -- double-quoted attribute values by their predefined entities
    escapeXml = concatMap escapeChar
    escapeChar '&' = "&amp;"
    escapeChar '<' = "&lt;"
    escapeChar '>' = "&gt;"
    escapeChar '"' = "&quot;"
    escapeChar c = [c]
-- | Converts a GEDCOM document to XML
--
-- The document is normalized first (CONC and CONT elements are
-- merged into their parents); its elements are then written one
-- per line at indentation depth 1 inside a @DOCUMENT@ root element.
documentToXml :: Doc -> String
documentToXml doc =
  "<DOCUMENT>\n"
    ++ unlines (map (elemToXml 1) (normalizeDoc doc))
    ++ "</DOCUMENT>"
-- | Converts a GEDCOM document supplied through STDIN into XML
-- and prints to STDOUT
--
-- Nothing is printed for an empty document. A parse error is
-- reported on STDERR so that it never gets mixed into the XML
-- output. The whole input has to be consumed by the parser: input
-- left over after the last element is reported as a parse error
-- instead of being dropped silently.
main :: IO ()
main = do
  text <- getContents
  case parse wholeDocument "GEDCOM Parser" text of
    Right [] -> return ()
    Right doc -> putStrLn $ documentToXml doc
    Left e -> hPrint stderr e
  where
    -- the document parser followed by a check for the end of input
    wholeDocument = do
      doc <- document
      eof
      return doc