diff --git a/.gitignore b/.gitignore index a976ba5..6b45255 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *.hi -*.o \ No newline at end of file +*.o +input diff --git a/GedcomParser.hs b/GedcomParser.hs new file mode 100644 index 0000000..f17beae --- /dev/null +++ b/GedcomParser.hs @@ -0,0 +1,108 @@ +-- A GEDCOM to XML converter written using Parsec as a +-- solution for rubyquiz 6 (http://rubyquiz.com/quiz6.html). +-- Example GEDCOM document at +-- http://cpansearch.perl.org/src/PJCJ/Gedcom-1.16/royal.ged + +{-# LANGUAGE NoMonomorphismRestriction, RecordWildCards #-} + +module GedcomParser where + +import Text.Parsec hiding (spaces, Line) +import System.IO + +-- a line in a GEDCOM document +data Line = Line { + lineLevel :: Int, + lineTag :: String, + lineValue :: Maybe String, + lineId :: Maybe String + } deriving (Show) + +-- an element in a GEDCOM document +data Elem = Elem { + elemTag :: String, + elemValue :: Maybe String, + elemId :: Maybe String, + elemChildren :: [Elem] + } deriving (Show) + +indent n = concat . (replicate n) $ " " + +trimValue value = case value of + Nothing -> Nothing + Just v + | v == "" -> Nothing + | otherwise -> Just v + +normalizeValue = maybe "" id + +spaces = many (char ' ' <|> tab) +whitespaces = many (char ' ' <|> tab <|> newline) + +-- parses a line +line level = do + string (show level) + spaces + id <- optionMaybe $ between (char '@') (char '@') (many1 alphaNum) + spaces + tag <- many1 upper + spaces + value <- fmap trimValue $ optionMaybe $ manyTill (anyChar) newline + return $ Line level tag value id + +-- parses an element +element level = do + ml <- optionMaybe $ line level + case ml of + Nothing -> fail ("invalid level " ++ show level) + Just Line{..} -> do + children <- many (element $ level + 1) + return $ Elem lineTag lineValue lineId children + +-- parses a document +document = (element 0) `endBy` whitespaces + +-- normalizes an element by merging values of CONC and CONT +-- elements with parent element value +normalizeElem element = + let + conChildren = filter concOrCont $ elemChildren element + text = foldl (\t el -> t + ++ (if elemTag el == "CONC" then "\n" else " ") + ++ normalizeValue (elemValue el)) + "" conChildren + nonConChildren = filter (not . concOrCont) $ elemChildren element + in + element { elemValue = trimValue $ + Just (normalizeValue (elemValue element) ++ text), + elemChildren = map normalizeElem nonConChildren } + where + concOrCont el = elemTag el `elem` ["CONC", "CONT"] + +-- normalizes a document +normalizeDoc = map normalizeElem + +-- converts an element to XML +elemToXml indentation Elem{..} = + indent indentation + ++ "<" ++ elemTag + ++ maybe "" (\i -> " id=\"@" ++ i ++ "@\"") elemId + ++ case elemChildren of + [] -> ">" ++ normalizeValue elemValue ++ "" + _ -> maybe "" (\v -> " value=\"" ++ v ++ "\"") elemValue ++ ">\n" + ++ unlines (map (elemToXml (indentation + 1)) elemChildren) + ++ indent indentation ++ "" + +-- converts a document to XML +documentToXml doc = "\n" + ++ (unlines . map (elemToXml 1) $ doc) + ++ "" + +-- converts a GEDCOM document supplied through STDIN into XML +-- and prints to STDOUT +main = do + text <- getContents + case parse document "GEDCOM Parser" text of + Right [] -> return () + Right doc -> putStrLn $ documentToXml (normalizeDoc doc) + Left e -> print e \ No newline at end of file