From 7dc6db944f4ed3d92cf5e18a3e0524b64a132ebe Mon Sep 17 00:00:00 2001 From: Abhinav Sarkar Date: Sat, 19 Dec 2015 11:55:08 +0530 Subject: [PATCH] Adds incremental refresh sql generation for dimension tables. - Adds partial unique indexes for tables to handle null columns. - Adds validation for time column in fact tables. --- app/Main.hs | 29 +++++---- src/Ringo.hs | 7 ++- src/Ringo/Extractor/Internal.hs | 7 +-- src/Ringo/Generator.hs | 102 +++++++++++++++++++++----------- src/Ringo/Types.hs | 9 ++- src/Ringo/Validator.hs | 18 ++---- 6 files changed, 99 insertions(+), 73 deletions(-) diff --git a/app/Main.hs b/app/Main.hs index 377448e..1039b8a 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -14,7 +14,7 @@ import Ringo import Ringo.ArgParser import Ringo.InputParser -data SQLType = Create | Populate | Update deriving (Eq, Show) +data SQLType = Create | FullRefresh | IncRefresh deriving (Eq, Show) main :: IO () main = do @@ -36,30 +36,29 @@ writeSQLFiles outputDir env@Env{..} = forM_ sqls $ \(sqlType, table, sql) -> do createDirectoryIfMissing True dirName writeFile fileName sql where - dimTables = map (\fact -> (fact, extractDimensionTables env fact)) envFacts - factTables = map (\fact -> (fact, extractFactTable env fact)) envFacts + dimTables = [ (fact, extractDimensionTables env fact) | fact <- envFacts ] + factTables = [ (fact, extractFactTable env fact) | fact <- envFacts ] dimTableDefnSQLs = [ (Create, tableName table, unlines . map sqlStr . tableDefnSQL $ table) - | (_, tabs) <- dimTables - , table <- tabs + | (_, tabs) <- dimTables + , table <- tabs , table `notElem` envTables ] - factTableDefnSQLs = [ (Create - , tableName table, unlines . map sqlStr $ factTableDefnSQL env fact table) + + factTableDefnSQLs = [ (Create , tableName table, unlines . map sqlStr $ factTableDefnSQL env fact table) | (fact, table) <- factTables ] - dimTableInsertSQLs = [ (Populate - , tableName table - , sqlStr $ dimensionTableInsertSQL env fact (tableName table)) - | (fact, tabs) <- dimTables - , table <- tabs - , table `notElem` envTables ] + dimTablePopulateSQLs typ gen = [ (typ , tableName table, sqlStr $ gen env fact (tableName table)) + | (fact, tabs) <- dimTables + , table <- tabs + , table `notElem` envTables ] - factTableInsertSQLs = [ (Populate, tableName table, sqlStr $ factTableInsertSQL env fact) + factTableInsertSQLs = [ (FullRefresh, tableName table, sqlStr $ factTableInsertSQL env fact) | (fact, table) <- factTables ] sqls = concat [ dimTableDefnSQLs , factTableDefnSQLs - , dimTableInsertSQLs + , dimTablePopulateSQLs FullRefresh $ dimensionTablePopulateSQL FullPopulation + , dimTablePopulateSQLs IncRefresh $ dimensionTablePopulateSQL IncrementalPopulation , factTableInsertSQLs ] diff --git a/src/Ringo.hs b/src/Ringo.hs index 673879f..532e13b 100644 --- a/src/Ringo.hs +++ b/src/Ringo.hs @@ -4,7 +4,7 @@ module Ringo , extractDimensionTables , G.tableDefnSQL , factTableDefnSQL - , dimensionTableInsertSQL + , dimensionTablePopulateSQL , factTableInsertSQL , validateTable , validateFact @@ -27,8 +27,9 @@ extractDimensionTables env = flip runReader env . E.extractDimensionTables factTableDefnSQL :: Env -> Fact -> Table -> [Text] factTableDefnSQL env fact = flip runReader env . G.factTableDefnSQL fact -dimensionTableInsertSQL :: Env -> Fact -> TableName -> Text -dimensionTableInsertSQL env fact = flip runReader env . G.dimensionTableInsertSQL fact +dimensionTablePopulateSQL :: TablePopulationMode -> Env -> Fact -> TableName -> Text +dimensionTablePopulateSQL popMode env fact = + flip runReader env . G.dimensionTablePopulateSQL popMode fact factTableInsertSQL :: Env -> Fact -> Text factTableInsertSQL env = flip runReader env . G.factTableInsertSQL diff --git a/src/Ringo/Extractor/Internal.hs b/src/Ringo/Extractor/Internal.hs index d1ed874..7d0135a 100644 --- a/src/Ringo/Extractor/Internal.hs +++ b/src/Ringo/Extractor/Internal.hs @@ -10,7 +10,7 @@ import Control.Applicative ((<$>)) import Control.Monad.Reader (Reader, asks) import Data.Function (on) -import Data.Maybe (mapMaybe, fromMaybe, fromJust) +import Data.Maybe (mapMaybe, fromMaybe, fromJust, catMaybes) import Data.Monoid ((<>)) import Data.List (nub, nubBy) import Data.Text (Text) @@ -48,10 +48,7 @@ extractDimensionTables fact = do let table = fromJust . findTable (factTableName fact) $ tables return $ dimsFromIds tables ++ dimsFromVals settings (tableColumns table) where - dimsFromIds tables = - forMaybe (factColumns fact) $ \fcol -> case fcol of - DimId d _ -> findTable d tables - _ -> Nothing + dimsFromIds tables = catMaybes [ findTable d tables | DimId d _ <- factColumns fact ] dimsFromVals Settings {..} tableColumns = map (\(dim, cols) -> diff --git a/src/Ringo/Generator.hs b/src/Ringo/Generator.hs index 16131e4..d2fc140 100644 --- a/src/Ringo/Generator.hs +++ b/src/Ringo/Generator.hs @@ -1,7 +1,7 @@ module Ringo.Generator ( tableDefnSQL , factTableDefnSQL - , dimensionTableInsertSQL + , dimensionTablePopulateSQL , factTableInsertSQL ) where @@ -13,9 +13,10 @@ import Control.Applicative ((<$>)) #endif import Control.Monad.Reader (Reader, asks) -import Data.List (intersperse, nub, find) -import Data.Maybe (fromJust, mapMaybe) +import Data.List (nub, find, subsequences, partition, sortBy) +import Data.Maybe (fromJust, mapMaybe, catMaybes) import Data.Monoid ((<>)) +import Data.Ord (comparing) import Data.Text (Text) import Ringo.Extractor.Internal @@ -31,24 +32,44 @@ columnDefnSQL Column {..} = columnName <> " " <> columnType <> " " <> nullableDefnSQL columnNullable colNamesString :: [ColumnName] -> Text -colNamesString cNames = Text.concat (intersperse ", " cNames) +colNamesString = Text.intercalate ", " -constraintDefnSQL :: TableName -> TableConstraint -> Text -constraintDefnSQL tableName constraint = - "ALTER TABLE ONLY " <> tableName <> " ADD " - <> case constraint of - PrimaryKey cName -> "PRIMARY KEY (" <> cName <> ")" - UniqueKey cNames -> "UNIQUE (" <> colNamesString cNames <> ")" - ForeignKey oTableName cNamePairs -> - "FOREIGN KEY (" <> colNamesString (map fst cNamePairs) <> ") REFERENCES " - <> oTableName <> " (" <> colNamesString (map snd cNamePairs) <> ")" +fullColName :: TableName -> ColumnName -> ColumnName +fullColName tName cName = tName <> "." <> cName + +constraintDefnSQL :: Table -> TableConstraint -> [Text] +constraintDefnSQL Table {..} constraint = + let alterTableSQL = "ALTER TABLE ONLY " <> tableName <> " ADD " + in case constraint of + PrimaryKey cName -> [ alterTableSQL <> "PRIMARY KEY (" <> cName <> ")" ] + ForeignKey oTableName cNamePairs -> + [ alterTableSQL <> "FOREIGN KEY (" <> colNamesString (map fst cNamePairs) <> ") REFERENCES " + <> oTableName <> " (" <> colNamesString (map snd cNamePairs) <> ")" ] + UniqueKey cNames -> let + (notNullCols, nullCols) = + both (map columnName) + $ partition ((== NotNull) . columnNullable) + $ catMaybes [ findColumn cName tableColumns | cName <- cNames ] + combinations = + map (\cs -> (cs, [ c | c <- nullCols, c `notElem` cs ])) + . sortBy (comparing length) + $ subsequences nullCols + in [ "CREATE UNIQUE INDEX ON " <> tableName + <> " (" <> colNamesString (notNullCols ++ nnCols) <> ")" + <> if null whereClauses + then "" + else "\nWHERE "<> Text.intercalate "\nAND " whereClauses + | (nnCols, nCols) <- combinations + , not $ null (notNullCols ++ nnCols) + , let whereClauses = + [ c <> " IS NOT NULL" | c <- nnCols ] ++ [ c <> " IS NULL" | c <- nCols ] ] tableDefnSQL :: Table -> [Text] -tableDefnSQL Table {..} = - tableSQL : map (constraintDefnSQL tableName) tableConstraints +tableDefnSQL table@Table {..} = + tableSQL : concatMap (constraintDefnSQL table) tableConstraints where tableSQL = "CREATE TABLE " <> tableName <> " (\n" - <> (Text.concat . intersperse ",\n" . map columnDefnSQL $ tableColumns) + <> (Text.intercalate ",\n" . map columnDefnSQL $ tableColumns) <> "\n)" factTableDefnSQL :: Fact -> Table -> Reader Env [Text] @@ -71,21 +92,33 @@ factTableDefnSQL fact table = do dimColumnMapping :: Text -> Fact -> TableName -> [(ColumnName, ColumnName)] dimColumnMapping dimPrefix fact dimTableName = - forMaybe (factColumns fact) $ \fCol -> case fCol of - DimVal dName cName | dimPrefix <> dName == dimTableName -> - Just (dimColumnName dName cName, cName) - _ -> Nothing + [ (dimColumnName dName cName, cName) + | DimVal dName cName <- factColumns fact , dimPrefix <> dName == dimTableName] -dimensionTableInsertSQL :: Fact -> TableName -> Reader Env Text -dimensionTableInsertSQL fact dimTableName = do - dimPrefix <- settingDimPrefix <$> asks envSettings - let colMapping = dimColumnMapping dimPrefix fact dimTableName - - return $ "INSERT INTO " <> dimTableName <> " (\n" - <> colNamesString (map fst colMapping) - <> "\n) SELECT DISTINCT \n" - <> colNamesString (map snd colMapping) - <> "\nFROM " <> factTableName fact +dimensionTablePopulateSQL :: TablePopulationMode -> Fact -> TableName -> Reader Env Text +dimensionTablePopulateSQL popMode fact dimTableName = do + dimPrefix <- settingDimPrefix <$> asks envSettings + let colMapping = dimColumnMapping dimPrefix fact dimTableName + baseSelectC = "SELECT DISTINCT\n" <> colNamesString (map snd colMapping) <> "\n" + <> "FROM " <> factTableName fact + insertC selectC = "INSERT INTO " <> dimTableName + <> " (\n" <> colNamesString (map fst colMapping) <> "\n) " + <> "SELECT x.* FROM (\n" <> selectC <> ") x" + timeCol = head [ cName | DimTime cName <- factColumns fact ] + return $ case popMode of + FullPopulation -> insertC baseSelectC + IncrementalPopulation -> + insertC (baseSelectC <> "\nWHERE " + <> timeCol <> " > ? AND " <> timeCol <> " <= ?" + <> " AND (\n" + <> Text.intercalate "\nOR " [ c <> " IS NOT NULL" | (_, c) <- colMapping ] + <> "\n)") + <> "\nLEFT JOIN " <> dimTableName <> " ON\n" + <> Text.intercalate " \nAND " + [ fullColName dimTableName c1 <> " IS NOT DISTINCT FROM " <> fullColName "x" c2 + | (c1, c2) <- colMapping ] + <> "\nWHERE " <> Text.intercalate " \nAND " + [ fullColName dimTableName c <> " IS NULL" | (c, _) <- colMapping ] factTableInsertSQL :: Fact -> Reader Env Text factTableInsertSQL fact = do @@ -131,7 +164,7 @@ factTableInsertSQL fact = do [ fullColName tableName c1 <> " = " <> fullColName factSourceTableName c2 | (c1, c2) <- dimColumnMapping settingDimPrefix dimFact tableName ] in "SELECT " <> dimIdColName <> " FROM " <> tableName <> "\nWHERE " - <> (Text.concat . intersperse "\n AND " $ dimLookupWhereClauses) + <> (Text.intercalate "\n AND " $ dimLookupWhereClauses) in (colName, insertSQL, True) colMap = [ (cName, if addAs then asName cName sql else sql, addAs) @@ -147,18 +180,17 @@ factTableInsertSQL fact = do <> extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit <> " (\n" <> unlineCols (map fst3 colMap) <> "\n)" <> "\nSELECT \n" <> unlineCols (map snd3 colMap) - <> "\nFROM " <> fTableName <> "\n" <> Text.concat (intersperse "\n" joinClauses) + <> "\nFROM " <> fTableName <> "\n" <> Text.intercalate"\n" joinClauses <> "\nGROUP BY \n" <> unlineCols (map ((groupByColPrefix <>) . fst3) . filter thd3 $ colMap) where groupByColPrefix = "xxff_" - fullColName tName cName = tName <> "." <> cName asName cName sql = "(" <> sql <> ")" <> " as " <> groupByColPrefix <> cName - unlineCols = Text.concat . intersperse ",\n " + unlineCols = Text.intercalate ",\n " joinClausePreds table oTableName = fmap (\(ForeignKey _ colPairs) -> - Text.concat . intersperse " AND " + Text.intercalate " AND " . map (\(c1, c2) -> fullColName (tableName table) c1 <> " = " <> fullColName oTableName c2) $ colPairs ) . find (\cons -> case cons of diff --git a/src/Ringo/Types.hs b/src/Ringo/Types.hs index 59bc797..019a5b9 100644 --- a/src/Ringo/Types.hs +++ b/src/Ringo/Types.hs @@ -93,9 +93,10 @@ defSettings = Settings , settingFactInfix = "_by_" } -data ValidationError = MissingTable !TableName - | MissingFact !TableName - | MissingColumn !TableName !ColumnName +data ValidationError = MissingTable !TableName + | MissingFact !TableName + | MissingColumn !TableName !ColumnName + | MissingTimeColumn !TableName deriving (Eq, Show) data Env = Env @@ -103,3 +104,5 @@ data Env = Env , envFacts :: ![Fact] , envSettings :: !Settings } deriving (Eq, Show) + +data TablePopulationMode = FullPopulation | IncrementalPopulation deriving (Eq, Show) diff --git a/src/Ringo/Validator.hs b/src/Ringo/Validator.hs index da0fc0c..14448ab 100644 --- a/src/Ringo/Validator.hs +++ b/src/Ringo/Validator.hs @@ -1,7 +1,6 @@ module Ringo.Validator ( validateTable , validateFact - , withFactValidation ) where #if MIN_VERSION_base(4,8,0) @@ -40,10 +39,12 @@ validateFact Fact {..} = do case findTable factTableName tables of Nothing -> return [ MissingTable factTableName ] Just table -> do - tableVs <- validateTable table - parentVs <- concat <$> mapM checkFactParents factParentNames - let colVs = concatMap (checkColumn tables table) factColumns - return $ tableVs ++ parentVs ++ colVs + tableVs <- validateTable table + parentVs <- concat <$> mapM checkFactParents factParentNames + let colVs = concatMap (checkColumn tables table) factColumns + let timeVs = [ MissingTimeColumn factTableName + | null [ c | DimTime c <- factColumns ] ] + return $ tableVs ++ parentVs ++ colVs ++ timeVs where checkFactParents fName = do facts <- asks envFacts @@ -58,10 +59,3 @@ validateFact Fact {..} = do checkColumnTable tables factCol = case factCol of DimId tName _ -> maybe [ MissingTable tName ] (const []) $ findTable tName tables _ -> [] - -withFactValidation :: Fact -> Reader Env a -> Reader Env (Either [ValidationError] a) -withFactValidation fact func = do - errors <- validateFact fact - if not $ null errors - then return $ Left errors - else fmap Right func