Adds incremental refresh sql generation for dimension tables.

- Adds partial unique indexes for tables to handle null columns.
- Adds validation for time column in fact tables.
pull/1/head
Abhinav Sarkar 2015-12-19 11:55:08 +05:30
parent e46d3684cd
commit 7dc6db944f
6 changed files with 99 additions and 73 deletions

View File

@ -14,7 +14,7 @@ import Ringo
import Ringo.ArgParser import Ringo.ArgParser
import Ringo.InputParser import Ringo.InputParser
data SQLType = Create | Populate | Update deriving (Eq, Show) data SQLType = Create | FullRefresh | IncRefresh deriving (Eq, Show)
main :: IO () main :: IO ()
main = do main = do
@ -36,30 +36,29 @@ writeSQLFiles outputDir env@Env{..} = forM_ sqls $ \(sqlType, table, sql) -> do
createDirectoryIfMissing True dirName createDirectoryIfMissing True dirName
writeFile fileName sql writeFile fileName sql
where where
dimTables = map (\fact -> (fact, extractDimensionTables env fact)) envFacts dimTables = [ (fact, extractDimensionTables env fact) | fact <- envFacts ]
factTables = map (\fact -> (fact, extractFactTable env fact)) envFacts factTables = [ (fact, extractFactTable env fact) | fact <- envFacts ]
dimTableDefnSQLs = [ (Create, tableName table, unlines . map sqlStr . tableDefnSQL $ table) dimTableDefnSQLs = [ (Create, tableName table, unlines . map sqlStr . tableDefnSQL $ table)
| (_, tabs) <- dimTables | (_, tabs) <- dimTables
, table <- tabs , table <- tabs
, table `notElem` envTables ] , table `notElem` envTables ]
factTableDefnSQLs = [ (Create
, tableName table, unlines . map sqlStr $ factTableDefnSQL env fact table) factTableDefnSQLs = [ (Create , tableName table, unlines . map sqlStr $ factTableDefnSQL env fact table)
| (fact, table) <- factTables ] | (fact, table) <- factTables ]
dimTableInsertSQLs = [ (Populate dimTablePopulateSQLs typ gen = [ (typ , tableName table, sqlStr $ gen env fact (tableName table))
, tableName table | (fact, tabs) <- dimTables
, sqlStr $ dimensionTableInsertSQL env fact (tableName table)) , table <- tabs
| (fact, tabs) <- dimTables , table `notElem` envTables ]
, table <- tabs
, table `notElem` envTables ]
factTableInsertSQLs = [ (Populate, tableName table, sqlStr $ factTableInsertSQL env fact) factTableInsertSQLs = [ (FullRefresh, tableName table, sqlStr $ factTableInsertSQL env fact)
| (fact, table) <- factTables ] | (fact, table) <- factTables ]
sqls = concat [ dimTableDefnSQLs sqls = concat [ dimTableDefnSQLs
, factTableDefnSQLs , factTableDefnSQLs
, dimTableInsertSQLs , dimTablePopulateSQLs FullRefresh $ dimensionTablePopulateSQL FullPopulation
, dimTablePopulateSQLs IncRefresh $ dimensionTablePopulateSQL IncrementalPopulation
, factTableInsertSQLs , factTableInsertSQLs
] ]

View File

@ -4,7 +4,7 @@ module Ringo
, extractDimensionTables , extractDimensionTables
, G.tableDefnSQL , G.tableDefnSQL
, factTableDefnSQL , factTableDefnSQL
, dimensionTableInsertSQL , dimensionTablePopulateSQL
, factTableInsertSQL , factTableInsertSQL
, validateTable , validateTable
, validateFact , validateFact
@ -27,8 +27,9 @@ extractDimensionTables env = flip runReader env . E.extractDimensionTables
factTableDefnSQL :: Env -> Fact -> Table -> [Text] factTableDefnSQL :: Env -> Fact -> Table -> [Text]
factTableDefnSQL env fact = flip runReader env . G.factTableDefnSQL fact factTableDefnSQL env fact = flip runReader env . G.factTableDefnSQL fact
dimensionTableInsertSQL :: Env -> Fact -> TableName -> Text dimensionTablePopulateSQL :: TablePopulationMode -> Env -> Fact -> TableName -> Text
dimensionTableInsertSQL env fact = flip runReader env . G.dimensionTableInsertSQL fact dimensionTablePopulateSQL popMode env fact =
flip runReader env . G.dimensionTablePopulateSQL popMode fact
factTableInsertSQL :: Env -> Fact -> Text factTableInsertSQL :: Env -> Fact -> Text
factTableInsertSQL env = flip runReader env . G.factTableInsertSQL factTableInsertSQL env = flip runReader env . G.factTableInsertSQL

View File

@ -10,7 +10,7 @@ import Control.Applicative ((<$>))
import Control.Monad.Reader (Reader, asks) import Control.Monad.Reader (Reader, asks)
import Data.Function (on) import Data.Function (on)
import Data.Maybe (mapMaybe, fromMaybe, fromJust) import Data.Maybe (mapMaybe, fromMaybe, fromJust, catMaybes)
import Data.Monoid ((<>)) import Data.Monoid ((<>))
import Data.List (nub, nubBy) import Data.List (nub, nubBy)
import Data.Text (Text) import Data.Text (Text)
@ -48,10 +48,7 @@ extractDimensionTables fact = do
let table = fromJust . findTable (factTableName fact) $ tables let table = fromJust . findTable (factTableName fact) $ tables
return $ dimsFromIds tables ++ dimsFromVals settings (tableColumns table) return $ dimsFromIds tables ++ dimsFromVals settings (tableColumns table)
where where
dimsFromIds tables = dimsFromIds tables = catMaybes [ findTable d tables | DimId d _ <- factColumns fact ]
forMaybe (factColumns fact) $ \fcol -> case fcol of
DimId d _ -> findTable d tables
_ -> Nothing
dimsFromVals Settings {..} tableColumns = dimsFromVals Settings {..} tableColumns =
map (\(dim, cols) -> map (\(dim, cols) ->

View File

@ -1,7 +1,7 @@
module Ringo.Generator module Ringo.Generator
( tableDefnSQL ( tableDefnSQL
, factTableDefnSQL , factTableDefnSQL
, dimensionTableInsertSQL , dimensionTablePopulateSQL
, factTableInsertSQL , factTableInsertSQL
) where ) where
@ -13,9 +13,10 @@ import Control.Applicative ((<$>))
#endif #endif
import Control.Monad.Reader (Reader, asks) import Control.Monad.Reader (Reader, asks)
import Data.List (intersperse, nub, find) import Data.List (nub, find, subsequences, partition, sortBy)
import Data.Maybe (fromJust, mapMaybe) import Data.Maybe (fromJust, mapMaybe, catMaybes)
import Data.Monoid ((<>)) import Data.Monoid ((<>))
import Data.Ord (comparing)
import Data.Text (Text) import Data.Text (Text)
import Ringo.Extractor.Internal import Ringo.Extractor.Internal
@ -31,24 +32,44 @@ columnDefnSQL Column {..} =
columnName <> " " <> columnType <> " " <> nullableDefnSQL columnNullable columnName <> " " <> columnType <> " " <> nullableDefnSQL columnNullable
colNamesString :: [ColumnName] -> Text colNamesString :: [ColumnName] -> Text
colNamesString cNames = Text.concat (intersperse ", " cNames) colNamesString = Text.intercalate ", "
constraintDefnSQL :: TableName -> TableConstraint -> Text fullColName :: TableName -> ColumnName -> ColumnName
constraintDefnSQL tableName constraint = fullColName tName cName = tName <> "." <> cName
"ALTER TABLE ONLY " <> tableName <> " ADD "
<> case constraint of constraintDefnSQL :: Table -> TableConstraint -> [Text]
PrimaryKey cName -> "PRIMARY KEY (" <> cName <> ")" constraintDefnSQL Table {..} constraint =
UniqueKey cNames -> "UNIQUE (" <> colNamesString cNames <> ")" let alterTableSQL = "ALTER TABLE ONLY " <> tableName <> " ADD "
ForeignKey oTableName cNamePairs -> in case constraint of
"FOREIGN KEY (" <> colNamesString (map fst cNamePairs) <> ") REFERENCES " PrimaryKey cName -> [ alterTableSQL <> "PRIMARY KEY (" <> cName <> ")" ]
<> oTableName <> " (" <> colNamesString (map snd cNamePairs) <> ")" ForeignKey oTableName cNamePairs ->
[ alterTableSQL <> "FOREIGN KEY (" <> colNamesString (map fst cNamePairs) <> ") REFERENCES "
<> oTableName <> " (" <> colNamesString (map snd cNamePairs) <> ")" ]
UniqueKey cNames -> let
(notNullCols, nullCols) =
both (map columnName)
$ partition ((== NotNull) . columnNullable)
$ catMaybes [ findColumn cName tableColumns | cName <- cNames ]
combinations =
map (\cs -> (cs, [ c | c <- nullCols, c `notElem` cs ]))
. sortBy (comparing length)
$ subsequences nullCols
in [ "CREATE UNIQUE INDEX ON " <> tableName
<> " (" <> colNamesString (notNullCols ++ nnCols) <> ")"
<> if null whereClauses
then ""
else "\nWHERE "<> Text.intercalate "\nAND " whereClauses
| (nnCols, nCols) <- combinations
, not $ null (notNullCols ++ nnCols)
, let whereClauses =
[ c <> " IS NOT NULL" | c <- nnCols ] ++ [ c <> " IS NULL" | c <- nCols ] ]
tableDefnSQL :: Table -> [Text] tableDefnSQL :: Table -> [Text]
tableDefnSQL Table {..} = tableDefnSQL table@Table {..} =
tableSQL : map (constraintDefnSQL tableName) tableConstraints tableSQL : concatMap (constraintDefnSQL table) tableConstraints
where where
tableSQL = "CREATE TABLE " <> tableName <> " (\n" tableSQL = "CREATE TABLE " <> tableName <> " (\n"
<> (Text.concat . intersperse ",\n" . map columnDefnSQL $ tableColumns) <> (Text.intercalate ",\n" . map columnDefnSQL $ tableColumns)
<> "\n)" <> "\n)"
factTableDefnSQL :: Fact -> Table -> Reader Env [Text] factTableDefnSQL :: Fact -> Table -> Reader Env [Text]
@ -71,21 +92,33 @@ factTableDefnSQL fact table = do
dimColumnMapping :: Text -> Fact -> TableName -> [(ColumnName, ColumnName)] dimColumnMapping :: Text -> Fact -> TableName -> [(ColumnName, ColumnName)]
dimColumnMapping dimPrefix fact dimTableName = dimColumnMapping dimPrefix fact dimTableName =
forMaybe (factColumns fact) $ \fCol -> case fCol of [ (dimColumnName dName cName, cName)
DimVal dName cName | dimPrefix <> dName == dimTableName -> | DimVal dName cName <- factColumns fact , dimPrefix <> dName == dimTableName]
Just (dimColumnName dName cName, cName)
_ -> Nothing
dimensionTableInsertSQL :: Fact -> TableName -> Reader Env Text dimensionTablePopulateSQL :: TablePopulationMode -> Fact -> TableName -> Reader Env Text
dimensionTableInsertSQL fact dimTableName = do dimensionTablePopulateSQL popMode fact dimTableName = do
dimPrefix <- settingDimPrefix <$> asks envSettings dimPrefix <- settingDimPrefix <$> asks envSettings
let colMapping = dimColumnMapping dimPrefix fact dimTableName let colMapping = dimColumnMapping dimPrefix fact dimTableName
baseSelectC = "SELECT DISTINCT\n" <> colNamesString (map snd colMapping) <> "\n"
return $ "INSERT INTO " <> dimTableName <> " (\n" <> "FROM " <> factTableName fact
<> colNamesString (map fst colMapping) insertC selectC = "INSERT INTO " <> dimTableName
<> "\n) SELECT DISTINCT \n" <> " (\n" <> colNamesString (map fst colMapping) <> "\n) "
<> colNamesString (map snd colMapping) <> "SELECT x.* FROM (\n" <> selectC <> ") x"
<> "\nFROM " <> factTableName fact timeCol = head [ cName | DimTime cName <- factColumns fact ]
return $ case popMode of
FullPopulation -> insertC baseSelectC
IncrementalPopulation ->
insertC (baseSelectC <> "\nWHERE "
<> timeCol <> " > ? AND " <> timeCol <> " <= ?"
<> " AND (\n"
<> Text.intercalate "\nOR " [ c <> " IS NOT NULL" | (_, c) <- colMapping ]
<> "\n)")
<> "\nLEFT JOIN " <> dimTableName <> " ON\n"
<> Text.intercalate " \nAND "
[ fullColName dimTableName c1 <> " IS NOT DISTINCT FROM " <> fullColName "x" c2
| (c1, c2) <- colMapping ]
<> "\nWHERE " <> Text.intercalate " \nAND "
[ fullColName dimTableName c <> " IS NULL" | (c, _) <- colMapping ]
factTableInsertSQL :: Fact -> Reader Env Text factTableInsertSQL :: Fact -> Reader Env Text
factTableInsertSQL fact = do factTableInsertSQL fact = do
@ -131,7 +164,7 @@ factTableInsertSQL fact = do
[ fullColName tableName c1 <> " = " <> fullColName factSourceTableName c2 [ fullColName tableName c1 <> " = " <> fullColName factSourceTableName c2
| (c1, c2) <- dimColumnMapping settingDimPrefix dimFact tableName ] | (c1, c2) <- dimColumnMapping settingDimPrefix dimFact tableName ]
in "SELECT " <> dimIdColName <> " FROM " <> tableName <> "\nWHERE " in "SELECT " <> dimIdColName <> " FROM " <> tableName <> "\nWHERE "
<> (Text.concat . intersperse "\n AND " $ dimLookupWhereClauses) <> (Text.intercalate "\n AND " $ dimLookupWhereClauses)
in (colName, insertSQL, True) in (colName, insertSQL, True)
colMap = [ (cName, if addAs then asName cName sql else sql, addAs) colMap = [ (cName, if addAs then asName cName sql else sql, addAs)
@ -147,18 +180,17 @@ factTableInsertSQL fact = do
<> extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit <> extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit
<> " (\n" <> unlineCols (map fst3 colMap) <> "\n)" <> " (\n" <> unlineCols (map fst3 colMap) <> "\n)"
<> "\nSELECT \n" <> unlineCols (map snd3 colMap) <> "\nSELECT \n" <> unlineCols (map snd3 colMap)
<> "\nFROM " <> fTableName <> "\n" <> Text.concat (intersperse "\n" joinClauses) <> "\nFROM " <> fTableName <> "\n" <> Text.intercalate"\n" joinClauses
<> "\nGROUP BY \n" <> "\nGROUP BY \n"
<> unlineCols (map ((groupByColPrefix <>) . fst3) . filter thd3 $ colMap) <> unlineCols (map ((groupByColPrefix <>) . fst3) . filter thd3 $ colMap)
where where
groupByColPrefix = "xxff_" groupByColPrefix = "xxff_"
fullColName tName cName = tName <> "." <> cName
asName cName sql = "(" <> sql <> ")" <> " as " <> groupByColPrefix <> cName asName cName sql = "(" <> sql <> ")" <> " as " <> groupByColPrefix <> cName
unlineCols = Text.concat . intersperse ",\n " unlineCols = Text.intercalate ",\n "
joinClausePreds table oTableName = joinClausePreds table oTableName =
fmap (\(ForeignKey _ colPairs) -> fmap (\(ForeignKey _ colPairs) ->
Text.concat . intersperse " AND " Text.intercalate " AND "
. map (\(c1, c2) -> fullColName (tableName table) c1 <> " = " <> fullColName oTableName c2) . map (\(c1, c2) -> fullColName (tableName table) c1 <> " = " <> fullColName oTableName c2)
$ colPairs ) $ colPairs )
. find (\cons -> case cons of . find (\cons -> case cons of

View File

@ -93,9 +93,10 @@ defSettings = Settings
, settingFactInfix = "_by_" , settingFactInfix = "_by_"
} }
data ValidationError = MissingTable !TableName data ValidationError = MissingTable !TableName
| MissingFact !TableName | MissingFact !TableName
| MissingColumn !TableName !ColumnName | MissingColumn !TableName !ColumnName
| MissingTimeColumn !TableName
deriving (Eq, Show) deriving (Eq, Show)
data Env = Env data Env = Env
@ -103,3 +104,5 @@ data Env = Env
, envFacts :: ![Fact] , envFacts :: ![Fact]
, envSettings :: !Settings , envSettings :: !Settings
} deriving (Eq, Show) } deriving (Eq, Show)
data TablePopulationMode = FullPopulation | IncrementalPopulation deriving (Eq, Show)

View File

@ -1,7 +1,6 @@
module Ringo.Validator module Ringo.Validator
( validateTable ( validateTable
, validateFact , validateFact
, withFactValidation
) where ) where
#if MIN_VERSION_base(4,8,0) #if MIN_VERSION_base(4,8,0)
@ -40,10 +39,12 @@ validateFact Fact {..} = do
case findTable factTableName tables of case findTable factTableName tables of
Nothing -> return [ MissingTable factTableName ] Nothing -> return [ MissingTable factTableName ]
Just table -> do Just table -> do
tableVs <- validateTable table tableVs <- validateTable table
parentVs <- concat <$> mapM checkFactParents factParentNames parentVs <- concat <$> mapM checkFactParents factParentNames
let colVs = concatMap (checkColumn tables table) factColumns let colVs = concatMap (checkColumn tables table) factColumns
return $ tableVs ++ parentVs ++ colVs let timeVs = [ MissingTimeColumn factTableName
| null [ c | DimTime c <- factColumns ] ]
return $ tableVs ++ parentVs ++ colVs ++ timeVs
where where
checkFactParents fName = do checkFactParents fName = do
facts <- asks envFacts facts <- asks envFacts
@ -58,10 +59,3 @@ validateFact Fact {..} = do
checkColumnTable tables factCol = case factCol of checkColumnTable tables factCol = case factCol of
DimId tName _ -> maybe [ MissingTable tName ] (const []) $ findTable tName tables DimId tName _ -> maybe [ MissingTable tName ] (const []) $ findTable tName tables
_ -> [] _ -> []
withFactValidation :: Fact -> Reader Env a -> Reader Env (Either [ValidationError] a)
withFactValidation fact func = do
errors <- validateFact fact
if not $ null errors
then return $ Left errors
else fmap Right func