From 3978f33cd03488b8a118ea07ee6a5ebd69654ead Mon Sep 17 00:00:00 2001 From: Abhinav Sarkar Date: Mon, 28 Dec 2015 18:09:02 +0530 Subject: [PATCH] Adds coalescing to default values for fact table columns. - Removes FKs from fact tables - Some coercions in SQL for faster processing --- ringo.cabal | 4 +- src/Ringo/Extractor.hs | 16 ++--- src/Ringo/Extractor/Internal.hs | 5 +- src/Ringo/Generator.hs | 107 ++++++++++++++++---------------- src/Ringo/Types.hs | 1 + src/Ringo/Validator.hs | 8 ++- 6 files changed, 72 insertions(+), 69 deletions(-) diff --git a/ringo.cabal b/ringo.cabal index e7e52d2..5848220 100644 --- a/ringo.cabal +++ b/ringo.cabal @@ -28,7 +28,7 @@ library mtl >=2.1 && <2.3 ghc-options: -Wall -fno-warn-unused-do-bind -funbox-strict-fields -fno-warn-orphans -O2 default-extensions: OverloadedStrings, RecordWildCards, ScopedTypeVariables, BangPatterns, - TupleSections, CPP + TupleSections, CPP, NamedFieldPuns default-language: Haskell2010 executable ringo @@ -49,7 +49,7 @@ executable ringo ringo ghc-options: -Wall -fno-warn-unused-do-bind -funbox-strict-fields -fno-warn-orphans -O2 default-extensions: OverloadedStrings, RecordWildCards, ScopedTypeVariables, BangPatterns, - TupleSections, CPP + TupleSections, CPP, NamedFieldPuns default-language: Haskell2010 test-suite ringo-test diff --git a/src/Ringo/Extractor.hs b/src/Ringo/Extractor.hs index 21cdf3e..82b3c45 100644 --- a/src/Ringo/Extractor.hs +++ b/src/Ringo/Extractor.hs @@ -30,7 +30,7 @@ extractFactTable fact = do columns = concatFor (factColumns fact) $ \col -> case col of DimTime cName -> - [ Column (timeUnitColumnName dimIdColName cName settingTimeUnit) "integer" NotNull ] + [ Column (timeUnitColumnName dimIdColName cName settingTimeUnit) "bigint" NotNull ] NoDimId cName -> [ fromJust . findColumn cName . tableColumns $ table] FactCount _ cName -> [ Column cName countColType NotNull ] FactSum scName cName -> [ Column cName (sourceColumnType scName) NotNull ] @@ -41,17 +41,13 @@ extractFactTable fact = do FactCountDistinct _ cName -> [ Column cName "json" NotNull ] _ -> [] - fks = for allDims $ \(fact', tab@Table {..}) -> + fkCols = for allDims $ \(_, Table {..}) -> let colName = factDimFKIdColumnName settingDimPrefix dimIdColName tableName colType = idColTypeToFKIdColType settingDimTableIdColumnType - colNullable = - if tab `elem` tables || fact /= fact' || any ((== Null) . columnNullable) tableColumns - then Null - else NotNull - in ( Column colName colType colNullable , ForeignKey tableName [(colName, dimIdColName)] ) + in Column colName colType NotNull ukColNames = - (++ map (columnName . fst) fks) + (++ map columnName fkCols) . forMaybe (factColumns fact) $ \col -> case col of DimTime cName -> Just (timeUnitColumnName dimIdColName cName settingTimeUnit) NoDimId cName -> Just cName @@ -60,8 +56,8 @@ extractFactTable fact = do return Table { tableName = extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit - , tableColumns = columns ++ map fst fks - , tableConstraints = UniqueKey ukColNames : map snd fks + , tableColumns = columns ++ fkCols + , tableConstraints = [ UniqueKey ukColNames ] } extractDependencies :: Fact -> Reader Env Dependencies diff --git a/src/Ringo/Extractor/Internal.hs b/src/Ringo/Extractor/Internal.hs index 7d0135a..eba45d5 100644 --- a/src/Ringo/Extractor/Internal.hs +++ b/src/Ringo/Extractor/Internal.hs @@ -61,7 +61,10 @@ extractDimensionTables fact = do }) . Map.toList . Map.mapWithKey - (\dim -> map (\col -> col { columnName = dimColumnName dim (columnName col) }) . nub) + (\dim -> map (\col -> col { columnName = dimColumnName dim (columnName col) + , columnNullable = NotNull + }) + . nub) . Map.fromListWith (flip (++)) . mapMaybe (\fcol -> do DimVal d col <- fcol diff --git a/src/Ringo/Generator.hs b/src/Ringo/Generator.hs index 4883b05..f20900c 100644 --- a/src/Ringo/Generator.hs +++ b/src/Ringo/Generator.hs @@ -13,10 +13,9 @@ import Control.Applicative ((<$>)) #endif import Control.Monad.Reader (Reader, asks) -import Data.List (nub, find, subsequences, partition, sortBy) -import Data.Maybe (fromJust, fromMaybe, mapMaybe, catMaybes) +import Data.List (nub, find) +import Data.Maybe (fromJust, fromMaybe, mapMaybe) import Data.Monoid ((<>)) -import Data.Ord (comparing) import Data.Text (Text) import Ringo.Extractor.Internal @@ -31,8 +30,8 @@ columnDefnSQL :: Column -> Text columnDefnSQL Column {..} = columnName <> " " <> columnType <> " " <> nullableDefnSQL columnNullable -colNamesString :: [ColumnName] -> Text -colNamesString = Text.intercalate ", " +joinColumnNames :: [ColumnName] -> Text +joinColumnNames = Text.intercalate ",\n" fullColName :: TableName -> ColumnName -> ColumnName fullColName tName cName = tName <> "." <> cName @@ -43,34 +42,16 @@ constraintDefnSQL Table {..} constraint = in case constraint of PrimaryKey cName -> [ alterTableSQL <> "PRIMARY KEY (" <> cName <> ")" ] ForeignKey oTableName cNamePairs -> - [ alterTableSQL <> "FOREIGN KEY (" <> colNamesString (map fst cNamePairs) <> ") REFERENCES " - <> oTableName <> " (" <> colNamesString (map snd cNamePairs) <> ")" ] - UniqueKey cNames -> ["CREATE UNIQUE INDEX ON " <> tableName <> "(" <> colNamesString cNames <> ")"] - -- let - -- (notNullCols, nullCols) = - -- both (map columnName) - -- $ partition ((== NotNull) . columnNullable) - -- $ catMaybes [ findColumn cName tableColumns | cName <- cNames ] - -- combinations = - -- map (\cs -> (cs, [ c | c <- nullCols, c `notElem` cs ])) - -- . sortBy (comparing length) - -- $ subsequences nullCols - -- in [ "CREATE UNIQUE INDEX ON " <> tableName - -- <> " (" <> colNamesString (notNullCols ++ nnCols) <> ")" - -- <> if null whereClauses - -- then "" - -- else "\nWHERE "<> Text.intercalate "\nAND " whereClauses - -- | (nnCols, nCols) <- combinations - -- , not $ null (notNullCols ++ nnCols) - -- , let whereClauses = - -- [ c <> " IS NOT NULL" | c <- nnCols ] ++ [ c <> " IS NULL" | c <- nCols ] ] + [ alterTableSQL <> "FOREIGN KEY (" <> joinColumnNames (map fst cNamePairs) <> ") REFERENCES " + <> oTableName <> " (" <> joinColumnNames (map snd cNamePairs) <> ")" ] + UniqueKey cNames -> ["CREATE UNIQUE INDEX ON " <> tableName <> " (" <> joinColumnNames cNames <> ")"] tableDefnSQL :: Table -> [Text] tableDefnSQL table@Table {..} = tableSQL : concatMap (constraintDefnSQL table) tableConstraints where tableSQL = "CREATE TABLE " <> tableName <> " (\n" - <> (Text.intercalate ",\n" . map columnDefnSQL $ tableColumns) + <> (joinColumnNames . map columnDefnSQL $ tableColumns) <> "\n)" factTableDefnSQL :: Fact -> Table -> Reader Env [Text] @@ -96,12 +77,14 @@ dimColumnMapping dimPrefix fact dimTableName = [ (dimColumnName dName cName, cName) | DimVal dName cName <- factColumns fact , dimPrefix <> dName == dimTableName] -coalesceColumn :: Column -> Text -coalesceColumn Column{..} = +coalesceColumn :: TableName -> Column -> Text +coalesceColumn tName Column{..} = if columnNullable == Null - then "coalesce(" <> columnName <> "," <> defVal columnType <> ")" - else columnName + then "coalesce(" <> fqColName <> "," <> defVal columnType <> ")" + else fqColName where + fqColName = fullColName tName columnName + defVal colType | "integer" `Text.isPrefixOf` colType = "-42" | "timestamp" `Text.isPrefixOf` colType = "'00-00-00 00:00:00'" @@ -117,14 +100,15 @@ dimensionTablePopulateSQL popMode fact dimTableName = do let factTable = fromJust $ findTable (factTableName fact) tables colMapping = dimColumnMapping dimPrefix fact dimTableName baseSelectC = "SELECT DISTINCT\n" - <> colNamesString + <> joinColumnNames (map (\(_, c) -> - coalesceColumn . fromJust . findColumn c $ (tableColumns factTable)) + let col = fromJust . findColumn c $ tableColumns factTable + in coalesceColumn (factTableName fact) col) colMapping) <> "\n" <> "FROM " <> factTableName fact insertC selectC = "INSERT INTO " <> dimTableName - <> " (\n" <> colNamesString (map fst colMapping) <> "\n) " + <> " (\n" <> joinColumnNames (map fst colMapping) <> "\n) " <> "SELECT x.* FROM (\n" <> selectC <> ") x" timeCol = head [ cName | DimTime cName <- factColumns fact ] return $ case popMode of @@ -156,21 +140,23 @@ factTablePopulateSQL popMode fact = do allDims <- extractAllDimensionTables fact tables <- asks envTables let fTableName = factTableName fact - table = fromJust . findTable fTableName $ tables + fTable = fromJust . findTable fTableName $ tables dimIdColName = settingDimTableIdColumnName - tablePKColName = head [ cName | PrimaryKey cName <- tableConstraints table ] + tablePKColName = head [ cName | PrimaryKey cName <- tableConstraints fTable ] timeUnitColumnInsertSQL cName = let colName = timeUnitColumnName dimIdColName cName settingTimeUnit in ( colName - , "floor(extract(epoch from " <> fullColName fTableName cName <> ")/" - <> Text.pack (show $ timeUnitToSeconds settingTimeUnit) <> ")" + , "extract(epoch from " <> fullColName fTableName cName <> ")::bigint/" + <> Text.pack (show $ timeUnitToSeconds settingTimeUnit) , True ) factColMap = concatFor (factColumns fact) $ \col -> case col of DimTime cName -> [ timeUnitColumnInsertSQL cName ] - NoDimId cName -> [ (cName, fullColName fTableName cName, True) ] + NoDimId cName -> + let sCol = fromJust . findColumn cName $ tableColumns fTable + in [ (cName, coalesceColumn fTableName sCol, True) ] FactCount scName cName -> [ (cName, "count(" <> maybe "*" (fullColName fTableName) scName <> ")", False) ] FactSum scName cName -> @@ -188,24 +174,32 @@ factTablePopulateSQL popMode fact = do FactCountDistinct _ cName -> [ (cName, "'{}'::json", False)] _ -> [] - dimColMap = for allDims $ \(dimFact, factTable@Table {..}) -> + dimColMap = for allDims $ \(dimFact, factTable@Table {tableName}) -> let colName = factDimFKIdColumnName settingDimPrefix dimIdColName tableName + col = fromJust . findColumn colName $ tableColumns factSourceTable factSourceTableName = factTableName dimFact - insertSQL = if factTable `elem` tables - then fullColName factSourceTableName colName + factSourceTable = fromJust . findTable factSourceTableName $ tables + insertSQL = if factTable `elem` tables -- existing dimension table + then (if columnNullable col == Null then coalesceFKId else id) + $ fullColName factSourceTableName colName else let dimLookupWhereClauses = - [ fullColName tableName c1 <> " = " <> fullColName factSourceTableName c2 - | (c1, c2) <- dimColumnMapping settingDimPrefix dimFact tableName ] + [ fullColName tableName c1 <> " = " <> coalesceColumn factSourceTableName col2 + | (c1, c2) <- dimColumnMapping settingDimPrefix dimFact tableName + , let col2 = fromJust . findColumn c2 $ tableColumns factSourceTable ] in "SELECT " <> dimIdColName <> " FROM " <> tableName <> "\nWHERE " - <> Text.intercalate "\n AND " dimLookupWhereClauses - in (colName, insertSQL, True) + <> Text.intercalate "\n AND " dimLookupWhereClauses + insertSQL' = if factSourceTableName == fTableName + then insertSQL + else coalesceFKId insertSQL + + in (colName, insertSQL', True) colMap = [ (cName, (sql, groupByColPrefix <> cName), addAs) | (cName, sql, addAs) <- factColMap ++ dimColMap ] joinClauses = - mapMaybe (\tName -> (\p -> "LEFT JOIN " <> tName <> "\nON "<> p) <$> joinClausePreds table tName) + mapMaybe (\tName -> (\p -> "LEFT JOIN " <> tName <> "\nON "<> p) <$> joinClausePreds fTable tName) . nub . map (factTableName . fst) $ allDims @@ -260,14 +254,14 @@ factTablePopulateSQL popMode fact = do in "UPDATE " <> extFactTableName <> "\nSET " <> cName <> " = " <> fullColName "xyz" cName <> "\nFROM (" - <> "\nSELECT " <> Text.intercalate ",\n" (origGroupByCols ++ [aggSelectClause]) + <> "\nSELECT " <> joinColumnNames (origGroupByCols ++ [aggSelectClause]) <> "\nFROM (\n" <> selectSQL <> "\n) zyx" - <> "\nGROUP BY \n" <> Text.intercalate ",\n" origGroupByCols + <> "\nGROUP BY \n" <> joinColumnNames origGroupByCols <> "\n) xyz" <> "\n WHERE\n" <> Text.intercalate "\nAND " - [ coalesceFKId (fullColName extFactTableName .fromJust . Text.stripPrefix groupByColPrefix $ col) - <> " = " <> coalesceFKId (fullColName "xyz" col) + [ fullColName extFactTableName .fromJust . Text.stripPrefix groupByColPrefix $ col + <> " = " <> fullColName "xyz" col | col <- origGroupByCols ] return $ insertIntoInsertSQL <> "\n" <> toSelectSQL insertIntoSelectSQL : @@ -287,20 +281,23 @@ factTablePopulateSQL popMode fact = do $ table toSelectSQL FactTablePopulateSelectSQL {..} = - "SELECT \n" <> Text.intercalate ",\n " (map (uncurry asName) ftpsSelectCols) + "SELECT \n" <> joinColumnNames (map (uncurry asName) ftpsSelectCols) <> "\nFROM " <> ftpsSelectTable <> (if not . null $ ftpsJoinClauses - then "\n" <> Text.intercalate"\n" ftpsJoinClauses + then "\n" <> Text.intercalate "\n" ftpsJoinClauses else "") <> (if not . null $ ftpsWhereClauses then "\nWHERE " <> Text.intercalate "\nAND " ftpsWhereClauses else "") <> "\nGROUP BY \n" - <> Text.intercalate ",\n " ftpsGroupByCols + <> joinColumnNames ftpsGroupByCols where asName sql alias = "(" <> sql <> ")" <> " as " <> alias - coalesceFKId col = "coalesce(" <> col <> ", -1)" + coalesceFKId col = + if "coalesce" `Text.isPrefixOf` col + then col + else "coalesce((" <> col <> "), -1)" bucketCount :: Double -> Integer bucketCount errorRate = diff --git a/src/Ringo/Types.hs b/src/Ringo/Types.hs index 7fa04e1..ad9f64f 100644 --- a/src/Ringo/Types.hs +++ b/src/Ringo/Types.hs @@ -105,6 +105,7 @@ data ValidationError = MissingTable !TableName | MissingFact !TableName | MissingColumn !TableName !ColumnName | MissingTimeColumn !TableName + | NullableColumn !TableName !ColumnName deriving (Eq, Show) data Env = Env diff --git a/src/Ringo/Validator.hs b/src/Ringo/Validator.hs index 14448ab..5f65125 100644 --- a/src/Ringo/Validator.hs +++ b/src/Ringo/Validator.hs @@ -10,6 +10,7 @@ import Control.Applicative ((<$>)) import Control.Monad.Reader (Reader, asks) +import Data.Maybe (isJust, fromJust) import Ringo.Types import Ringo.Utils @@ -44,7 +45,12 @@ validateFact Fact {..} = do let colVs = concatMap (checkColumn tables table) factColumns let timeVs = [ MissingTimeColumn factTableName | null [ c | DimTime c <- factColumns ] ] - return $ tableVs ++ parentVs ++ colVs ++ timeVs + let notNullVs = [ NullableColumn factTableName c + | DimTime c <- factColumns + , let col = findColumn c (tableColumns table) + , isJust col + , columnNullable (fromJust col) == Null ] + return $ tableVs ++ parentVs ++ colVs ++ timeVs ++ notNullVs where checkFactParents fName = do facts <- asks envFacts