Adds coalescing to default values for fact table columns.

- Removes foreign key (FK) constraints from fact tables
- Adds some type coercions in SQL for faster processing
pull/1/head
Abhinav Sarkar 2015-12-28 18:09:02 +05:30
parent b994955399
commit 3978f33cd0
6 changed files with 72 additions and 69 deletions

View File

@ -28,7 +28,7 @@ library
mtl >=2.1 && <2.3 mtl >=2.1 && <2.3
ghc-options: -Wall -fno-warn-unused-do-bind -funbox-strict-fields -fno-warn-orphans -O2 ghc-options: -Wall -fno-warn-unused-do-bind -funbox-strict-fields -fno-warn-orphans -O2
default-extensions: OverloadedStrings, RecordWildCards, ScopedTypeVariables, BangPatterns, default-extensions: OverloadedStrings, RecordWildCards, ScopedTypeVariables, BangPatterns,
TupleSections, CPP TupleSections, CPP, NamedFieldPuns
default-language: Haskell2010 default-language: Haskell2010
executable ringo executable ringo
@ -49,7 +49,7 @@ executable ringo
ringo ringo
ghc-options: -Wall -fno-warn-unused-do-bind -funbox-strict-fields -fno-warn-orphans -O2 ghc-options: -Wall -fno-warn-unused-do-bind -funbox-strict-fields -fno-warn-orphans -O2
default-extensions: OverloadedStrings, RecordWildCards, ScopedTypeVariables, BangPatterns, default-extensions: OverloadedStrings, RecordWildCards, ScopedTypeVariables, BangPatterns,
TupleSections, CPP TupleSections, CPP, NamedFieldPuns
default-language: Haskell2010 default-language: Haskell2010
test-suite ringo-test test-suite ringo-test

View File

@ -30,7 +30,7 @@ extractFactTable fact = do
columns = concatFor (factColumns fact) $ \col -> case col of columns = concatFor (factColumns fact) $ \col -> case col of
DimTime cName -> DimTime cName ->
[ Column (timeUnitColumnName dimIdColName cName settingTimeUnit) "integer" NotNull ] [ Column (timeUnitColumnName dimIdColName cName settingTimeUnit) "bigint" NotNull ]
NoDimId cName -> [ fromJust . findColumn cName . tableColumns $ table] NoDimId cName -> [ fromJust . findColumn cName . tableColumns $ table]
FactCount _ cName -> [ Column cName countColType NotNull ] FactCount _ cName -> [ Column cName countColType NotNull ]
FactSum scName cName -> [ Column cName (sourceColumnType scName) NotNull ] FactSum scName cName -> [ Column cName (sourceColumnType scName) NotNull ]
@ -41,17 +41,13 @@ extractFactTable fact = do
FactCountDistinct _ cName -> [ Column cName "json" NotNull ] FactCountDistinct _ cName -> [ Column cName "json" NotNull ]
_ -> [] _ -> []
fks = for allDims $ \(fact', tab@Table {..}) -> fkCols = for allDims $ \(_, Table {..}) ->
let colName = factDimFKIdColumnName settingDimPrefix dimIdColName tableName let colName = factDimFKIdColumnName settingDimPrefix dimIdColName tableName
colType = idColTypeToFKIdColType settingDimTableIdColumnType colType = idColTypeToFKIdColType settingDimTableIdColumnType
colNullable = in Column colName colType NotNull
if tab `elem` tables || fact /= fact' || any ((== Null) . columnNullable) tableColumns
then Null
else NotNull
in ( Column colName colType colNullable , ForeignKey tableName [(colName, dimIdColName)] )
ukColNames = ukColNames =
(++ map (columnName . fst) fks) (++ map columnName fkCols)
. forMaybe (factColumns fact) $ \col -> case col of . forMaybe (factColumns fact) $ \col -> case col of
DimTime cName -> Just (timeUnitColumnName dimIdColName cName settingTimeUnit) DimTime cName -> Just (timeUnitColumnName dimIdColName cName settingTimeUnit)
NoDimId cName -> Just cName NoDimId cName -> Just cName
@ -60,8 +56,8 @@ extractFactTable fact = do
return Table return Table
{ tableName = { tableName =
extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit
, tableColumns = columns ++ map fst fks , tableColumns = columns ++ fkCols
, tableConstraints = UniqueKey ukColNames : map snd fks , tableConstraints = [ UniqueKey ukColNames ]
} }
extractDependencies :: Fact -> Reader Env Dependencies extractDependencies :: Fact -> Reader Env Dependencies

View File

@ -61,7 +61,10 @@ extractDimensionTables fact = do
}) })
. Map.toList . Map.toList
. Map.mapWithKey . Map.mapWithKey
(\dim -> map (\col -> col { columnName = dimColumnName dim (columnName col) }) . nub) (\dim -> map (\col -> col { columnName = dimColumnName dim (columnName col)
, columnNullable = NotNull
})
. nub)
. Map.fromListWith (flip (++)) . Map.fromListWith (flip (++))
. mapMaybe (\fcol -> do . mapMaybe (\fcol -> do
DimVal d col <- fcol DimVal d col <- fcol

View File

@ -13,10 +13,9 @@ import Control.Applicative ((<$>))
#endif #endif
import Control.Monad.Reader (Reader, asks) import Control.Monad.Reader (Reader, asks)
import Data.List (nub, find, subsequences, partition, sortBy) import Data.List (nub, find)
import Data.Maybe (fromJust, fromMaybe, mapMaybe, catMaybes) import Data.Maybe (fromJust, fromMaybe, mapMaybe)
import Data.Monoid ((<>)) import Data.Monoid ((<>))
import Data.Ord (comparing)
import Data.Text (Text) import Data.Text (Text)
import Ringo.Extractor.Internal import Ringo.Extractor.Internal
@ -31,8 +30,8 @@ columnDefnSQL :: Column -> Text
columnDefnSQL Column {..} = columnDefnSQL Column {..} =
columnName <> " " <> columnType <> " " <> nullableDefnSQL columnNullable columnName <> " " <> columnType <> " " <> nullableDefnSQL columnNullable
colNamesString :: [ColumnName] -> Text joinColumnNames :: [ColumnName] -> Text
colNamesString = Text.intercalate ", " joinColumnNames = Text.intercalate ",\n"
fullColName :: TableName -> ColumnName -> ColumnName fullColName :: TableName -> ColumnName -> ColumnName
fullColName tName cName = tName <> "." <> cName fullColName tName cName = tName <> "." <> cName
@ -43,34 +42,16 @@ constraintDefnSQL Table {..} constraint =
in case constraint of in case constraint of
PrimaryKey cName -> [ alterTableSQL <> "PRIMARY KEY (" <> cName <> ")" ] PrimaryKey cName -> [ alterTableSQL <> "PRIMARY KEY (" <> cName <> ")" ]
ForeignKey oTableName cNamePairs -> ForeignKey oTableName cNamePairs ->
[ alterTableSQL <> "FOREIGN KEY (" <> colNamesString (map fst cNamePairs) <> ") REFERENCES " [ alterTableSQL <> "FOREIGN KEY (" <> joinColumnNames (map fst cNamePairs) <> ") REFERENCES "
<> oTableName <> " (" <> colNamesString (map snd cNamePairs) <> ")" ] <> oTableName <> " (" <> joinColumnNames (map snd cNamePairs) <> ")" ]
UniqueKey cNames -> ["CREATE UNIQUE INDEX ON " <> tableName <> "(" <> colNamesString cNames <> ")"] UniqueKey cNames -> ["CREATE UNIQUE INDEX ON " <> tableName <> " (" <> joinColumnNames cNames <> ")"]
-- let
-- (notNullCols, nullCols) =
-- both (map columnName)
-- $ partition ((== NotNull) . columnNullable)
-- $ catMaybes [ findColumn cName tableColumns | cName <- cNames ]
-- combinations =
-- map (\cs -> (cs, [ c | c <- nullCols, c `notElem` cs ]))
-- . sortBy (comparing length)
-- $ subsequences nullCols
-- in [ "CREATE UNIQUE INDEX ON " <> tableName
-- <> " (" <> colNamesString (notNullCols ++ nnCols) <> ")"
-- <> if null whereClauses
-- then ""
-- else "\nWHERE "<> Text.intercalate "\nAND " whereClauses
-- | (nnCols, nCols) <- combinations
-- , not $ null (notNullCols ++ nnCols)
-- , let whereClauses =
-- [ c <> " IS NOT NULL" | c <- nnCols ] ++ [ c <> " IS NULL" | c <- nCols ] ]
tableDefnSQL :: Table -> [Text] tableDefnSQL :: Table -> [Text]
tableDefnSQL table@Table {..} = tableDefnSQL table@Table {..} =
tableSQL : concatMap (constraintDefnSQL table) tableConstraints tableSQL : concatMap (constraintDefnSQL table) tableConstraints
where where
tableSQL = "CREATE TABLE " <> tableName <> " (\n" tableSQL = "CREATE TABLE " <> tableName <> " (\n"
<> (Text.intercalate ",\n" . map columnDefnSQL $ tableColumns) <> (joinColumnNames . map columnDefnSQL $ tableColumns)
<> "\n)" <> "\n)"
factTableDefnSQL :: Fact -> Table -> Reader Env [Text] factTableDefnSQL :: Fact -> Table -> Reader Env [Text]
@ -96,12 +77,14 @@ dimColumnMapping dimPrefix fact dimTableName =
[ (dimColumnName dName cName, cName) [ (dimColumnName dName cName, cName)
| DimVal dName cName <- factColumns fact , dimPrefix <> dName == dimTableName] | DimVal dName cName <- factColumns fact , dimPrefix <> dName == dimTableName]
coalesceColumn :: Column -> Text coalesceColumn :: TableName -> Column -> Text
coalesceColumn Column{..} = coalesceColumn tName Column{..} =
if columnNullable == Null if columnNullable == Null
then "coalesce(" <> columnName <> "," <> defVal columnType <> ")" then "coalesce(" <> fqColName <> "," <> defVal columnType <> ")"
else columnName else fqColName
where where
fqColName = fullColName tName columnName
defVal colType defVal colType
| "integer" `Text.isPrefixOf` colType = "-42" | "integer" `Text.isPrefixOf` colType = "-42"
| "timestamp" `Text.isPrefixOf` colType = "'00-00-00 00:00:00'" | "timestamp" `Text.isPrefixOf` colType = "'00-00-00 00:00:00'"
@ -117,14 +100,15 @@ dimensionTablePopulateSQL popMode fact dimTableName = do
let factTable = fromJust $ findTable (factTableName fact) tables let factTable = fromJust $ findTable (factTableName fact) tables
colMapping = dimColumnMapping dimPrefix fact dimTableName colMapping = dimColumnMapping dimPrefix fact dimTableName
baseSelectC = "SELECT DISTINCT\n" baseSelectC = "SELECT DISTINCT\n"
<> colNamesString <> joinColumnNames
(map (\(_, c) -> (map (\(_, c) ->
coalesceColumn . fromJust . findColumn c $ (tableColumns factTable)) let col = fromJust . findColumn c $ tableColumns factTable
in coalesceColumn (factTableName fact) col)
colMapping) colMapping)
<> "\n" <> "\n"
<> "FROM " <> factTableName fact <> "FROM " <> factTableName fact
insertC selectC = "INSERT INTO " <> dimTableName insertC selectC = "INSERT INTO " <> dimTableName
<> " (\n" <> colNamesString (map fst colMapping) <> "\n) " <> " (\n" <> joinColumnNames (map fst colMapping) <> "\n) "
<> "SELECT x.* FROM (\n" <> selectC <> ") x" <> "SELECT x.* FROM (\n" <> selectC <> ") x"
timeCol = head [ cName | DimTime cName <- factColumns fact ] timeCol = head [ cName | DimTime cName <- factColumns fact ]
return $ case popMode of return $ case popMode of
@ -156,21 +140,23 @@ factTablePopulateSQL popMode fact = do
allDims <- extractAllDimensionTables fact allDims <- extractAllDimensionTables fact
tables <- asks envTables tables <- asks envTables
let fTableName = factTableName fact let fTableName = factTableName fact
table = fromJust . findTable fTableName $ tables fTable = fromJust . findTable fTableName $ tables
dimIdColName = settingDimTableIdColumnName dimIdColName = settingDimTableIdColumnName
tablePKColName = head [ cName | PrimaryKey cName <- tableConstraints table ] tablePKColName = head [ cName | PrimaryKey cName <- tableConstraints fTable ]
timeUnitColumnInsertSQL cName = timeUnitColumnInsertSQL cName =
let colName = timeUnitColumnName dimIdColName cName settingTimeUnit let colName = timeUnitColumnName dimIdColName cName settingTimeUnit
in ( colName in ( colName
, "floor(extract(epoch from " <> fullColName fTableName cName <> ")/" , "extract(epoch from " <> fullColName fTableName cName <> ")::bigint/"
<> Text.pack (show $ timeUnitToSeconds settingTimeUnit) <> ")" <> Text.pack (show $ timeUnitToSeconds settingTimeUnit)
, True , True
) )
factColMap = concatFor (factColumns fact) $ \col -> case col of factColMap = concatFor (factColumns fact) $ \col -> case col of
DimTime cName -> [ timeUnitColumnInsertSQL cName ] DimTime cName -> [ timeUnitColumnInsertSQL cName ]
NoDimId cName -> [ (cName, fullColName fTableName cName, True) ] NoDimId cName ->
let sCol = fromJust . findColumn cName $ tableColumns fTable
in [ (cName, coalesceColumn fTableName sCol, True) ]
FactCount scName cName -> FactCount scName cName ->
[ (cName, "count(" <> maybe "*" (fullColName fTableName) scName <> ")", False) ] [ (cName, "count(" <> maybe "*" (fullColName fTableName) scName <> ")", False) ]
FactSum scName cName -> FactSum scName cName ->
@ -188,24 +174,32 @@ factTablePopulateSQL popMode fact = do
FactCountDistinct _ cName -> [ (cName, "'{}'::json", False)] FactCountDistinct _ cName -> [ (cName, "'{}'::json", False)]
_ -> [] _ -> []
dimColMap = for allDims $ \(dimFact, factTable@Table {..}) -> dimColMap = for allDims $ \(dimFact, factTable@Table {tableName}) ->
let colName = factDimFKIdColumnName settingDimPrefix dimIdColName tableName let colName = factDimFKIdColumnName settingDimPrefix dimIdColName tableName
col = fromJust . findColumn colName $ tableColumns factSourceTable
factSourceTableName = factTableName dimFact factSourceTableName = factTableName dimFact
insertSQL = if factTable `elem` tables factSourceTable = fromJust . findTable factSourceTableName $ tables
then fullColName factSourceTableName colName insertSQL = if factTable `elem` tables -- existing dimension table
then (if columnNullable col == Null then coalesceFKId else id)
$ fullColName factSourceTableName colName
else let else let
dimLookupWhereClauses = dimLookupWhereClauses =
[ fullColName tableName c1 <> " = " <> fullColName factSourceTableName c2 [ fullColName tableName c1 <> " = " <> coalesceColumn factSourceTableName col2
| (c1, c2) <- dimColumnMapping settingDimPrefix dimFact tableName ] | (c1, c2) <- dimColumnMapping settingDimPrefix dimFact tableName
, let col2 = fromJust . findColumn c2 $ tableColumns factSourceTable ]
in "SELECT " <> dimIdColName <> " FROM " <> tableName <> "\nWHERE " in "SELECT " <> dimIdColName <> " FROM " <> tableName <> "\nWHERE "
<> Text.intercalate "\n AND " dimLookupWhereClauses <> Text.intercalate "\n AND " dimLookupWhereClauses
in (colName, insertSQL, True) insertSQL' = if factSourceTableName == fTableName
then insertSQL
else coalesceFKId insertSQL
in (colName, insertSQL', True)
colMap = [ (cName, (sql, groupByColPrefix <> cName), addAs) colMap = [ (cName, (sql, groupByColPrefix <> cName), addAs)
| (cName, sql, addAs) <- factColMap ++ dimColMap ] | (cName, sql, addAs) <- factColMap ++ dimColMap ]
joinClauses = joinClauses =
mapMaybe (\tName -> (\p -> "LEFT JOIN " <> tName <> "\nON "<> p) <$> joinClausePreds table tName) mapMaybe (\tName -> (\p -> "LEFT JOIN " <> tName <> "\nON "<> p) <$> joinClausePreds fTable tName)
. nub . nub
. map (factTableName . fst) . map (factTableName . fst)
$ allDims $ allDims
@ -260,14 +254,14 @@ factTablePopulateSQL popMode fact = do
in "UPDATE " <> extFactTableName in "UPDATE " <> extFactTableName
<> "\nSET " <> cName <> " = " <> fullColName "xyz" cName <> "\nSET " <> cName <> " = " <> fullColName "xyz" cName
<> "\nFROM (" <> "\nFROM ("
<> "\nSELECT " <> Text.intercalate ",\n" (origGroupByCols ++ [aggSelectClause]) <> "\nSELECT " <> joinColumnNames (origGroupByCols ++ [aggSelectClause])
<> "\nFROM (\n" <> selectSQL <> "\n) zyx" <> "\nFROM (\n" <> selectSQL <> "\n) zyx"
<> "\nGROUP BY \n" <> Text.intercalate ",\n" origGroupByCols <> "\nGROUP BY \n" <> joinColumnNames origGroupByCols
<> "\n) xyz" <> "\n) xyz"
<> "\n WHERE\n" <> "\n WHERE\n"
<> Text.intercalate "\nAND " <> Text.intercalate "\nAND "
[ coalesceFKId (fullColName extFactTableName .fromJust . Text.stripPrefix groupByColPrefix $ col) [ fullColName extFactTableName .fromJust . Text.stripPrefix groupByColPrefix $ col
<> " = " <> coalesceFKId (fullColName "xyz" col) <> " = " <> fullColName "xyz" col
| col <- origGroupByCols ] | col <- origGroupByCols ]
return $ insertIntoInsertSQL <> "\n" <> toSelectSQL insertIntoSelectSQL : return $ insertIntoInsertSQL <> "\n" <> toSelectSQL insertIntoSelectSQL :
@ -287,20 +281,23 @@ factTablePopulateSQL popMode fact = do
$ table $ table
toSelectSQL FactTablePopulateSelectSQL {..} = toSelectSQL FactTablePopulateSelectSQL {..} =
"SELECT \n" <> Text.intercalate ",\n " (map (uncurry asName) ftpsSelectCols) "SELECT \n" <> joinColumnNames (map (uncurry asName) ftpsSelectCols)
<> "\nFROM " <> ftpsSelectTable <> "\nFROM " <> ftpsSelectTable
<> (if not . null $ ftpsJoinClauses <> (if not . null $ ftpsJoinClauses
then "\n" <> Text.intercalate"\n" ftpsJoinClauses then "\n" <> Text.intercalate "\n" ftpsJoinClauses
else "") else "")
<> (if not . null $ ftpsWhereClauses <> (if not . null $ ftpsWhereClauses
then "\nWHERE " <> Text.intercalate "\nAND " ftpsWhereClauses then "\nWHERE " <> Text.intercalate "\nAND " ftpsWhereClauses
else "") else "")
<> "\nGROUP BY \n" <> "\nGROUP BY \n"
<> Text.intercalate ",\n " ftpsGroupByCols <> joinColumnNames ftpsGroupByCols
where where
asName sql alias = "(" <> sql <> ")" <> " as " <> alias asName sql alias = "(" <> sql <> ")" <> " as " <> alias
coalesceFKId col = "coalesce(" <> col <> ", -1)" coalesceFKId col =
if "coalesce" `Text.isPrefixOf` col
then col
else "coalesce((" <> col <> "), -1)"
bucketCount :: Double -> Integer bucketCount :: Double -> Integer
bucketCount errorRate = bucketCount errorRate =

View File

@ -105,6 +105,7 @@ data ValidationError = MissingTable !TableName
| MissingFact !TableName | MissingFact !TableName
| MissingColumn !TableName !ColumnName | MissingColumn !TableName !ColumnName
| MissingTimeColumn !TableName | MissingTimeColumn !TableName
| NullableColumn !TableName !ColumnName
deriving (Eq, Show) deriving (Eq, Show)
data Env = Env data Env = Env

View File

@ -10,6 +10,7 @@ import Control.Applicative ((<$>))
import Control.Monad.Reader (Reader, asks) import Control.Monad.Reader (Reader, asks)
import Data.Maybe (isJust, fromJust)
import Ringo.Types import Ringo.Types
import Ringo.Utils import Ringo.Utils
@ -44,7 +45,12 @@ validateFact Fact {..} = do
let colVs = concatMap (checkColumn tables table) factColumns let colVs = concatMap (checkColumn tables table) factColumns
let timeVs = [ MissingTimeColumn factTableName let timeVs = [ MissingTimeColumn factTableName
| null [ c | DimTime c <- factColumns ] ] | null [ c | DimTime c <- factColumns ] ]
return $ tableVs ++ parentVs ++ colVs ++ timeVs let notNullVs = [ NullableColumn factTableName c
| DimTime c <- factColumns
, let col = findColumn c (tableColumns table)
, isJust col
, columnNullable (fromJust col) == Null ]
return $ tableVs ++ parentVs ++ colVs ++ timeVs ++ notNullVs
where where
checkFactParents fName = do checkFactParents fName = do
facts <- asks envFacts facts <- asks envFacts