Refactors generator populate fact module to simplify the code.

- Extracts smaller functions out of big ones
- Moves count distinct update related code to a separate module
master
Abhinav Sarkar 6 years ago
parent d27145b553
commit 18dde15427
No known key found for this signature in database
GPG Key ID: 7C9166A6F5465AD5
  1. 1
      ringo/ringo.cabal
  2. 294
      ringo/src/Ringo/Generator/Populate/Fact.hs
  3. 100
      ringo/src/Ringo/Generator/Populate/Fact/CountDistinct.hs

@ -26,6 +26,7 @@ library
Ringo.Generator.Create,
Ringo.Generator.Populate.Dimension,
Ringo.Generator.Populate.Fact,
Ringo.Generator.Populate.Fact.CountDistinct,
Ringo.Types.Internal,
Ringo.Utils
build-depends: base >=4.7 && <5,

@ -9,16 +9,17 @@
module Ringo.Generator.Populate.Fact
( factTablePopulationSQL
, factTablePopulationStatements
, ilog2FunctionString
) where
import qualified Data.Text as Text
import Prelude.Compat
import Control.Monad.Reader (Reader, asks)
import Database.HsSqlPpp.Syntax ( QueryExpr(..), Statement, makeSelect
, SelectList(..), SelectItem(..), JoinType(..) )
import Database.HsSqlPpp.Syntax ( QueryExpr(..), ScalarExpr, Statement, makeSelect, NameComponent
, JoinType(..) )
import Data.List (nub)
import Data.Maybe (fromJust, fromMaybe, listToMaybe)
import Data.Maybe (fromJust, listToMaybe)
import Data.Monoid ((<>))
import Data.Text (Text)
import Text.RawString.QQ (r)
@ -26,6 +27,7 @@ import Text.RawString.QQ (r)
import Ringo.Extractor.Internal
import Ringo.Generator.Internal
import Ringo.Generator.Sql
import Ringo.Generator.Populate.Fact.CountDistinct
import Ringo.Types.Internal
import Ringo.Utils
@ -55,181 +57,145 @@ $$
LANGUAGE 'plpgsql' IMMUTABLE;
|]
factCountDistinctUpdateStatements :: TablePopulationMode -> Fact -> Text -> QueryExpr -> Reader Env [Statement]
factCountDistinctUpdateStatements popMode fact groupByColPrefix expr = case expr of
Select {selSelectList = SelectList _ origSelectItems, ..} -> do
Settings {..} <- asks envSettings
tables <- asks envTables
let fTableName = factTableName fact
fTable = fromJust . findTable fTableName $ tables
tablePKColName = head [ cName | PrimaryKey cName <- tableConstraints fTable ]
extFactTableName =
suffixTableName popMode settingTableNameSuffixTemplate
$ extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit
return $ forMaybe (factColumns fact) $ \FactColumn {factColTargetColumn = cName, ..} ->
case factColType of
FactCountDistinct {factColMaybeSourceColumn = scName} ->
let unqCol = cast (eqi fTableName (fromMaybe tablePKColName scName)) "text"
bucketSelectCols =
[ sia (binop "&" (app "hashtext" [ unqCol ])
(num . Text.pack . show $ bucketCount settingFactCountDistinctErrorRate - 1))
(nmc $ cName <> "_bnum")
, sia (binop "-"
(num "31")
(app "ilog2"
[ app "min" [ binop "&"
(app "hashtext" [ unqCol ])
(prefop "~" (parens (binop "<<" (num "1") (num "31"))))]]))
(nmc $ cName <> "_bhash")
]
groupByCols = map ppScalarExpr selGroupBy
selectList =
[ i | i@(SelectItem _ _ a) <- origSelectItems , a `elem` map nmc groupByCols ]
selectStmt =
makeSelect
{ selSelectList = sl $ selectList ++ bucketSelectCols
, selTref = selTref
, selWhere = binop "and" (postop "isnotnull" unqCol) <$> selWhere
, selGroupBy = selGroupBy ++ [ ei $ cName <> "_bnum" ]
}
aggSelectClause =
sia (app "json_object_agg" [ ei (cName <> "_bnum"), ei (cName <> "_bhash") ]) (nmc cName)
in Just $ update extFactTableName
[ (cName, eqi "xyz" cName) ]
[ subtrefa "xyz"
makeSelect
{ selSelectList = sl $ map (si . ei) groupByCols ++ [ aggSelectClause ]
, selTref = [ subtrefa "zyx" selectStmt ]
, selGroupBy = selGroupBy
} ] $
foldBinop "and"
[ binop "=" (eqi extFactTableName . fromJust . Text.stripPrefix groupByColPrefix $ col)
(eqi "xyz" col)
| col <- groupByCols ]
_ -> Nothing
_ -> return []
where
bucketCount :: Double -> Integer
bucketCount errorRate =
let power :: Double = fromIntegral (ceiling . logBase 2 $ (1.04 / errorRate) ** 2 :: Integer)
in ceiling $ 2 ** power
factTablePopulationSQL :: TablePopulationMode -> Fact -> Reader Env [Text]
factTablePopulationSQL popMode fact = do
stmts <- factTablePopulationStatements popMode fact
return $ case stmts of
[] -> []
[i] -> [ ppStatement i ]
i:us -> [ ppStatement i, ilog2FunctionString ] ++ map ppStatement us
factTablePopulationStatements :: TablePopulationMode -> Fact -> Reader Env [Statement]
factTablePopulationStatements popMode fact = do
allDims <- extractAllDimensionTables fact
Settings {..} <- asks envSettings
tables <- asks envTables
defaults <- asks envTypeDefaults
let fTableName = factTableName fact
fTable = fromJust . findTable fTableName $ tables
dimIdColName = settingDimTableIdColumnName
coalesceFKId ex =
app "coalesce" [ ex, num . Text.pack . show $ settingForeignKeyIdCoalesceValue ]
timeUnitColumnInsertSQL cName =
let colName = timeUnitColumnName dimIdColName cName settingTimeUnit
in ( colName
, cast (app "floor" [ binop "/" (extEpoch (eqi fTableName cName))
(num . Text.pack . show . timeUnitToSeconds $ settingTimeUnit) ])
"bigint"
, True
)
dimIdColumnInsertSQL cName =
let sCol = fromJust . findColumn cName $ tableColumns fTable
in (cName, coalesceColumn defaults fTableName sCol, True)
Settings {..} <- asks envSettings
allDims <- extractAllDimensionTables fact
selExprs <- selectExprs popMode fact allDims groupByColPrefix
popQueryExpr <- populateQueryExpr popMode fact allDims selExprs groupByColPrefix
let extFactTableName = suffixTableName popMode settingTableNameSuffixTemplate
$ extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit
insertIntoStmt = insert extFactTableName (map fst3 selExprs) popQueryExpr
updateStmts <- factCountDistinctUpdateStatements popMode fact groupByColPrefix popQueryExpr
return $ insertIntoStmt : updateStmts
where
groupByColPrefix = "xxff_"
selectExprs :: TablePopulationMode
-> Fact
-> [(Fact, Table)]
-> Text
-> Reader Env [(ColumnName, (ScalarExpr, NameComponent), Bool)]
selectExprs popMode fact allDims groupByColPrefix = do
factSelExprs <- factColumnSelectExprs fact
dimSelExprs <- dimColumnSelectExprs popMode allDims
return [ (cName, (expr, nmc $ groupByColPrefix <> cName), addToGroupBy)
| (cName, expr, addToGroupBy) <- factSelExprs ++ dimSelExprs ]
factColumnSelectExprs :: Fact -> Reader Env [(ColumnName, ScalarExpr, Bool)]
factColumnSelectExprs fact = do
Settings {..} <- asks envSettings
tables <- asks envTables
typeDefaults <- asks envTypeDefaults
let fTableName = factTableName fact
fTable = fromJust . findTable fTableName $ tables
dimIdColName = settingDimTableIdColumnName
app' f cName = app f [ eqi fTableName cName ]
factColMap = concatFor (factColumns fact) $ \FactColumn {factColTargetColumn = cName, ..} ->
case factColType of
DimTime -> [ timeUnitColumnInsertSQL cName ]
NoDimId -> [ dimIdColumnInsertSQL cName ]
TenantId -> [ dimIdColumnInsertSQL cName ]
FactCount {..} ->
[ (cName, app "count" [ maybe star (eqi fTableName) factColMaybeSourceColumn ], False) ]
FactCountDistinct {..} -> [ (cName, cast (str "{}") "json", False) ]
FactSum {..} -> [ (cName, app' "sum" factColSourceColumn, False) ]
FactMax {..} -> [ (cName, app' "max" factColSourceColumn, False) ]
FactMin {..} -> [ (cName, app' "min" factColSourceColumn, False) ]
FactAverage {..} ->
[ ( cName <> settingAvgCountColumnSuffix, app' "count" factColSourceColumn, False )
, ( cName <> settingAvgSumColumnSuffix , app' "sum" factColSourceColumn , False)
]
_ -> []
dimColMap = for allDims $ \(dimFact, factTable@Table {tableName}) -> let
dimFKIdColName =
factDimFKIdColumnName settingDimPrefix dimIdColName dimFact factTable tables
factSourceTableName = factTableName dimFact
factSourceTable = fromJust . findTable factSourceTableName $ tables
dimFKIdColumn = fromJust . findColumn dimFKIdColName $ tableColumns factSourceTable
dimLookupWhereClauses = Just . foldBinop "and" $
[ binop "=" (eqi tableName dimColName) (coalesceColumn defaults factSourceTableName sourceCol)
| (dimColName, sourceColName) <- dimColumnMapping settingDimPrefix dimFact tableName
, let sourceCol = fromJust . findColumn sourceColName $ tableColumns factSourceTable ]
insertExpr = if factTable `elem` tables -- existing dimension table
then (if columnNullable dimFKIdColumn == Null then coalesceFKId else id)
$ eqi factSourceTableName dimFKIdColName
else coalesceFKId . subQueryExp $
makeSelect
{ selSelectList = sl [ si $ ei dimIdColName ]
, selTref =
[ trefa (suffixTableName popMode settingTableNameSuffixTemplate tableName) tableName ]
, selWhere = dimLookupWhereClauses
}
in (dimFKIdColName, insertExpr, True)
colMap = [ (cName, (expr, nmc $ groupByColPrefix <> cName), addToGroupBy)
| (cName, expr, addToGroupBy) <- factColMap ++ dimColMap ]
joinClauses =
return $ concatFor (factColumns fact) $ \FactColumn {factColTargetColumn = cName, ..} ->
case factColType of
DimTime -> [ timeUnitColumnSelectExpr fTableName dimIdColName settingTimeUnit cName ]
NoDimId -> [ dimIdColumnSelectExpr fTableName fTable typeDefaults cName ]
TenantId -> [ dimIdColumnSelectExpr fTableName fTable typeDefaults cName ]
FactCount {..} ->
[ (cName, app "count" [ maybe star (eqi fTableName) factColMaybeSourceColumn ], False) ]
FactCountDistinct {..} -> [ (cName, cast (str "{}") "json", False) ]
FactSum {..} -> [ (cName, app' "sum" factColSourceColumn, False) ]
FactMax {..} -> [ (cName, app' "max" factColSourceColumn, False) ]
FactMin {..} -> [ (cName, app' "min" factColSourceColumn, False) ]
FactAverage {..} ->
[ ( cName <> settingAvgCountColumnSuffix, app' "count" factColSourceColumn, False )
, ( cName <> settingAvgSumColumnSuffix , app' "sum" factColSourceColumn , False)
]
_ -> []
timeUnitColumnSelectExpr :: TableName -> ColumnName -> TimeUnit -> ColumnName -> (ColumnName, ScalarExpr, Bool)
timeUnitColumnSelectExpr fTableName dimIdColName settingTimeUnit cName =
let colName = timeUnitColumnName dimIdColName cName settingTimeUnit
in ( colName
, cast (app "floor" [ binop "/" (extEpoch (eqi fTableName cName))
(num . Text.pack . show . timeUnitToSeconds $ settingTimeUnit) ])
"bigint"
, True
)
dimIdColumnSelectExpr :: TableName -> Table -> TypeDefaults -> ColumnName -> (ColumnName, ScalarExpr, Bool)
dimIdColumnSelectExpr fTableName fTable typeDefaults cName =
let sCol = fromJust . findColumn cName $ tableColumns fTable
in (cName, coalesceColumn typeDefaults fTableName sCol, True)
dimColumnSelectExprs :: TablePopulationMode -> [(Fact, Table)] -> Reader Env [(ColumnName, ScalarExpr, Bool)]
dimColumnSelectExprs popMode allDims = do
settings@Settings {..} <- asks envSettings
tables <- asks envTables
typeDefaults <- asks envTypeDefaults
let dimIdColName = settingDimTableIdColumnName
return $ for allDims $ \(dimFact, factTable@Table {tableName}) -> let
dimFKIdColName =
factDimFKIdColumnName settingDimPrefix dimIdColName dimFact factTable tables
factSourceTableName = factTableName dimFact
factSourceTable = fromJust . findTable factSourceTableName $ tables
dimFKIdColumn = fromJust . findColumn dimFKIdColName $ tableColumns factSourceTable
dimLookupWhereClauses = Just . foldBinop "and" $
[ binop "=" (eqi tableName dimColName) (coalesceColumn typeDefaults factSourceTableName sourceCol)
| (dimColName, sourceColName) <- dimColumnMapping settingDimPrefix dimFact tableName
, let sourceCol = fromJust . findColumn sourceColName $ tableColumns factSourceTable ]
insertExpr = if factTable `elem` tables -- existing dimension table
then (if columnNullable dimFKIdColumn == Null then coalesceFKId settings else id)
$ eqi factSourceTableName dimFKIdColName
else coalesceFKId settings . subQueryExp $
makeSelect
{ selSelectList = sl [ si $ ei dimIdColName ]
, selTref =
[ trefa (suffixTableName popMode settingTableNameSuffixTemplate tableName) tableName ]
, selWhere = dimLookupWhereClauses
}
in (dimFKIdColName, insertExpr, True)
where
coalesceFKId Settings {..} ex =
app "coalesce" [ ex, num . Text.pack . show $ settingForeignKeyIdCoalesceValue ]
populateQueryExpr :: TablePopulationMode
-> Fact
-> [(Fact, Table)]
-> [(ColumnName, (ScalarExpr, NameComponent), Bool)]
-> Text
-> Reader Env QueryExpr
populateQueryExpr popMode fact allDims selExprs groupByColPrefix = do
Settings {..} <- asks envSettings
tables <- asks envTables
let fTableName = factTableName fact
fTable = fromJust . findTable fTableName $ tables
joinClauses =
map (tref &&& joinClausePreds fTable)
. filter (/= fTableName)
. nub
. map (factTableName . fst)
$ allDims
timeCol = eqi fTableName $ head [ cName | FactColumn cName DimTime <- factColumns fact ]
extFactTableName = suffixTableName popMode settingTableNameSuffixTemplate
$ extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit
populateSelectExpr =
makeSelect
{ selSelectList = sl . map (uncurry sia . snd3) $ colMap
, selTref = [ foldl (\tf (t, oc) -> tjoin tf LeftOuter t oc) (tref fTableName) joinClauses ]
, selWhere = Just . foldBinop "and" $
binop "<" timeCol placeholder :
[ binop ">=" timeCol placeholder | popMode == IncrementalPopulation ]
, selGroupBy = map (ei . (groupByColPrefix <>) . fst3) . filter thd3 $ colMap
}
insertIntoStmt = insert extFactTableName (map fst3 colMap) populateSelectExpr
updateStmts <- factCountDistinctUpdateStatements popMode fact groupByColPrefix populateSelectExpr
return $ insertIntoStmt : updateStmts
timeCol = eqi fTableName $ head [ cName | FactColumn cName DimTime <- factColumns fact ]
return $ makeSelect
{ selSelectList = sl . map (uncurry sia . snd3) $ selExprs
, selTref = [ foldl (\tf (t, oc) -> tjoin tf LeftOuter t oc) (tref fTableName) joinClauses ]
, selWhere = Just . foldBinop "and" $
binop "<" timeCol placeholder :
[ binop ">=" timeCol placeholder | popMode == IncrementalPopulation ]
, selGroupBy = map (ei . (groupByColPrefix <>) . fst3) . filter thd3 $ selExprs
}
where
groupByColPrefix = "xxff_"
joinClausePreds table oTableName =
foldBinop "and"
. map (\(c1, c2) -> binop "=" (eqi (tableName table) c1) (eqi oTableName c2))
<$> listToMaybe [ colPairs | ForeignKey tName colPairs <- tableConstraints table
, tName == oTableName ]
factTablePopulationSQL :: TablePopulationMode -> Fact -> Reader Env [Text]
factTablePopulationSQL popMode fact = do
stmts <- factTablePopulationStatements popMode fact
return $ case stmts of
[] -> []
[i] -> [ ppStatement i ]
i:us -> [ ppStatement i, ilog2FunctionString ] ++ map ppStatement us

@ -0,0 +1,100 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE GADTs #-}
module Ringo.Generator.Populate.Fact.CountDistinct (factCountDistinctUpdateStatements) where
import qualified Data.Text as Text
import Prelude.Compat
import Control.Monad (forM)
import Control.Monad.Reader (Reader, asks)
import Database.HsSqlPpp.Syntax ( QueryExpr(..), ScalarExpr, Statement, makeSelect
, SelectList(..), SelectItem(..) )
import Data.Maybe (fromJust, fromMaybe, catMaybes)
import Data.Monoid ((<>))
import Data.Text (Text)
import Ringo.Extractor.Internal
import Ringo.Generator.Internal
import Ringo.Generator.Sql
import Ringo.Types.Internal
factCountDistinctUpdateStatements :: TablePopulationMode -> Fact -> Text -> QueryExpr -> Reader Env [Statement]
factCountDistinctUpdateStatements popMode fact groupByColPrefix expr = case expr of
select@Select {..} -> do
Settings {..} <- asks envSettings
let extFactTableName =
suffixTableName popMode settingTableNameSuffixTemplate
$ extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit
fmap catMaybes $ forM (factColumns fact) $ \FactColumn {factColTargetColumn = cName, ..} ->
case factColType of
FactCountDistinct {factColMaybeSourceColumn = scName} -> do
let groupByCols = map ppScalarExpr selGroupBy
selectStmt <- queryExpr fact cName scName groupByCols select
let aggSelectClause =
sia (app "json_object_agg" [ ei (cName <> "_bnum"), ei (cName <> "_bhash") ]) (nmc cName)
return $ Just $ update extFactTableName
[ (cName, eqi "xyz" cName) ]
[ subtrefa "xyz"
makeSelect
{ selSelectList = sl $ map (si . ei) groupByCols ++ [ aggSelectClause ]
, selTref = [ subtrefa "zyx" selectStmt ]
, selGroupBy = selGroupBy
} ] $
foldBinop "and"
[ binop "=" (eqi extFactTableName . fromJust . Text.stripPrefix groupByColPrefix $ col)
(eqi "xyz" col)
| col <- groupByCols ]
_ -> return Nothing
_ -> return []
queryExpr :: Fact -> ColumnName -> Maybe ColumnName -> [ColumnName] -> QueryExpr -> Reader Env QueryExpr
queryExpr fact targetCol sourceCol groupByCols select = case select of
Select {selSelectList = SelectList _ origSelectItems, ..} -> do
Settings {..} <- asks envSettings
tables <- asks envTables
let fTableName = factTableName fact
fTable = fromJust . findTable fTableName $ tables
tablePKColName = head [ cName | PrimaryKey cName <- tableConstraints fTable ]
unqCol = cast (eqi fTableName (fromMaybe tablePKColName sourceCol)) "text"
selectList = [ i | i@(SelectItem _ _ a) <- origSelectItems , a `elem` map nmc groupByCols ]
bucketSelectList <- bucketSelectItems targetCol unqCol
return $ makeSelect
{ selSelectList = sl $ selectList ++ bucketSelectList
, selTref = selTref
, selWhere = binop "and" (postop "isnotnull" unqCol) <$> selWhere
, selGroupBy = selGroupBy ++ [ ei $ targetCol <> "_bnum" ]
}
_ -> error "Must be a Select"
bucketSelectItems :: ColumnName -> ScalarExpr -> Reader Env [SelectItem]
bucketSelectItems targetCol unqCol = do
Settings {..} <- asks envSettings
return [ sia (binop "&" (app "hashtext" [ unqCol ])
(num . Text.pack . show $ bucketCount settingFactCountDistinctErrorRate - 1))
(nmc $ targetCol <> "_bnum")
, sia (binop "-"
(num "31")
(app "ilog2"
[ app "min" [ binop "&"
(app "hashtext" [ unqCol ])
(prefop "~" (parens (binop "<<" (num "1") (num "31"))))]]))
(nmc $ targetCol <> "_bhash")
]
where
bucketCount :: Double -> Integer
bucketCount errorRate =
let power :: Double = fromIntegral (ceiling . logBase 2 $ (1.04 / errorRate) ** 2 :: Integer)
in ceiling $ 2 ** power
Loading…
Cancel
Save