From ff4ca5e235ac3a6435d147468159f18efa5f1ab9 Mon Sep 17 00:00:00 2001 From: Abhinav Sarkar Date: Tue, 22 Dec 2015 19:46:37 +0530 Subject: [PATCH] Adds sql generation for count distinct fact columns population. --- app/Main.hs | 2 +- app/Ringo/ArgParser.hs | 5 ++ src/Ringo.hs | 2 +- src/Ringo/Extractor.hs | 2 +- src/Ringo/Generator.hs | 136 ++++++++++++++++++++++++++++++++--------- src/Ringo/Types.hs | 50 +++++++-------- 6 files changed, 142 insertions(+), 55 deletions(-) diff --git a/app/Main.hs b/app/Main.hs index 3e01464..9973998 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -69,7 +69,7 @@ writeFiles outputDir env@Env{..} = do , table <- tabs , table `notElem` envTables ] - factTablePopulateSQLs typ gen = [ (typ, tableName table, sqlStr $ gen env fact) + factTablePopulateSQLs typ gen = [ (typ, tableName table, unlines . map sqlStr $ gen env fact) | (fact, table) <- factTables ] sqls = concat [ dimTableDefnSQLs diff --git a/app/Ringo/ArgParser.hs b/app/Ringo/ArgParser.hs index 073121b..79e5dbe 100644 --- a/app/Ringo/ArgParser.hs +++ b/app/Ringo/ArgParser.hs @@ -49,6 +49,11 @@ settingsParser = let Settings {..} = defSettings <*> minorOption "fact-count-col-type" settingFactCountColumnType "Type of fact table count columns" + <*> option auto (long "fact-count-distinct-error-rate" + <> hidden + <> value settingFactCountDistinctErrorRate + <> showDefault + <> help "Error rate for count distinct calulations") <*> minorOption "fact-infix" settingFactInfix "Infix for fact tables" diff --git a/src/Ringo.hs b/src/Ringo.hs index 3895534..7a17df3 100644 --- a/src/Ringo.hs +++ b/src/Ringo.hs @@ -35,7 +35,7 @@ dimensionTablePopulateSQL :: TablePopulationMode -> Env -> Fact -> TableName -> dimensionTablePopulateSQL popMode env fact = flip runReader env . G.dimensionTablePopulateSQL popMode fact -factTablePopulateSQL :: TablePopulationMode -> Env -> Fact -> Text +factTablePopulateSQL :: TablePopulationMode -> Env -> Fact -> [Text] factTablePopulateSQL popMode env = flip runReader env . G.factTablePopulateSQL popMode diff --git a/src/Ringo/Extractor.hs b/src/Ringo/Extractor.hs index 98a7848..21cdf3e 100644 --- a/src/Ringo/Extractor.hs +++ b/src/Ringo/Extractor.hs @@ -38,7 +38,7 @@ extractFactTable fact = do [ Column (cName <> settingAvgCountColumSuffix) countColType NotNull , Column (cName <> settingAvgSumColumnSuffix) (sourceColumnType scName) NotNull ] - FactCountDistinct _ cName -> [ Column cName (countColType <> "[]") NotNull ] + FactCountDistinct _ cName -> [ Column cName "json" NotNull ] _ -> [] fks = for allDims $ \(fact', tab@Table {..}) -> diff --git a/src/Ringo/Generator.hs b/src/Ringo/Generator.hs index c7f38cd..fb8e755 100644 --- a/src/Ringo/Generator.hs +++ b/src/Ringo/Generator.hs @@ -14,7 +14,7 @@ import Control.Applicative ((<$>)) import Control.Monad.Reader (Reader, asks) import Data.List (nub, find, subsequences, partition, sortBy) -import Data.Maybe (fromJust, mapMaybe, catMaybes) +import Data.Maybe (fromJust, fromMaybe, mapMaybe, catMaybes) import Data.Monoid ((<>)) import Data.Ord (comparing) import Data.Text (Text) @@ -120,14 +120,23 @@ dimensionTablePopulateSQL popMode fact dimTableName = do <> "\nWHERE " <> Text.intercalate " \nAND " [ fullColName dimTableName c <> " IS NULL" | (c, _) <- colMapping ] -factTablePopulateSQL :: TablePopulationMode -> Fact -> Reader Env Text +data FactTablePopulateSelectSQL = FactTablePopulateSelectSQL + { ftpsSelectCols :: ![(Text, Text)] + , ftpsSelectTable :: !Text + , ftpsJoinClauses :: ![Text] + , ftpsWhereClauses :: ![Text] + , ftpsGroupByCols :: ![Text] + } deriving (Show, Eq) + +factTablePopulateSQL :: TablePopulationMode -> Fact -> Reader Env [Text] factTablePopulateSQL popMode fact = do - Settings {..} <- asks envSettings - allDims <- extractAllDimensionTables fact - tables <- asks envTables - let fTableName = factTableName fact - table = fromJust . findTable fTableName $ tables - dimIdColName = settingDimTableIdColumnName + Settings {..} <- asks envSettings + allDims <- extractAllDimensionTables fact + tables <- asks envTables + let fTableName = factTableName fact + table = fromJust . findTable fTableName $ tables + dimIdColName = settingDimTableIdColumnName + tablePKColName = head [ cName | PrimaryKey cName <- tableConstraints table ] timeUnitColumnInsertSQL cName = let colName = timeUnitColumnName dimIdColName cName settingTimeUnit @@ -138,13 +147,13 @@ factTablePopulateSQL popMode fact = do ) factColMap = concatFor (factColumns fact) $ \col -> case col of - DimTime cName -> [ timeUnitColumnInsertSQL cName ] - NoDimId cName -> [ (cName, fullColName fTableName cName, True) ] - FactCount scName cName -> + DimTime cName -> [ timeUnitColumnInsertSQL cName ] + NoDimId cName -> [ (cName, fullColName fTableName cName, True) ] + FactCount scName cName -> [ (cName, "count(" <> maybe "*" (fullColName fTableName) scName <> ")", False) ] - FactSum scName cName -> + FactSum scName cName -> [ (cName, "sum(" <> fullColName fTableName scName <> ")", False) ] - FactAverage scName cName -> + FactAverage scName cName -> [ ( cName <> settingAvgCountColumSuffix , "count(" <> fullColName fTableName scName <> ")" , False @@ -154,6 +163,7 @@ factTablePopulateSQL popMode fact = do , False ) ] + FactCountDistinct _ cName -> [ (cName, "'{}'::json", False)] _ -> [] dimColMap = for allDims $ \(dimFact, factTable@Table {..}) -> @@ -169,31 +179,79 @@ factTablePopulateSQL popMode fact = do <> Text.intercalate "\n AND " dimLookupWhereClauses in (colName, insertSQL, True) - colMap = [ (cName, if addAs then asName cName sql else sql, addAs) + colMap = [ (cName, (sql, groupByColPrefix <> cName), addAs) | (cName, sql, addAs) <- factColMap ++ dimColMap ] joinClauses = - mapMaybe (\tName -> (\p -> "LEFT JOIN " <> tName <> " ON "<> p) <$> joinClausePreds table tName) + mapMaybe (\tName -> (\p -> "LEFT JOIN " <> tName <> "\nON "<> p) <$> joinClausePreds table tName) . nub . map (factTableName . fst) $ allDims timeCol = fullColName fTableName $ head [ cName | DimTime cName <- factColumns fact ] - return $ "INSERT INTO " - <> extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit - <> " (\n" <> unlineCols (map fst3 colMap) <> "\n)" - <> "\nSELECT \n" <> unlineCols (map snd3 colMap) - <> "\nFROM " <> fTableName <> "\n" <> Text.intercalate"\n" joinClauses - <> (if popMode == IncrementalPopulation - then "\nWHERE " <> timeCol <> " > ? AND " <> timeCol <> " <= ?" - else "") - <> "\nGROUP BY \n" - <> unlineCols (map ((groupByColPrefix <>) . fst3) . filter thd3 $ colMap) + extFactTableName = + extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit + + insertIntoSelectSQL = + FactTablePopulateSelectSQL + { ftpsSelectCols = map snd3 colMap + , ftpsSelectTable = fTableName + , ftpsJoinClauses = joinClauses + , ftpsWhereClauses = if popMode == IncrementalPopulation + then [timeCol <> " > ?", timeCol <> " <= ?"] + else [] + , ftpsGroupByCols = map ((groupByColPrefix <>) . fst3) . filter thd3 $ colMap + } + + insertIntoInsertSQL = "INSERT INTO " <> extFactTableName + <> " (\n" <> Text.intercalate ",\n " (map fst3 colMap) <> "\n)" + + countDistinctCols = [ col | col@(FactCountDistinct _ _) <- factColumns fact] + + updateSQLs = + let origGroupByCols = ftpsGroupByCols insertIntoSelectSQL + origSelectCols = ftpsSelectCols insertIntoSelectSQL + + in for countDistinctCols $ \(FactCountDistinct scName cName) -> + let unqCol = fullColName fTableName (fromMaybe tablePKColName scName) <> "::text" + + bucketSelectCols = + [ ( "hashtext(" <> unqCol <> ") & " + <> Text.pack (show $ bucketCount settingFactCountDistinctErrorRate - 1) + , cName <> "_bnum") + , ( "31 - floor(log(2, min(hashtext(" <> unqCol <> ") & ~(1 << 31))))::int" + , cName <> "_bhash" + ) + ] + + selectSQL = toSelectSQL $ + insertIntoSelectSQL + { ftpsSelectCols = filter ((`elem` origGroupByCols) . snd) origSelectCols ++ bucketSelectCols + , ftpsGroupByCols = origGroupByCols ++ [cName <> "_bnum"] + , ftpsWhereClauses = ftpsWhereClauses insertIntoSelectSQL ++ [ unqCol <> " IS NOT NULL" ] + } + + aggSelectClause = + "json_object_agg(" <> cName <> "_bnum, " <> cName <> "_bhash) AS " <> cName + + in "UPDATE " <> extFactTableName + <> "\nSET " <> cName <> " = " <> fullColName "xyz" cName + <> "\nFROM (" + <> "\nSELECT " <> Text.intercalate ",\n" (origGroupByCols ++ [aggSelectClause]) + <> "\nFROM (\n" <> selectSQL <> "\n) zyx" + <> "\nGROUP BY \n" <> Text.intercalate ",\n" origGroupByCols + <> "\n) xyz" + <> "\n WHERE\n" + <> Text.intercalate "\nAND " + [ coalesceFKId (fullColName extFactTableName .fromJust . Text.stripPrefix groupByColPrefix $ col) + <> " = " <> coalesceFKId (fullColName "xyz" col) + | col <- origGroupByCols ] + + return $ insertIntoInsertSQL <> "\n" <> toSelectSQL insertIntoSelectSQL : + if null countDistinctCols then [] else updateSQLs where - groupByColPrefix = "xxff_" - asName cName sql = "(" <> sql <> ")" <> " as " <> groupByColPrefix <> cName - unlineCols = Text.intercalate ",\n " + groupByColPrefix = "xxff_" joinClausePreds table oTableName = fmap (\(ForeignKey _ colPairs) -> @@ -205,3 +263,25 @@ factTablePopulateSQL popMode fact = do _ -> False) . tableConstraints $ table + + toSelectSQL FactTablePopulateSelectSQL {..} = + "SELECT \n" <> Text.intercalate ",\n " (map (uncurry asName) ftpsSelectCols) + <> "\nFROM " <> ftpsSelectTable + <> (if not . null $ ftpsJoinClauses + then "\n" <> Text.intercalate"\n" ftpsJoinClauses + else "") + <> (if not . null $ ftpsWhereClauses + then "\nWHERE " <> Text.intercalate "\nAND " ftpsWhereClauses + else "") + <> "\nGROUP BY \n" + <> Text.intercalate ",\n " ftpsGroupByCols + where + asName sql alias = "(" <> sql <> ")" <> " as " <> alias + + coalesceFKId col = "coalesce(" <> col <> ", -1)" + + bucketCount :: Double -> Integer + bucketCount errorRate = + let power :: Double = fromIntegral (ceiling . logBase 2 $ (1.04 / errorRate) ** 2 :: Integer) + in ceiling $ 2 ** power + diff --git a/src/Ringo/Types.hs b/src/Ringo/Types.hs index c2c49d4..7fa04e1 100644 --- a/src/Ringo/Types.hs +++ b/src/Ringo/Types.hs @@ -69,34 +69,36 @@ factColumnName (FactAverage cName _) = Just cName factColumnName (FactCountDistinct cName _) = cName data Settings = Settings - { settingDimPrefix :: !Text - , settingFactPrefix :: !Text - , settingTimeUnit :: !TimeUnit - , settingAvgCountColumSuffix :: !Text - , settingAvgSumColumnSuffix :: !Text - , settingDimTableIdColumnName :: !Text - , settingDimTableIdColumnType :: !Text - , settingFactCountColumnType :: !Text - , settingFactInfix :: !Text - , settingDependenciesJSONFileName :: !Text - , settingFactsJSONFileName :: !Text - , settingDimensionJSONFileName :: !Text + { settingDimPrefix :: !Text + , settingFactPrefix :: !Text + , settingTimeUnit :: !TimeUnit + , settingAvgCountColumSuffix :: !Text + , settingAvgSumColumnSuffix :: !Text + , settingDimTableIdColumnName :: !Text + , settingDimTableIdColumnType :: !Text + , settingFactCountColumnType :: !Text + , settingFactCountDistinctErrorRate :: !Double + , settingFactInfix :: !Text + , settingDependenciesJSONFileName :: !Text + , settingFactsJSONFileName :: !Text + , settingDimensionJSONFileName :: !Text } deriving (Eq, Show) defSettings :: Settings defSettings = Settings - { settingDimPrefix = "dim_" - , settingFactPrefix = "fact_" - , settingTimeUnit = Minute - , settingAvgCountColumSuffix = "_count" - , settingAvgSumColumnSuffix = "_sum" - , settingDimTableIdColumnName = "id" - , settingDimTableIdColumnType = "serial" - , settingFactCountColumnType = "integer" - , settingFactInfix = "_by_" - , settingDependenciesJSONFileName = "dependencies.json" - , settingFactsJSONFileName = "facts.json" - , settingDimensionJSONFileName = "dimensions.json" + { settingDimPrefix = "dim_" + , settingFactPrefix = "fact_" + , settingTimeUnit = Minute + , settingAvgCountColumSuffix = "_count" + , settingAvgSumColumnSuffix = "_sum" + , settingDimTableIdColumnName = "id" + , settingDimTableIdColumnType = "serial" + , settingFactCountColumnType = "integer" + , settingFactCountDistinctErrorRate = 0.05 + , settingFactInfix = "_by_" + , settingDependenciesJSONFileName = "dependencies.json" + , settingFactsJSONFileName = "facts.json" + , settingDimensionJSONFileName = "dimensions.json" } data ValidationError = MissingTable !TableName