From ad14698ab06323dee15876c65bb408f433384f1f Mon Sep 17 00:00:00 2001 From: Abhinav Sarkar Date: Tue, 29 Dec 2015 15:19:17 +0530 Subject: [PATCH] Refactoring. --- app/Ringo/InputParser.hs | 4 +- ringo.cabal | 20 +-- src/Ringo/Extractor.hs | 8 +- src/Ringo/Extractor/Internal.hs | 4 +- src/Ringo/Generator.hs | 249 ++++++++++++++++---------------- src/Ringo/Validator.hs | 2 +- 6 files changed, 141 insertions(+), 146 deletions(-) diff --git a/app/Ringo/InputParser.hs b/app/Ringo/InputParser.hs index 8bdc235..0ae0912 100644 --- a/app/Ringo/InputParser.hs +++ b/app/Ringo/InputParser.hs @@ -8,8 +8,8 @@ import qualified Data.Vector as V import Control.Applicative ((<$>), (<*>), pure) #endif -import Data.Maybe (fromMaybe) -import Data.Vector ((!), (!?)) +import Data.Maybe (fromMaybe) +import Data.Vector ((!), (!?)) import Data.Yaml hiding (Null) import Ringo.Types diff --git a/ringo.cabal b/ringo.cabal index 5848220..f284edf 100644 --- a/ringo.cabal +++ b/ringo.cabal @@ -36,16 +36,16 @@ executable ringo other-modules: Ringo.ArgParser, Ringo.InputParser main-is: Main.hs - build-depends: base >=4.7 && <5, - text >=1.2 && <1.3, - bytestring >=0.10 && <0.11, - containers >=0.5 && <0.6, - optparse-applicative >=0.11 && <0.12, - yaml >=0.8 && <0.9, - vector >=0.10 && <0.11, - directory >=1.2 && <1.3, - filepath >=1.3 && <1.5, - aeson >=0.8 && <0.9, + build-depends: base >=4.7 && <5, + text >=1.2 && <1.3, + bytestring >=0.10 && <0.11, + containers >=0.5 && <0.6, + optparse-applicative >=0.11 && <0.12, + yaml >=0.8.11 && <0.9, + vector >=0.10 && <0.11, + directory >=1.2 && <1.3, + filepath >=1.3 && <1.5, + aeson >=0.8 && <0.9, ringo ghc-options: -Wall -fno-warn-unused-do-bind -funbox-strict-fields -fno-warn-orphans -O2 default-extensions: OverloadedStrings, RecordWildCards, ScopedTypeVariables, BangPatterns, diff --git a/src/Ringo/Extractor.hs b/src/Ringo/Extractor.hs index 82b3c45..caa92f0 100644 --- a/src/Ringo/Extractor.hs +++ b/src/Ringo/Extractor.hs @@ -31,7 +31,7 @@ extractFactTable fact = do columns = concatFor (factColumns fact) $ \col -> case col of DimTime cName -> [ Column (timeUnitColumnName dimIdColName cName settingTimeUnit) "bigint" NotNull ] - NoDimId cName -> [ fromJust . findColumn cName . tableColumns $ table] + NoDimId cName -> [ fromJust . findColumn cName . tableColumns $ table] -- TODO should be not null FactCount _ cName -> [ Column cName countColType NotNull ] FactSum scName cName -> [ Column cName (sourceColumnType scName) NotNull ] FactAverage scName cName -> @@ -41,13 +41,13 @@ extractFactTable fact = do FactCountDistinct _ cName -> [ Column cName "json" NotNull ] _ -> [] - fkCols = for allDims $ \(_, Table {..}) -> + fkColumns = for allDims $ \(_, Table {..}) -> let colName = factDimFKIdColumnName settingDimPrefix dimIdColName tableName colType = idColTypeToFKIdColType settingDimTableIdColumnType in Column colName colType NotNull ukColNames = - (++ map columnName fkCols) + (++ map columnName fkColumns) . forMaybe (factColumns fact) $ \col -> case col of DimTime cName -> Just (timeUnitColumnName dimIdColName cName settingTimeUnit) NoDimId cName -> Just cName @@ -56,7 +56,7 @@ extractFactTable fact = do return Table { tableName = extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit - , tableColumns = columns ++ fkCols + , tableColumns = columns ++ fkColumns , tableConstraints = [ UniqueKey ukColNames ] } diff --git a/src/Ringo/Extractor/Internal.hs b/src/Ringo/Extractor/Internal.hs index eba45d5..5b0d93b 100644 --- a/src/Ringo/Extractor/Internal.hs +++ b/src/Ringo/Extractor/Internal.hs @@ -80,6 +80,4 @@ extractAllDimensionTables fact = do parentDims <- concat <$> mapM extract (factParentNames fact) return . nubBy ((==) `on` snd) $ myDims ++ parentDims where - extract fName = do - facts <- asks envFacts - extractAllDimensionTables . fromJust . findFact fName $ facts + extract fName = asks envFacts >>= extractAllDimensionTables . fromJust . findFact fName diff --git a/src/Ringo/Generator.hs b/src/Ringo/Generator.hs index 899bc1c..907ba0a 100644 --- a/src/Ringo/Generator.hs +++ b/src/Ringo/Generator.hs @@ -15,7 +15,7 @@ import Control.Applicative ((<$>)) import Control.Monad.Reader (Reader, asks) import Data.List (nub, find) -import Data.Maybe (fromJust, fromMaybe, mapMaybe) +import Data.Maybe (fromJust, fromMaybe, mapMaybe, listToMaybe) import Data.Monoid ((<>)) import Data.Text (Text) @@ -23,13 +23,12 @@ import Ringo.Extractor.Internal import Ringo.Types import Ringo.Utils -nullableDefnSQL :: Nullable -> Text -nullableDefnSQL Null = "NULL" -nullableDefnSQL NotNull = "NOT NULL" - columnDefnSQL :: Column -> Text columnDefnSQL Column {..} = columnName <> " " <> columnType <> " " <> nullableDefnSQL columnNullable + where + nullableDefnSQL Null = "NULL" + nullableDefnSQL NotNull = "NOT NULL" joinColumnNames :: [ColumnName] -> Text joinColumnNames = Text.intercalate ",\n" @@ -37,24 +36,24 @@ joinColumnNames = Text.intercalate ",\n" fullColName :: TableName -> ColumnName -> ColumnName fullColName tName cName = tName <> "." <> cName -constraintDefnSQL :: Table -> TableConstraint -> [Text] -constraintDefnSQL Table {..} constraint = - let alterTableSQL = "ALTER TABLE ONLY " <> tableName <> " ADD " - in case constraint of - PrimaryKey cName -> [ alterTableSQL <> "PRIMARY KEY (" <> cName <> ")" ] - ForeignKey oTableName cNamePairs -> - [ alterTableSQL <> "FOREIGN KEY (" <> joinColumnNames (map fst cNamePairs) <> ") REFERENCES " - <> oTableName <> " (" <> joinColumnNames (map snd cNamePairs) <> ")" ] - UniqueKey cNames -> ["CREATE UNIQUE INDEX ON " <> tableName <> " (" <> joinColumnNames cNames <> ")"] - tableDefnSQL :: Table -> [Text] -tableDefnSQL table@Table {..} = - tableSQL : concatMap (constraintDefnSQL table) tableConstraints +tableDefnSQL Table {..} = + tableSQL : concatMap constraintDefnSQL tableConstraints where tableSQL = "CREATE TABLE " <> tableName <> " (\n" <> (joinColumnNames . map columnDefnSQL $ tableColumns) <> "\n)" + constraintDefnSQL constraint = + let alterTableSQL = "ALTER TABLE ONLY " <> tableName <> " ADD " + in case constraint of + PrimaryKey cName -> [ alterTableSQL <> "PRIMARY KEY (" <> cName <> ")" ] + ForeignKey oTableName cNamePairs -> + [ alterTableSQL <> "FOREIGN KEY (" <> joinColumnNames (map fst cNamePairs) <> ") REFERENCES " + <> oTableName <> " (" <> joinColumnNames (map snd cNamePairs) <> ")" ] + UniqueKey cNames -> ["CREATE UNIQUE INDEX ON " <> tableName <> " (" <> joinColumnNames cNames <> ")"] + + factTableDefnSQL :: Fact -> Table -> Reader Env [Text] factTableDefnSQL fact table = do Settings {..} <- asks envSettings @@ -81,7 +80,7 @@ dimColumnMapping dimPrefix fact dimTableName = coalesceColumn :: TypeDefaults -> TableName -> Column -> Text coalesceColumn defaults tName Column{..} = if columnNullable == Null - then "coalesce(" <> fqColName <> "," <> defVal columnType <> ")" + then "coalesce(" <> fqColName <> ", " <> defVal columnType <> ")" else fqColName where fqColName = fullColName tName columnName @@ -100,14 +99,11 @@ dimensionTablePopulateSQL popMode fact dimTableName = do defaults <- asks envTypeDefaults let factTable = fromJust $ findTable (factTableName fact) tables colMapping = dimColumnMapping dimPrefix fact dimTableName - baseSelectC = "SELECT DISTINCT\n" - <> joinColumnNames - (map (\(_, cName) -> - let col = fromJust . findColumn cName $ tableColumns factTable - in coalesceColumn defaults (factTableName fact) col <> " AS " <> cName) - colMapping) - <> "\n" - <> "FROM " <> factTableName fact + selectCols = [ coalesceColumn defaults (factTableName fact) col <> " AS " <> cName + | (_, cName) <- colMapping + , let col = fromJust . findColumn cName $ tableColumns factTable ] + baseSelectC = "SELECT DISTINCT\n" <> joinColumnNames selectCols + <> "\nFROM " <> factTableName fact insertC selectC = "INSERT INTO " <> dimTableName <> " (\n" <> joinColumnNames (map fst colMapping) <> "\n) " <> "SELECT x.* FROM (\n" <> selectC <> ") x" @@ -122,7 +118,7 @@ dimensionTablePopulateSQL popMode fact dimTableName = do <> "\n)") <> "\nLEFT JOIN " <> dimTableName <> " ON\n" <> Text.intercalate " \nAND " - [ fullColName dimTableName c1 <> " IS NOT DISTINCT FROM " <> fullColName "x" c2 + [ fullColName dimTableName c1 <> " IS NOT DISTINCT FROM " <> fullColName "x" c2 -- TODO can be replaced with = ? | (c1, c2) <- colMapping ] <> "\nWHERE " <> Text.intercalate " \nAND " [ fullColName dimTableName c <> " IS NULL" | (c, _) <- colMapping ] @@ -135,6 +131,58 @@ data FactTablePopulateSelectSQL = FactTablePopulateSelectSQL , ftpsGroupByCols :: ![Text] } deriving (Show, Eq) +factTableUpdateSQL :: Fact -> Text -> FactTablePopulateSelectSQL -> Reader Env [Text] +factTableUpdateSQL fact groupByColPrefix populateSelectSQL@FactTablePopulateSelectSQL {..} = do + Settings {..} <- asks envSettings + tables <- asks envTables + let countDistinctCols = [ col | col@(FactCountDistinct _ _) <- factColumns fact] + fTableName = factTableName fact + fTable = fromJust . findTable fTableName $ tables + tablePKColName = head [ cName | PrimaryKey cName <- tableConstraints fTable ] + extFactTableName = + extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit + + return $ for countDistinctCols $ \(FactCountDistinct scName cName) -> + let unqCol = fullColName fTableName (fromMaybe tablePKColName scName) <> "::text" + + bucketSelectCols = + [ ( "hashtext(" <> unqCol <> ") & " + <> Text.pack (show $ bucketCount settingFactCountDistinctErrorRate - 1) + , cName <> "_bnum" + ) + , ( "31 - floor(log(2, min(hashtext(" <> unqCol <> ") & ~(1 << 31))))::int" + , cName <> "_bhash" + ) + ] + + selectSQL = toSelectSQL $ + populateSelectSQL + { ftpsSelectCols = filter ((`elem` ftpsGroupByCols) . snd) ftpsSelectCols ++ bucketSelectCols + , ftpsGroupByCols = ftpsGroupByCols ++ [ cName <> "_bnum" ] + , ftpsWhereClauses = ftpsWhereClauses ++ [ unqCol <> " IS NOT NULL" ] + } + + aggSelectClause = + "json_object_agg(" <> cName <> "_bnum, " <> cName <> "_bhash) AS " <> cName + + in "UPDATE " <> extFactTableName + <> "\nSET " <> cName <> " = " <> fullColName "xyz" cName + <> "\nFROM (" + <> "\nSELECT " <> joinColumnNames (ftpsGroupByCols ++ [aggSelectClause]) + <> "\nFROM (\n" <> selectSQL <> "\n) zyx" + <> "\nGROUP BY \n" <> joinColumnNames ftpsGroupByCols + <> "\n) xyz" + <> "\n WHERE\n" + <> Text.intercalate "\nAND " + [ fullColName extFactTableName .fromJust . Text.stripPrefix groupByColPrefix $ col + <> " = " <> fullColName "xyz" col + | col <- ftpsGroupByCols ] + where + bucketCount :: Double -> Integer + bucketCount errorRate = + let power :: Double = fromIntegral (ceiling . logBase 2 $ (1.04 / errorRate) ** 2 :: Integer) + in ceiling $ 2 ** power + factTablePopulateSQL :: TablePopulationMode -> Fact -> Reader Env [Text] factTablePopulateSQL popMode fact = do Settings {..} <- asks envSettings @@ -144,7 +192,6 @@ factTablePopulateSQL popMode fact = do let fTableName = factTableName fact fTable = fromJust . findTable fTableName $ tables dimIdColName = settingDimTableIdColumnName - tablePKColName = head [ cName | PrimaryKey cName <- tableConstraints fTable ] timeUnitColumnInsertSQL cName = let colName = timeUnitColumnName dimIdColName cName settingTimeUnit @@ -176,133 +223,83 @@ factTablePopulateSQL popMode fact = do FactCountDistinct _ cName -> [ (cName, "'{}'::json", False)] _ -> [] - dimColMap = for allDims $ \(dimFact, factTable@Table {tableName}) -> - let colName = factDimFKIdColumnName settingDimPrefix dimIdColName tableName - col = fromJust . findColumn colName $ tableColumns factSourceTable - factSourceTableName = factTableName dimFact - factSourceTable = fromJust . findTable factSourceTableName $ tables - insertSQL = if factTable `elem` tables -- existing dimension table - then (if columnNullable col == Null then coalesceFKId else id) - $ fullColName factSourceTableName colName - else let - dimLookupWhereClauses = - [ fullColName tableName c1 <> " = " <> coalesceColumn defaults factSourceTableName col2 - | (c1, c2) <- dimColumnMapping settingDimPrefix dimFact tableName - , let col2 = fromJust . findColumn c2 $ tableColumns factSourceTable ] - in "SELECT " <> dimIdColName <> " FROM " <> tableName <> "\nWHERE " - <> Text.intercalate "\n AND " dimLookupWhereClauses - insertSQL' = if factSourceTableName == fTableName - then insertSQL - else coalesceFKId insertSQL + dimColMap = for allDims $ \(dimFact, factTable@Table {tableName}) -> let + dimFKIdColName = factDimFKIdColumnName settingDimPrefix dimIdColName tableName + factSourceTableName = factTableName dimFact + factSourceTable = fromJust . findTable factSourceTableName $ tables + dimFKIdColumn = fromJust . findColumn dimFKIdColName $ tableColumns factSourceTable + dimLookupWhereClauses = + [ fullColName tableName dimColName <> " = " <> coalesceColumn defaults factSourceTableName sourceCol + | (dimColName, sourceColName) <- dimColumnMapping settingDimPrefix dimFact tableName + , let sourceCol = fromJust . findColumn sourceColName $ tableColumns factSourceTable ] + insertSQL = if factTable `elem` tables -- existing dimension table + then (if columnNullable dimFKIdColumn == Null then coalesceFKId else id) + $ fullColName factSourceTableName dimFKIdColName + else "SELECT " <> dimIdColName <> " FROM " <> tableName <> "\nWHERE " + <> Text.intercalate "\n AND " dimLookupWhereClauses + insertSQL' = if factSourceTableName == fTableName + then insertSQL + else coalesceFKId insertSQL -- TODO always coalesce + in (dimFKIdColName, insertSQL', True) - in (colName, insertSQL', True) + colMap = [ (cName, (sql, groupByColPrefix <> cName), addToGroupBy) + | (cName, sql, addToGroupBy) <- factColMap ++ dimColMap ] - colMap = [ (cName, (sql, groupByColPrefix <> cName), addAs) - | (cName, sql, addAs) <- factColMap ++ dimColMap ] - - joinClauses = + joinClauses = mapMaybe (\tName -> (\p -> "LEFT JOIN " <> tName <> "\nON "<> p) <$> joinClausePreds fTable tName) . nub . map (factTableName . fst) $ allDims - timeCol = fullColName fTableName $ head [ cName | DimTime cName <- factColumns fact ] + timeCol = fullColName fTableName $ head [ cName | DimTime cName <- factColumns fact ] - extFactTableName = + extFactTableName = extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit - insertIntoSelectSQL = + populateSelectSQL = FactTablePopulateSelectSQL { ftpsSelectCols = map snd3 colMap , ftpsSelectTable = fTableName , ftpsJoinClauses = joinClauses , ftpsWhereClauses = if popMode == IncrementalPopulation - then [timeCol <> " > ?", timeCol <> " <= ?"] + then [ timeCol <> " > ?", timeCol <> " <= ?" ] else [] , ftpsGroupByCols = map ((groupByColPrefix <>) . fst3) . filter thd3 $ colMap } - insertIntoInsertSQL = "INSERT INTO " <> extFactTableName - <> " (\n" <> Text.intercalate ",\n " (map fst3 colMap) <> "\n)" + insertIntoSQL = "INSERT INTO " <> extFactTableName + <> " (\n" <> Text.intercalate ",\n " (map fst3 colMap) <> "\n)\n" + <> toSelectSQL populateSelectSQL - countDistinctCols = [ col | col@(FactCountDistinct _ _) <- factColumns fact] + updateSQLs <- factTableUpdateSQL fact groupByColPrefix populateSelectSQL - updateSQLs = - let origGroupByCols = ftpsGroupByCols insertIntoSelectSQL - origSelectCols = ftpsSelectCols insertIntoSelectSQL - - in for countDistinctCols $ \(FactCountDistinct scName cName) -> - let unqCol = fullColName fTableName (fromMaybe tablePKColName scName) <> "::text" - - bucketSelectCols = - [ ( "hashtext(" <> unqCol <> ") & " - <> Text.pack (show $ bucketCount settingFactCountDistinctErrorRate - 1) - , cName <> "_bnum") - , ( "31 - floor(log(2, min(hashtext(" <> unqCol <> ") & ~(1 << 31))))::int" - , cName <> "_bhash" - ) - ] - - selectSQL = toSelectSQL $ - insertIntoSelectSQL - { ftpsSelectCols = filter ((`elem` origGroupByCols) . snd) origSelectCols ++ bucketSelectCols - , ftpsGroupByCols = origGroupByCols ++ [cName <> "_bnum"] - , ftpsWhereClauses = ftpsWhereClauses insertIntoSelectSQL ++ [ unqCol <> " IS NOT NULL" ] - } - - aggSelectClause = - "json_object_agg(" <> cName <> "_bnum, " <> cName <> "_bhash) AS " <> cName - - in "UPDATE " <> extFactTableName - <> "\nSET " <> cName <> " = " <> fullColName "xyz" cName - <> "\nFROM (" - <> "\nSELECT " <> joinColumnNames (origGroupByCols ++ [aggSelectClause]) - <> "\nFROM (\n" <> selectSQL <> "\n) zyx" - <> "\nGROUP BY \n" <> joinColumnNames origGroupByCols - <> "\n) xyz" - <> "\n WHERE\n" - <> Text.intercalate "\nAND " - [ fullColName extFactTableName .fromJust . Text.stripPrefix groupByColPrefix $ col - <> " = " <> fullColName "xyz" col - | col <- origGroupByCols ] - - return $ insertIntoInsertSQL <> "\n" <> toSelectSQL insertIntoSelectSQL : - if null countDistinctCols then [] else updateSQLs + return $ insertIntoSQL : updateSQLs where groupByColPrefix = "xxff_" joinClausePreds table oTableName = - fmap (\(ForeignKey _ colPairs) -> - Text.intercalate " AND " - . map (\(c1, c2) -> fullColName (tableName table) c1 <> " = " <> fullColName oTableName c2) - $ colPairs ) - . find (\cons -> case cons of - ForeignKey tName _ -> tName == oTableName - _ -> False) - . tableConstraints - $ table - - toSelectSQL FactTablePopulateSelectSQL {..} = - "SELECT \n" <> joinColumnNames (map (uncurry asName) ftpsSelectCols) - <> "\nFROM " <> ftpsSelectTable - <> (if not . null $ ftpsJoinClauses - then "\n" <> Text.intercalate "\n" ftpsJoinClauses - else "") - <> (if not . null $ ftpsWhereClauses - then "\nWHERE " <> Text.intercalate "\nAND " ftpsWhereClauses - else "") - <> "\nGROUP BY \n" - <> joinColumnNames ftpsGroupByCols - where - asName sql alias = "(" <> sql <> ")" <> " as " <> alias + Text.intercalate " AND " + . map (\(c1, c2) -> fullColName (tableName table) c1 <> " = " <> fullColName oTableName c2) + <$> listToMaybe [ colPairs | ForeignKey tName colPairs <- tableConstraints table + , tName == oTableName ] coalesceFKId col = if "coalesce" `Text.isPrefixOf` col then col else "coalesce((" <> col <> "), -1)" - bucketCount :: Double -> Integer - bucketCount errorRate = - let power :: Double = fromIntegral (ceiling . logBase 2 $ (1.04 / errorRate) ** 2 :: Integer) - in ceiling $ 2 ** power +toSelectSQL :: FactTablePopulateSelectSQL -> Text +toSelectSQL FactTablePopulateSelectSQL {..} = + "SELECT \n" <> joinColumnNames (map (uncurry asName) ftpsSelectCols) + <> "\nFROM " <> ftpsSelectTable + <> (if not . null $ ftpsJoinClauses + then "\n" <> Text.intercalate "\n" ftpsJoinClauses + else "") + <> (if not . null $ ftpsWhereClauses + then "\nWHERE " <> Text.intercalate "\nAND " ftpsWhereClauses + else "") + <> "\nGROUP BY \n" + <> joinColumnNames ftpsGroupByCols + where + asName sql alias = "(" <> sql <> ")" <> " as " <> alias diff --git a/src/Ringo/Validator.hs b/src/Ringo/Validator.hs index 7ae1b20..b375553 100644 --- a/src/Ringo/Validator.hs +++ b/src/Ringo/Validator.hs @@ -29,7 +29,7 @@ validateTable table = do let constVs = concatMap (checkConstraint tables) . tableConstraints $ table typeDefaultVs = [ MissingTypeDefault cType | Column _ cType _ <- tableColumns table - , null . filter (`Text.isPrefixOf` cType) $ defaults] + , null . filter (`Text.isPrefixOf` cType) $ defaults] -- TODO only dimval columns need this check return $ constVs ++ typeDefaultVs where checkConstraint _ (PrimaryKey colName) = checkTableForCol table colName