From 601eed9a3c5e7c0f59158d51ca9396dd0f562e83 Mon Sep 17 00:00:00 2001 From: Abhinav Sarkar Date: Tue, 29 Dec 2015 18:22:01 +0530 Subject: [PATCH] Changes count distinct algo to use a specialized int log2 function. --- ringo.cabal | 9 +-- src/Ringo/Generator/Internal.hs | 4 +- src/Ringo/Generator/Populate/Fact.hs | 90 ++++++++++++++++++---------- src/Ringo/Validator.hs | 2 +- stack.yaml | 2 +- 5 files changed, 67 insertions(+), 40 deletions(-) diff --git a/ringo.cabal b/ringo.cabal index e9254e0..9d59a5b 100644 --- a/ringo.cabal +++ b/ringo.cabal @@ -26,10 +26,11 @@ library Ringo.Generator.Populate.Dimension, Ringo.Generator.Populate.Fact, Ringo.Utils - build-depends: base >=4.7 && <5, - text >=1.2 && <1.3, - containers >=0.5 && <0.6, - mtl >=2.1 && <2.3 + build-depends: base >=4.7 && <5, + text >=1.2 && <1.3, + containers >=0.5 && <0.6, + mtl >=2.1 && <2.3, + raw-strings-qq >= 1.0 && <1.2 ghc-options: -Wall -fno-warn-unused-do-bind -funbox-strict-fields -fno-warn-orphans -O2 default-extensions: OverloadedStrings, RecordWildCards, ScopedTypeVariables, BangPatterns, TupleSections, CPP, NamedFieldPuns diff --git a/src/Ringo/Generator/Internal.hs b/src/Ringo/Generator/Internal.hs index 58f1b41..42a812c 100644 --- a/src/Ringo/Generator/Internal.hs +++ b/src/Ringo/Generator/Internal.hs @@ -4,7 +4,6 @@ import qualified Data.Map as Map import qualified Data.Text as Text import Data.List (find) -import Data.Maybe (fromMaybe) import Data.Monoid ((<>)) import Data.Text (Text) @@ -31,8 +30,7 @@ coalesceColumn defaults tName Column{..} = fqColName = fullColumnName tName columnName defVal colType = - fromMaybe (error $ "Default value not known for column type: " ++ Text.unpack colType) - . fmap snd + maybe (error $ "Default value not known for column type: " ++ Text.unpack colType) snd . find (\(k, _) -> k `Text.isPrefixOf` colType) . 
Map.toList $ defaults diff --git a/src/Ringo/Generator/Populate/Fact.hs b/src/Ringo/Generator/Populate/Fact.hs index af5ec12..1976688 100644 --- a/src/Ringo/Generator/Populate/Fact.hs +++ b/src/Ringo/Generator/Populate/Fact.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE QuasiQuotes #-} module Ringo.Generator.Populate.Fact (factTablePopulateSQL) where import qualified Data.Text as Text @@ -12,12 +13,38 @@ import Data.List (nub) import Data.Maybe (fromJust, fromMaybe, mapMaybe, listToMaybe) import Data.Monoid ((<>)) import Data.Text (Text) +import Text.RawString.QQ (r) import Ringo.Extractor.Internal import Ringo.Generator.Internal import Ringo.Types import Ringo.Utils +ilog2FunctionString :: Text +ilog2FunctionString = [r|CREATE OR REPLACE FUNCTION ilog2(v integer) + RETURNS integer AS +$$ +DECLARE + r integer; + shift integer; +BEGIN + IF v > x'FFFF'::integer THEN r := 1 << 4; ELSE r := 0 << 4; END IF; + v := v >> r; + IF v > x'FF'::integer THEN shift := 1 << 3; ELSE shift := 0 << 3; END IF; + v := v >> shift; + r := r | shift; + IF v > x'F'::integer THEN shift := 1 << 2; ELSE shift := 0 << 2; END IF; + v := v >> shift; + r := r | shift; + IF v > x'3'::integer THEN shift := 1 << 1; ELSE shift := 0 << 1; END IF; + v := v >> shift; + r := r | shift; + r := r | (v >> 1); + RETURN r; +END; +$$ +LANGUAGE 'plpgsql' IMMUTABLE|] + data FactTablePopulateSelectSQL = FactTablePopulateSelectSQL { ftpsSelectCols :: ![(Text, Text)] , ftpsSelectTable :: !Text @@ -37,41 +64,42 @@ factTableUpdateSQL fact groupByColPrefix populateSelectSQL@FactTablePopulateSele extFactTableName = extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit - return $ for countDistinctCols $ \(FactCountDistinct scName cName) -> - let unqCol = fullColumnName fTableName (fromMaybe tablePKColName scName) <> "::text" + return . 
(\xs -> if null xs then xs else ilog2FunctionString : xs) + $ for countDistinctCols $ \(FactCountDistinct scName cName) -> + let unqCol = fullColumnName fTableName (fromMaybe tablePKColName scName) <> "::text" - bucketSelectCols = - [ ( "hashtext(" <> unqCol <> ") & " - <> Text.pack (show $ bucketCount settingFactCountDistinctErrorRate - 1) - , cName <> "_bnum" - ) - , ( "31 - floor(log(2, min(hashtext(" <> unqCol <> ") & ~(1 << 31))))::int" - , cName <> "_bhash" - ) - ] + bucketSelectCols = + [ ( "hashtext(" <> unqCol <> ") & " + <> Text.pack (show $ bucketCount settingFactCountDistinctErrorRate - 1) + , cName <> "_bnum" + ) + , ( "31 - ilog2(min(hashtext(" <> unqCol <> ") & ~(1 << 31)))" + , cName <> "_bhash" + ) + ] - selectSQL = toSelectSQL $ - populateSelectSQL - { ftpsSelectCols = filter ((`elem` ftpsGroupByCols) . snd) ftpsSelectCols ++ bucketSelectCols - , ftpsGroupByCols = ftpsGroupByCols ++ [ cName <> "_bnum" ] - , ftpsWhereClauses = ftpsWhereClauses ++ [ unqCol <> " IS NOT NULL" ] - } + selectSQL = toSelectSQL $ + populateSelectSQL + { ftpsSelectCols = filter ((`elem` ftpsGroupByCols) . snd) ftpsSelectCols ++ bucketSelectCols + , ftpsGroupByCols = ftpsGroupByCols ++ [ cName <> "_bnum" ] + , ftpsWhereClauses = ftpsWhereClauses ++ [ unqCol <> " IS NOT NULL" ] + } - aggSelectClause = - "json_object_agg(" <> cName <> "_bnum, " <> cName <> "_bhash) AS " <> cName + aggSelectClause = + "json_object_agg(" <> cName <> "_bnum, " <> cName <> "_bhash) AS " <> cName - in "UPDATE " <> extFactTableName - <> "\nSET " <> cName <> " = " <> fullColumnName "xyz" cName - <> "\nFROM (" - <> "\nSELECT " <> joinColumnNames (ftpsGroupByCols ++ [aggSelectClause]) - <> "\nFROM (\n" <> selectSQL <> "\n) zyx" - <> "\nGROUP BY \n" <> joinColumnNames ftpsGroupByCols - <> "\n) xyz" - <> "\n WHERE\n" - <> Text.intercalate "\nAND " - [ fullColumnName extFactTableName .fromJust . 
Text.stripPrefix groupByColPrefix $ col - <> " = " <> fullColumnName "xyz" col - | col <- ftpsGroupByCols ] + in "UPDATE " <> extFactTableName + <> "\nSET " <> cName <> " = " <> fullColumnName "xyz" cName + <> "\nFROM (" + <> "\nSELECT " <> joinColumnNames (ftpsGroupByCols ++ [aggSelectClause]) + <> "\nFROM (\n" <> selectSQL <> "\n) zyx" + <> "\nGROUP BY \n" <> joinColumnNames ftpsGroupByCols + <> "\n) xyz" + <> "\n WHERE\n" + <> Text.intercalate "\nAND " + [ fullColumnName extFactTableName .fromJust . Text.stripPrefix groupByColPrefix $ col + <> " = " <> fullColumnName "xyz" col + | col <- ftpsGroupByCols ] where bucketCount :: Double -> Integer bucketCount errorRate = diff --git a/src/Ringo/Validator.hs b/src/Ringo/Validator.hs index 8b77461..55ff06f 100644 --- a/src/Ringo/Validator.hs +++ b/src/Ringo/Validator.hs @@ -60,7 +60,7 @@ validateFact Fact {..} = do , let col = findColumn cName (tableColumns table) , isJust col , let cType = columnType $ fromJust col - , null . filter (`Text.isPrefixOf` cType) $ defaults ] + , not . any (`Text.isPrefixOf` cType) $ defaults ] return $ tableVs ++ parentVs ++ colVs ++ timeVs ++ notNullVs ++ typeDefaultVs where diff --git a/stack.yaml b/stack.yaml index 75d20fb..dcddbaa 100644 --- a/stack.yaml +++ b/stack.yaml @@ -1,7 +1,7 @@ # For more information, see: https://github.com/commercialhaskell/stack/blob/master/doc/yaml_configuration.md # Specifies the GHC version and set of packages available (e.g., lts-3.5, nightly-2015-09-21, ghc-7.10.2) -resolver: lts-3.19 +resolver: lts-3.20 # Local packages, usually specified by relative directory name packages: