Changes the count-distinct algorithm to use a specialized integer log2 function.
This commit is contained in:
parent
2d5a49f53e
commit
601eed9a3c
@ -26,10 +26,11 @@ library
|
||||
Ringo.Generator.Populate.Dimension,
|
||||
Ringo.Generator.Populate.Fact,
|
||||
Ringo.Utils
|
||||
build-depends: base >=4.7 && <5,
|
||||
text >=1.2 && <1.3,
|
||||
containers >=0.5 && <0.6,
|
||||
mtl >=2.1 && <2.3
|
||||
build-depends: base >=4.7 && <5,
|
||||
text >=1.2 && <1.3,
|
||||
containers >=0.5 && <0.6,
|
||||
mtl >=2.1 && <2.3,
|
||||
raw-strings-qq >= 1.0 && <1.2
|
||||
ghc-options: -Wall -fno-warn-unused-do-bind -funbox-strict-fields -fno-warn-orphans -O2
|
||||
default-extensions: OverloadedStrings, RecordWildCards, ScopedTypeVariables, BangPatterns,
|
||||
TupleSections, CPP, NamedFieldPuns
|
||||
|
@ -4,7 +4,6 @@ import qualified Data.Map as Map
|
||||
import qualified Data.Text as Text
|
||||
|
||||
import Data.List (find)
|
||||
import Data.Maybe (fromMaybe)
|
||||
import Data.Monoid ((<>))
|
||||
import Data.Text (Text)
|
||||
|
||||
@ -31,8 +30,7 @@ coalesceColumn defaults tName Column{..} =
|
||||
fqColName = fullColumnName tName columnName
|
||||
|
||||
defVal colType =
|
||||
fromMaybe (error $ "Default value not known for column type: " ++ Text.unpack colType)
|
||||
. fmap snd
|
||||
maybe (error $ "Default value not known for column type: " ++ Text.unpack colType) snd
|
||||
. find (\(k, _) -> k `Text.isPrefixOf` colType)
|
||||
. Map.toList
|
||||
$ defaults
|
||||
|
@ -1,3 +1,4 @@
|
||||
{-# LANGUAGE QuasiQuotes #-}
|
||||
module Ringo.Generator.Populate.Fact (factTablePopulateSQL) where
|
||||
|
||||
import qualified Data.Text as Text
|
||||
@ -12,12 +13,38 @@ import Data.List (nub)
|
||||
import Data.Maybe (fromJust, fromMaybe, mapMaybe, listToMaybe)
|
||||
import Data.Monoid ((<>))
|
||||
import Data.Text (Text)
|
||||
import Text.RawString.QQ (r)
|
||||
|
||||
import Ringo.Extractor.Internal
|
||||
import Ringo.Generator.Internal
|
||||
import Ringo.Types
|
||||
import Ringo.Utils
|
||||
|
||||
-- | SQL statement that creates (or replaces) the PostgreSQL helper function
-- @ilog2(integer)@.
--
-- The PL/pgSQL body compares the argument against the thresholds @0xFFFF@,
-- @0xFF@, @0xF@ and @0x3@ in turn; whenever the argument exceeds a threshold
-- it is shifted right by 16, 8, 4 or 2 bits respectively and that shift
-- amount is OR-ed into the result. The final @v >> 1@ contributes the last
-- bit. For a positive argument the result is the index of the highest set
-- bit, i.e. the floor of the base-2 logarithm; for an argument of 0 the
-- result is 0.
--
-- NOTE(review): negative arguments are not handled specially by the body;
-- confirm that callers only pass non-negative values.
--
-- The text is produced by the @r@ raw-string quasi-quoter, so everything
-- between the delimiters is part of the emitted SQL verbatim. Do not add
-- Haskell comments inside it.
ilog2FunctionString :: Text
ilog2FunctionString = [r|CREATE OR REPLACE FUNCTION ilog2(v integer)
RETURNS integer AS
$$
DECLARE
r integer;
shift integer;
BEGIN
IF v > x'FFFF'::integer THEN r := 1 << 4; ELSE r := 0 << 4; END IF;
v := v >> r;
IF v > x'FF'::integer THEN shift := 1 << 3; ELSE shift := 0 << 3; END IF;
v := v >> shift;
r := r | shift;
IF v > x'F'::integer THEN shift := 1 << 2; ELSE shift := 0 << 2; END IF;
v := v >> shift;
r := r | shift;
IF v > x'3'::integer THEN shift := 1 << 1; ELSE shift := 0 << 1; END IF;
v := v >> shift;
r := r | shift;
r := r | (v >> 1);
RETURN r;
END;
$$
LANGUAGE plpgsql IMMUTABLE|]
|
||||
|
||||
data FactTablePopulateSelectSQL = FactTablePopulateSelectSQL
|
||||
{ ftpsSelectCols :: ![(Text, Text)]
|
||||
, ftpsSelectTable :: !Text
|
||||
@ -37,41 +64,42 @@ factTableUpdateSQL fact groupByColPrefix populateSelectSQL@FactTablePopulateSele
|
||||
extFactTableName =
|
||||
extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit
|
||||
|
||||
return $ for countDistinctCols $ \(FactCountDistinct scName cName) ->
|
||||
let unqCol = fullColumnName fTableName (fromMaybe tablePKColName scName) <> "::text"
|
||||
return . (\xs -> if null xs then xs else ilog2FunctionString : xs)
|
||||
$ for countDistinctCols $ \(FactCountDistinct scName cName) ->
|
||||
let unqCol = fullColumnName fTableName (fromMaybe tablePKColName scName) <> "::text"
|
||||
|
||||
bucketSelectCols =
|
||||
[ ( "hashtext(" <> unqCol <> ") & "
|
||||
<> Text.pack (show $ bucketCount settingFactCountDistinctErrorRate - 1)
|
||||
, cName <> "_bnum"
|
||||
)
|
||||
, ( "31 - floor(log(2, min(hashtext(" <> unqCol <> ") & ~(1 << 31))))::int"
|
||||
, cName <> "_bhash"
|
||||
)
|
||||
]
|
||||
bucketSelectCols =
|
||||
[ ( "hashtext(" <> unqCol <> ") & "
|
||||
<> Text.pack (show $ bucketCount settingFactCountDistinctErrorRate - 1)
|
||||
, cName <> "_bnum"
|
||||
)
|
||||
, ( "31 - ilog2(min(hashtext(" <> unqCol <> ") & ~(1 << 31)))"
|
||||
, cName <> "_bhash"
|
||||
)
|
||||
]
|
||||
|
||||
selectSQL = toSelectSQL $
|
||||
populateSelectSQL
|
||||
{ ftpsSelectCols = filter ((`elem` ftpsGroupByCols) . snd) ftpsSelectCols ++ bucketSelectCols
|
||||
, ftpsGroupByCols = ftpsGroupByCols ++ [ cName <> "_bnum" ]
|
||||
, ftpsWhereClauses = ftpsWhereClauses ++ [ unqCol <> " IS NOT NULL" ]
|
||||
}
|
||||
selectSQL = toSelectSQL $
|
||||
populateSelectSQL
|
||||
{ ftpsSelectCols = filter ((`elem` ftpsGroupByCols) . snd) ftpsSelectCols ++ bucketSelectCols
|
||||
, ftpsGroupByCols = ftpsGroupByCols ++ [ cName <> "_bnum" ]
|
||||
, ftpsWhereClauses = ftpsWhereClauses ++ [ unqCol <> " IS NOT NULL" ]
|
||||
}
|
||||
|
||||
aggSelectClause =
|
||||
"json_object_agg(" <> cName <> "_bnum, " <> cName <> "_bhash) AS " <> cName
|
||||
aggSelectClause =
|
||||
"json_object_agg(" <> cName <> "_bnum, " <> cName <> "_bhash) AS " <> cName
|
||||
|
||||
in "UPDATE " <> extFactTableName
|
||||
<> "\nSET " <> cName <> " = " <> fullColumnName "xyz" cName
|
||||
<> "\nFROM ("
|
||||
<> "\nSELECT " <> joinColumnNames (ftpsGroupByCols ++ [aggSelectClause])
|
||||
<> "\nFROM (\n" <> selectSQL <> "\n) zyx"
|
||||
<> "\nGROUP BY \n" <> joinColumnNames ftpsGroupByCols
|
||||
<> "\n) xyz"
|
||||
<> "\n WHERE\n"
|
||||
<> Text.intercalate "\nAND "
|
||||
[ fullColumnName extFactTableName .fromJust . Text.stripPrefix groupByColPrefix $ col
|
||||
<> " = " <> fullColumnName "xyz" col
|
||||
| col <- ftpsGroupByCols ]
|
||||
in "UPDATE " <> extFactTableName
|
||||
<> "\nSET " <> cName <> " = " <> fullColumnName "xyz" cName
|
||||
<> "\nFROM ("
|
||||
<> "\nSELECT " <> joinColumnNames (ftpsGroupByCols ++ [aggSelectClause])
|
||||
<> "\nFROM (\n" <> selectSQL <> "\n) zyx"
|
||||
<> "\nGROUP BY \n" <> joinColumnNames ftpsGroupByCols
|
||||
<> "\n) xyz"
|
||||
<> "\n WHERE\n"
|
||||
<> Text.intercalate "\nAND "
|
||||
[ fullColumnName extFactTableName .fromJust . Text.stripPrefix groupByColPrefix $ col
|
||||
<> " = " <> fullColumnName "xyz" col
|
||||
| col <- ftpsGroupByCols ]
|
||||
where
|
||||
bucketCount :: Double -> Integer
|
||||
bucketCount errorRate =
|
||||
|
@ -60,7 +60,7 @@ validateFact Fact {..} = do
|
||||
, let col = findColumn cName (tableColumns table)
|
||||
, isJust col
|
||||
, let cType = columnType $ fromJust col
|
||||
, null . filter (`Text.isPrefixOf` cType) $ defaults ]
|
||||
, not . any (`Text.isPrefixOf` cType) $ defaults ]
|
||||
|
||||
return $ tableVs ++ parentVs ++ colVs ++ timeVs ++ notNullVs ++ typeDefaultVs
|
||||
where
|
||||
|
@ -1,7 +1,7 @@
|
||||
# For more information, see: https://github.com/commercialhaskell/stack/blob/master/doc/yaml_configuration.md
|
||||
|
||||
# Specifies the GHC version and set of packages available (e.g., lts-3.5, nightly-2015-09-21, ghc-7.10.2)
|
||||
resolver: lts-3.19
|
||||
resolver: lts-3.20
|
||||
|
||||
# Local packages, usually specified by relative directory name
|
||||
packages:
|
||||
|
Loading…
Reference in New Issue
Block a user