Changes the count-distinct algorithm to use a specialized integer log2 function.

pull/1/head
Abhinav Sarkar 2015-12-29 18:22:01 +05:30
parent 2d5a49f53e
commit 601eed9a3c
5 changed files with 67 additions and 40 deletions

View File

@ -26,10 +26,11 @@ library
Ringo.Generator.Populate.Dimension,
Ringo.Generator.Populate.Fact,
Ringo.Utils
build-depends: base >=4.7 && <5,
text >=1.2 && <1.3,
containers >=0.5 && <0.6,
mtl >=2.1 && <2.3
build-depends: base >=4.7 && <5,
text >=1.2 && <1.3,
containers >=0.5 && <0.6,
mtl >=2.1 && <2.3,
raw-strings-qq >= 1.0 && <1.2
ghc-options: -Wall -fno-warn-unused-do-bind -funbox-strict-fields -fno-warn-orphans -O2
default-extensions: OverloadedStrings, RecordWildCards, ScopedTypeVariables, BangPatterns,
TupleSections, CPP, NamedFieldPuns

View File

@ -4,7 +4,6 @@ import qualified Data.Map as Map
import qualified Data.Text as Text
import Data.List (find)
import Data.Maybe (fromMaybe)
import Data.Monoid ((<>))
import Data.Text (Text)
@ -31,8 +30,7 @@ coalesceColumn defaults tName Column{..} =
fqColName = fullColumnName tName columnName
defVal colType =
fromMaybe (error $ "Default value not known for column type: " ++ Text.unpack colType)
. fmap snd
maybe (error $ "Default value not known for column type: " ++ Text.unpack colType) snd
. find (\(k, _) -> k `Text.isPrefixOf` colType)
. Map.toList
$ defaults

View File

@ -1,3 +1,4 @@
{-# LANGUAGE QuasiQuotes #-}
module Ringo.Generator.Populate.Fact (factTablePopulateSQL) where
import qualified Data.Text as Text
@ -12,12 +13,38 @@ import Data.List (nub)
import Data.Maybe (fromJust, fromMaybe, mapMaybe, listToMaybe)
import Data.Monoid ((<>))
import Data.Text (Text)
import Text.RawString.QQ (r)
import Ringo.Extractor.Internal
import Ringo.Generator.Internal
import Ringo.Types
import Ringo.Utils
-- | SQL text that creates (or replaces) the PL/pgSQL function
-- @ilog2(v integer)@, declared @IMMUTABLE@ and returning an @integer@.
--
-- The function body locates the highest set bit of @v@ in four narrowing
-- steps: it compares @v@ against @0xFFFF@, @0xFF@, @0xF@ and @0x3@, and
-- whenever the comparison succeeds it shifts @v@ right by 16, 8, 4 or 2 bits
-- respectively and ORs that shift amount into the result @r@. The closing
-- @r | (v >> 1)@ folds in the last remaining bit. For a positive @v@ the
-- result is the integer (floor) base-2 logarithm; for @v = 0@ every
-- comparison fails and the function returns 0.
--
-- NOTE(review): the final @ELSE@ branch reads @0 << 3@ where the pattern of
-- the preceding branches suggests @0 << 1@. Both evaluate to 0, so the
-- computed value is unaffected.
--
-- The definition is a raw quasi-quote, so the text between the delimiters is
-- used verbatim, line breaks included; nothing inside it is a Haskell comment.
ilog2FunctionString :: Text
ilog2FunctionString = [r|CREATE OR REPLACE FUNCTION ilog2(v integer)
RETURNS integer AS
$$
DECLARE
r integer;
shift integer;
BEGIN
IF v > x'FFFF'::integer THEN r := 1 << 4; ELSE r := 0 << 4; END IF;
v := v >> r;
IF v > x'FF'::integer THEN shift := 1 << 3; ELSE shift := 0 << 3; END IF;
v := v >> shift;
r := r | shift;
IF v > x'F'::integer THEN shift := 1 << 2; ELSE shift := 0 << 2; END IF;
v := v >> shift;
r := r | shift;
IF v > x'3'::integer THEN shift := 1 << 1; ELSE shift := 0 << 3; END IF;
v := v >> shift;
r := r | shift;
r := r | (v >> 1);
RETURN r;
END;
$$
LANGUAGE 'plpgsql' IMMUTABLE|]
data FactTablePopulateSelectSQL = FactTablePopulateSelectSQL
{ ftpsSelectCols :: ![(Text, Text)]
, ftpsSelectTable :: !Text
@ -37,41 +64,42 @@ factTableUpdateSQL fact groupByColPrefix populateSelectSQL@FactTablePopulateSele
extFactTableName =
extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit
return $ for countDistinctCols $ \(FactCountDistinct scName cName) ->
let unqCol = fullColumnName fTableName (fromMaybe tablePKColName scName) <> "::text"
return . (\xs -> if null xs then xs else ilog2FunctionString : xs)
$ for countDistinctCols $ \(FactCountDistinct scName cName) ->
let unqCol = fullColumnName fTableName (fromMaybe tablePKColName scName) <> "::text"
bucketSelectCols =
[ ( "hashtext(" <> unqCol <> ") & "
<> Text.pack (show $ bucketCount settingFactCountDistinctErrorRate - 1)
, cName <> "_bnum"
)
, ( "31 - floor(log(2, min(hashtext(" <> unqCol <> ") & ~(1 << 31))))::int"
, cName <> "_bhash"
)
]
bucketSelectCols =
[ ( "hashtext(" <> unqCol <> ") & "
<> Text.pack (show $ bucketCount settingFactCountDistinctErrorRate - 1)
, cName <> "_bnum"
)
, ( "31 - ilog2(min(hashtext(" <> unqCol <> ") & ~(1 << 31)))"
, cName <> "_bhash"
)
]
selectSQL = toSelectSQL $
populateSelectSQL
{ ftpsSelectCols = filter ((`elem` ftpsGroupByCols) . snd) ftpsSelectCols ++ bucketSelectCols
, ftpsGroupByCols = ftpsGroupByCols ++ [ cName <> "_bnum" ]
, ftpsWhereClauses = ftpsWhereClauses ++ [ unqCol <> " IS NOT NULL" ]
}
selectSQL = toSelectSQL $
populateSelectSQL
{ ftpsSelectCols = filter ((`elem` ftpsGroupByCols) . snd) ftpsSelectCols ++ bucketSelectCols
, ftpsGroupByCols = ftpsGroupByCols ++ [ cName <> "_bnum" ]
, ftpsWhereClauses = ftpsWhereClauses ++ [ unqCol <> " IS NOT NULL" ]
}
aggSelectClause =
"json_object_agg(" <> cName <> "_bnum, " <> cName <> "_bhash) AS " <> cName
aggSelectClause =
"json_object_agg(" <> cName <> "_bnum, " <> cName <> "_bhash) AS " <> cName
in "UPDATE " <> extFactTableName
<> "\nSET " <> cName <> " = " <> fullColumnName "xyz" cName
<> "\nFROM ("
<> "\nSELECT " <> joinColumnNames (ftpsGroupByCols ++ [aggSelectClause])
<> "\nFROM (\n" <> selectSQL <> "\n) zyx"
<> "\nGROUP BY \n" <> joinColumnNames ftpsGroupByCols
<> "\n) xyz"
<> "\n WHERE\n"
<> Text.intercalate "\nAND "
[ fullColumnName extFactTableName .fromJust . Text.stripPrefix groupByColPrefix $ col
<> " = " <> fullColumnName "xyz" col
| col <- ftpsGroupByCols ]
in "UPDATE " <> extFactTableName
<> "\nSET " <> cName <> " = " <> fullColumnName "xyz" cName
<> "\nFROM ("
<> "\nSELECT " <> joinColumnNames (ftpsGroupByCols ++ [aggSelectClause])
<> "\nFROM (\n" <> selectSQL <> "\n) zyx"
<> "\nGROUP BY \n" <> joinColumnNames ftpsGroupByCols
<> "\n) xyz"
<> "\n WHERE\n"
<> Text.intercalate "\nAND "
[ fullColumnName extFactTableName .fromJust . Text.stripPrefix groupByColPrefix $ col
<> " = " <> fullColumnName "xyz" col
| col <- ftpsGroupByCols ]
where
bucketCount :: Double -> Integer
bucketCount errorRate =

View File

@ -60,7 +60,7 @@ validateFact Fact {..} = do
, let col = findColumn cName (tableColumns table)
, isJust col
, let cType = columnType $ fromJust col
, null . filter (`Text.isPrefixOf` cType) $ defaults ]
, not . any (`Text.isPrefixOf` cType) $ defaults ]
return $ tableVs ++ parentVs ++ colVs ++ timeVs ++ notNullVs ++ typeDefaultVs
where

View File

@ -1,7 +1,7 @@
# For more information, see: https://github.com/commercialhaskell/stack/blob/master/doc/yaml_configuration.md
# Specifies the GHC version and set of packages available (e.g., lts-3.5, nightly-2015-09-21, ghc-7.10.2)
resolver: lts-3.19
resolver: lts-3.20
# Local packages, usually specified by relative directory name
packages: