Changes count distinct algo to use a specialized int log2 function.

pull/1/head
Abhinav Sarkar 2015-12-29 18:22:01 +05:30
parent 2d5a49f53e
commit 601eed9a3c
5 changed files with 67 additions and 40 deletions

View File

@ -29,7 +29,8 @@ library
build-depends: base >=4.7 && <5, build-depends: base >=4.7 && <5,
text >=1.2 && <1.3, text >=1.2 && <1.3,
containers >=0.5 && <0.6, containers >=0.5 && <0.6,
mtl >=2.1 && <2.3 mtl >=2.1 && <2.3,
raw-strings-qq >= 1.0 && <1.2
ghc-options: -Wall -fno-warn-unused-do-bind -funbox-strict-fields -fno-warn-orphans -O2 ghc-options: -Wall -fno-warn-unused-do-bind -funbox-strict-fields -fno-warn-orphans -O2
default-extensions: OverloadedStrings, RecordWildCards, ScopedTypeVariables, BangPatterns, default-extensions: OverloadedStrings, RecordWildCards, ScopedTypeVariables, BangPatterns,
TupleSections, CPP, NamedFieldPuns TupleSections, CPP, NamedFieldPuns

View File

@ -4,7 +4,6 @@ import qualified Data.Map as Map
import qualified Data.Text as Text import qualified Data.Text as Text
import Data.List (find) import Data.List (find)
import Data.Maybe (fromMaybe)
import Data.Monoid ((<>)) import Data.Monoid ((<>))
import Data.Text (Text) import Data.Text (Text)
@ -31,8 +30,7 @@ coalesceColumn defaults tName Column{..} =
fqColName = fullColumnName tName columnName fqColName = fullColumnName tName columnName
defVal colType = defVal colType =
fromMaybe (error $ "Default value not known for column type: " ++ Text.unpack colType) maybe (error $ "Default value not known for column type: " ++ Text.unpack colType) snd
. fmap snd
. find (\(k, _) -> k `Text.isPrefixOf` colType) . find (\(k, _) -> k `Text.isPrefixOf` colType)
. Map.toList . Map.toList
$ defaults $ defaults

View File

@ -1,3 +1,4 @@
{-# LANGUAGE QuasiQuotes #-}
module Ringo.Generator.Populate.Fact (factTablePopulateSQL) where module Ringo.Generator.Populate.Fact (factTablePopulateSQL) where
import qualified Data.Text as Text import qualified Data.Text as Text
@ -12,12 +13,38 @@ import Data.List (nub)
import Data.Maybe (fromJust, fromMaybe, mapMaybe, listToMaybe) import Data.Maybe (fromJust, fromMaybe, mapMaybe, listToMaybe)
import Data.Monoid ((<>)) import Data.Monoid ((<>))
import Data.Text (Text) import Data.Text (Text)
import Text.RawString.QQ (r)
import Ringo.Extractor.Internal import Ringo.Extractor.Internal
import Ringo.Generator.Internal import Ringo.Generator.Internal
import Ringo.Types import Ringo.Types
import Ringo.Utils import Ringo.Utils
-- | SQL statement that (re)creates the PL/pgSQL helper function
-- @ilog2(v integer) RETURNS integer@, declared @IMMUTABLE@.
--
-- The function computes an integer base-2 logarithm using only comparisons,
-- shifts and bitwise ORs: it tests @v@ against @0xFFFF@, @0xFF@, @0xF@ and
-- @0x3@ in turn, shifting @v@ right by 16, 8, 4 and 2 bits respectively when
-- the test succeeds and accumulating the shift amounts in @r@; the last bit
-- of the result comes from @v >> 1@. For @v = 0@ every test fails and the
-- function returns @0@.
--
-- The text is written with the raw-string quasi-quoter 'r', so the SQL body
-- (including the @$$@ delimiters and quotes) appears verbatim, without any
-- Haskell string escaping.
--
-- NOTE(review): in the @x'3'@ step the @ELSE@ branch reads @0 << 3@ while the
-- @THEN@ branch uses @1 << 1@. Zero shifted by any amount is still zero, so
-- the result is unaffected, but it looks like a copy-paste slip.
--
-- NOTE(review): negative inputs are not handled specially; the call site in
-- this module clears the sign bit with @& ~(1 << 31)@ before calling
-- @ilog2@ -- confirm any other callers do the same.
ilog2FunctionString :: Text
ilog2FunctionString = [r|CREATE OR REPLACE FUNCTION ilog2(v integer)
RETURNS integer AS
$$
DECLARE
r integer;
shift integer;
BEGIN
IF v > x'FFFF'::integer THEN r := 1 << 4; ELSE r := 0 << 4; END IF;
v := v >> r;
IF v > x'FF'::integer THEN shift := 1 << 3; ELSE shift := 0 << 3; END IF;
v := v >> shift;
r := r | shift;
IF v > x'F'::integer THEN shift := 1 << 2; ELSE shift := 0 << 2; END IF;
v := v >> shift;
r := r | shift;
IF v > x'3'::integer THEN shift := 1 << 1; ELSE shift := 0 << 3; END IF;
v := v >> shift;
r := r | shift;
r := r | (v >> 1);
RETURN r;
END;
$$
LANGUAGE 'plpgsql' IMMUTABLE|]
data FactTablePopulateSelectSQL = FactTablePopulateSelectSQL data FactTablePopulateSelectSQL = FactTablePopulateSelectSQL
{ ftpsSelectCols :: ![(Text, Text)] { ftpsSelectCols :: ![(Text, Text)]
, ftpsSelectTable :: !Text , ftpsSelectTable :: !Text
@ -37,7 +64,8 @@ factTableUpdateSQL fact groupByColPrefix populateSelectSQL@FactTablePopulateSele
extFactTableName = extFactTableName =
extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit extractedFactTableName settingFactPrefix settingFactInfix (factName fact) settingTimeUnit
return $ for countDistinctCols $ \(FactCountDistinct scName cName) -> return . (\xs -> if null xs then xs else ilog2FunctionString : xs)
$ for countDistinctCols $ \(FactCountDistinct scName cName) ->
let unqCol = fullColumnName fTableName (fromMaybe tablePKColName scName) <> "::text" let unqCol = fullColumnName fTableName (fromMaybe tablePKColName scName) <> "::text"
bucketSelectCols = bucketSelectCols =
@ -45,7 +73,7 @@ factTableUpdateSQL fact groupByColPrefix populateSelectSQL@FactTablePopulateSele
<> Text.pack (show $ bucketCount settingFactCountDistinctErrorRate - 1) <> Text.pack (show $ bucketCount settingFactCountDistinctErrorRate - 1)
, cName <> "_bnum" , cName <> "_bnum"
) )
, ( "31 - floor(log(2, min(hashtext(" <> unqCol <> ") & ~(1 << 31))))::int" , ( "31 - ilog2(min(hashtext(" <> unqCol <> ") & ~(1 << 31)))"
, cName <> "_bhash" , cName <> "_bhash"
) )
] ]

View File

@ -60,7 +60,7 @@ validateFact Fact {..} = do
, let col = findColumn cName (tableColumns table) , let col = findColumn cName (tableColumns table)
, isJust col , isJust col
, let cType = columnType $ fromJust col , let cType = columnType $ fromJust col
, null . filter (`Text.isPrefixOf` cType) $ defaults ] , not . any (`Text.isPrefixOf` cType) $ defaults ]
return $ tableVs ++ parentVs ++ colVs ++ timeVs ++ notNullVs ++ typeDefaultVs return $ tableVs ++ parentVs ++ colVs ++ timeVs ++ notNullVs ++ typeDefaultVs
where where

View File

@ -1,7 +1,7 @@
# For more information, see: https://github.com/commercialhaskell/stack/blob/master/doc/yaml_configuration.md # For more information, see: https://github.com/commercialhaskell/stack/blob/master/doc/yaml_configuration.md
# Specifies the GHC version and set of packages available (e.g., lts-3.5, nightly-2015-09-21, ghc-7.10.2) # Specifies the GHC version and set of packages available (e.g., lts-3.5, nightly-2015-09-21, ghc-7.10.2)
resolver: lts-3.19 resolver: lts-3.20
# Local packages, usually specified by relative directory name # Local packages, usually specified by relative directory name
packages: packages: