Tool to transform OLTP database schemas to OLAP database schemas automatically
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
ringo/ringo/src/Ringo/Types/Internal.hs

464 lines
20 KiB

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE KindSignatures #-}
module Ringo.Types.Internal where
import qualified Data.Text as Text
import Data.Map (Map)
import Data.Monoid ((<>))
import Data.Text (Text)
-- | Renders a list of column names as a parenthesised, comma-separated
-- 'String', e.g. @(a, b, c)@. An empty list renders as @()@.
showColNames :: [Text] -> String
showColNames names = Text.unpack (Text.concat [open, joined, close])
  where
    open   = Text.singleton '('
    joined = Text.intercalate (Text.pack ", ") names
    close  = Text.singleton ')'
-- | Name of a 'Column'.
type ColumnName = Text

-- | Type of a 'Column', kept as free-form text (the examples in this module
-- use values such as @"integer"@ and @"varchar(100)"@).
type ColumnType = Text

-- | Name of a 'Table'.
type TableName = Text
-- | Nullness of a 'Column'.
--
-- The 'Show' instance renders the constructors as @NULL@ and @NOT NULL@.
data Nullable =
    Null -- ^ the column is nullable
  | NotNull -- ^ the column is not nullable
  deriving (Eq, Enum)
-- | Renders 'Null' as @NULL@ and 'NotNull' as @NOT NULL@.
instance Show Nullable where
  show nullness = case nullness of
    Null    -> "NULL"
    NotNull -> "NOT NULL"
-- | A column of a 'Table'.
--
-- All three fields are strict.
data Column = Column
  { columnName :: !ColumnName -- ^ Name of the column
  , columnType :: !ColumnType -- ^ Type of the column
  , columnNullable :: !Nullable -- ^ Nullness of the column
  } deriving (Eq)
-- | Renders a column as the word @Column@ followed by its name, its type and
-- its nullness, separated by single spaces.
instance Show Column where
  show (Column name typ nullable) =
    unwords ["Column", Text.unpack name, Text.unpack typ, show nullable]
-- | A constraint on a 'Table'.
data TableConstraint =
    -- | A primary key constraint
    PrimaryKey
    { tableConstrPrimaryKeyColumn :: !ColumnName -- ^ Name of the primary key column
    }
    -- | A unique key constraint
  | UniqueKey
    { tableConstrUniqueKeyColumns :: ![ColumnName] -- ^ Names of the unique key columns
    }
    -- | A foreign key constraint
  | ForeignKey
    { -- | Name of the table referenced by the foreign key
      tableConstrForeignKeyTable :: !TableName
      -- | Mapping of the columns as an associative list for the foreign key.
      --   keys: this table's column names, values: referenced table's column names.
    , tableConstrForeignKeyColumnMapping :: ![(ColumnName, ColumnName)]
    }
  deriving (Eq)
-- | Renders a constraint as its constructor name followed by its columns.
-- A foreign key is shown as the local columns, the referenced table name and
-- the referenced columns, in that order.
instance Show TableConstraint where
  show constraint = case constraint of
    PrimaryKey keyColumn -> "PrimaryKey " ++ Text.unpack keyColumn
    UniqueKey keyColumns -> "UniqueKey " ++ showColNames keyColumns
    ForeignKey refTable mapping ->
      -- Split the association list into this table's columns and the
      -- referenced table's columns.
      let (localCols, refCols) = unzip mapping
      in unwords
           [ "ForeignKey"
           , showColNames localCols
           , Text.unpack refTable
           , showColNames refCols
           ]
-- | A table representing a physical table in the database
--
--   The following example represents a set of tables from a multi-publisher blog system:
--
-- >>> :set -XOverloadedStrings
-- >>> :{
-- let publishersTable =
--       Table { tableName = "publishers"
--             , tableColumns =
--                 [ Column "id" "integer" NotNull
--                 , Column "name" "varchar(100)" NotNull
--                 ]
--             , tableConstraints =
--                 [ PrimaryKey "id"
--                 , UniqueKey [ "name" ]
--                 ]
--             }
--     usersTable =
--       Table { tableName = "users"
--             , tableColumns =
--                 [ Column "id" "uuid" NotNull
--                 , Column "created_at" "timestamp" NotNull
--                 , Column "pub_id" "integer" NotNull
--                 , Column "username" "varchar(100)" NotNull
--                 , Column "email" "varchar(500)" Null
--                 ]
--             , tableConstraints =
--                 [ PrimaryKey "id"
--                 , ForeignKey "publishers" [ ("pub_id", "id") ]
--                 , UniqueKey [ "pub_id", "username" ]
--                 ]
--             }
--     -- This table records the time spent by each user on a post
--     postViewEventsTable =
--       Table { tableName = "post_view_events"
--             , tableColumns =
--                 [ Column "id" "uuid" NotNull
--                 , Column "created_at" "timestamp" NotNull
--                 , Column "user_id" "uuid" NotNull
--                 , Column "pub_id" "integer" NotNull
--                 , Column "post_id" "uuid" NotNull
--                 , Column "geo_city" "varchar(100)" Null
--                 , Column "geo_country" "varchar(100)" Null
--                 , Column "device_version" "varchar(25)" Null
--                 , Column "device_name" "varchar(100)" Null
--                 , Column "device_type" "varchar(50)" Null
--                 , Column "time_spent" "integer" NotNull
--                 ]
--             , tableConstraints =
--                 [ PrimaryKey "id"
--                 , ForeignKey "users" [ ("user_id", "id") ]
--                 , ForeignKey "publishers" [ ("pub_id", "id") ]
--                 ]
--             }
-- :}
data Table = Table
  { tableName :: !TableName -- ^ Name of the table
  , tableColumns :: ![Column] -- ^ A list of the columns in the table
  , tableConstraints :: ![TableConstraint] -- ^ A list of the constraints on the table
  } deriving (Eq)
-- | Renders a table as newline-terminated entries: a @Table name@ header,
-- then every column, then every constraint.
instance Show Table where
  show (Table name columns constraints) =
    unlines (header : columnLines ++ constraintLines)
    where
      header          = "Table " ++ Text.unpack name
      columnLines     = map show columns
      constraintLines = map show constraints
-- | Type of a 'FactColumnType'.
--
-- The constructors are used at the type level, as the index of
-- 'FactColumnType', to record which parameter a fact column type carries.
data FactColumnKind =
    FCKNone -- ^ A FactColumnType without any parameters
  | FCKTargetTable -- ^ A FactColumnType with 'factColTargetTable' as the only parameter
  | FCKMaybeSourceColumn -- ^ A FactColumnType with 'factColMaybeSourceColumn' as the only parameter
  | FCKSourceColumn -- ^ A FactColumnType with 'factColSourceColumn' as the only parameter
-- NOTE(review): the per-constructor docs are guarded by the base version, presumably
-- because older toolchains do not accept Haddock comments on GADT constructors; for
-- those the same text is folded into the type-level comment instead. TODO confirm.
#if MIN_VERSION_base(4,9,0)
-- | Type of a fact column
--
--   The type index records which parameter the column type carries; see 'FactColumnKind'.
#else
-- | Type of a fact column
--
--   The type index records which parameter the column type carries; see 'FactColumnKind'.
--
--   'DimTime':
--     A fact column which contains time dimension data (e.g. @created_at@). This is not extracted
--     as a dimension table and instead just converted to an int which depends on 'settingTimeUnit'.
--     Every fact must have one of these.
--
--   'NoDimId':
--     A fact column which contains an id of a dimension which does not need to be extracted as a table
--     and does not exist already.
--
--   'TenantId':
--     A fact column which contains an id of a tenant in a multi-tenant database (e.g. @organization_id@).
--     This is not extracted as a dimension table.
--
--   'DimId':
--     A fact column which contains an id of a dimension which does not need to be extracted as a table
--     and exists already.
--
--   'DimVal':
--     A fact column which contains a value which needs to be extracted to a dimension table.
--     Multiple DimVal fact columns can be extracted to the same dimension table.
--
--   'FactCount':
--     A fact column which will contain the count of the rows (@count(*)@) or count of a source column
--     if provided.
--
--   'FactCountDistinct':
--     A fact column which will contain the count of the unique values of a source column if provided
--     or the primary key of the table.
--
--   'FactSum':
--     A fact column which will contain the sum of the values of the provided source column.
--
--   'FactAverage':
--     A fact column which will contain the average of the values of the provided source column.
--
--   'FactMax':
--     A fact column which will contain the maximum of the values of the provided source column.
--
--   'FactMin':
--     A fact column which will contain the minimum of the values of the provided source column.
#endif
data FactColumnType (a :: FactColumnKind) where
#if MIN_VERSION_base(4,9,0)
  -- | A fact column which contains time dimension data (e.g. @created_at@). This is not extracted
  --   as a dimension table and instead just converted to an int which depends on 'settingTimeUnit'.
  --   Every fact must have one of these.
#endif
  DimTime :: FactColumnType 'FCKNone
#if MIN_VERSION_base(4,9,0)
  -- | A fact column which contains an id of a dimension which does not need to be extracted as a table
  --   and does not exist already.
#endif
  NoDimId :: FactColumnType 'FCKNone
#if MIN_VERSION_base(4,9,0)
  -- | A fact column which contains an id of a tenant in a multi-tenant database (e.g. @organization_id@).
  --   This is not extracted as a dimension table.
#endif
  TenantId :: FactColumnType 'FCKNone
#if MIN_VERSION_base(4,9,0)
  -- | A fact column which contains an id of a dimension which does not need to be extracted as a table
  --   and exists already.
#endif
  DimId :: { factColTargetTable :: !TableName -- ^ Name of the target dimension table
           } -> FactColumnType 'FCKTargetTable
#if MIN_VERSION_base(4,9,0)
  -- | A fact column which contains a value which needs to be extracted to a dimension table.
  --   Multiple DimVal fact columns can be extracted to the same dimension table.
#endif
  DimVal :: { factColTargetTable :: !TableName } -> FactColumnType 'FCKTargetTable
#if MIN_VERSION_base(4,9,0)
  -- | A fact column which will contain the count of the rows (@count(*)@) or count of a source column
  --   if provided.
#endif
  FactCount :: { factColMaybeSourceColumn :: !(Maybe ColumnName) -- ^ Name of the optional source column
               } -> FactColumnType 'FCKMaybeSourceColumn
#if MIN_VERSION_base(4,9,0)
  -- | A fact column which will contain the count of the unique values of a source column if provided
  --   or the primary key of the table.
#endif
  FactCountDistinct :: { factColMaybeSourceColumn :: !(Maybe ColumnName) } -> FactColumnType 'FCKMaybeSourceColumn
#if MIN_VERSION_base(4,9,0)
  -- | A fact column which will contain the sum of the values of the provided source column.
#endif
  FactSum :: { factColSourceColumn :: !ColumnName -- ^ Name of the source column
             } -> FactColumnType 'FCKSourceColumn
#if MIN_VERSION_base(4,9,0)
  -- | A fact column which will contain the average of the values of the provided source column.
#endif
  FactAverage :: { factColSourceColumn :: !ColumnName } -> FactColumnType 'FCKSourceColumn
#if MIN_VERSION_base(4,9,0)
  -- | A fact column which will contain the maximum of the values of the provided source column.
#endif
  FactMax :: { factColSourceColumn :: !ColumnName } -> FactColumnType 'FCKSourceColumn
#if MIN_VERSION_base(4,9,0)
  -- | A fact column which will contain the minimum of the values of the provided source column.
#endif
  FactMin :: { factColSourceColumn :: !ColumnName } -> FactColumnType 'FCKSourceColumn

-- The 'Show' instance is derived with a standalone declaration; the constructors
-- above refine the type index, which an ordinary @deriving@ clause does not support.
deriving instance Show (FactColumnType a)
-- | A column in a fact table.
--
--   The index of the wrapped 'FactColumnType' is existentially quantified, so
--   fact columns with differently indexed types share this one type (e.g. in
--   the 'factColumns' list). Note that 'factColType' carries no strictness
--   annotation, unlike 'factColTargetColumn'.
data FactColumn = forall a. FactColumn
  { -- | Name of the fact column in the generated table
    factColTargetColumn :: !ColumnName
    -- | Type of the fact column
  , factColType :: FactColumnType a }

-- The 'Show' instance is derived with a standalone declaration.
deriving instance Show FactColumn
-- | A fact is a table that records measurements or metrics for a specific event
--
--   The following represents a fact for the same multi-publisher blog system:
--
-- >>> :{
-- let postFact =
--       Fact { factName = "post_views"
--            , factTableName = "post_view_events"
--            , factTablePersistent = True
--            , factParentNames = []
--            , factColumns =
--                [ FactColumn "created_at" $ DimTime
--                , FactColumn "publisher_id" $ TenantId
--                , FactColumn "user_id" $ DimId "users"
--                , FactColumn "post_id" $ NoDimId
--                , FactColumn "geo_city" $ DimVal "geo"
--                , FactColumn "geo_country" $ DimVal "geo"
--                , FactColumn "device_name" $ DimVal "device"
--                , FactColumn "device_type" $ DimVal "device"
--                , FactColumn "count" $ FactCount Nothing
--                , FactColumn "unq_device_count" $ FactCountDistinct $ Just "device_name"
--                , FactColumn "time_spent" $ FactSum "time_spent"
--                , FactColumn "max_time_spent" $ FactMax "time_spent"
--                ]
--            }
-- :}
data Fact = Fact
  { -- | Name of the fact
    factName :: !TableName
    -- | Name of the table from which the fact is derived
  , factTableName :: !TableName
    -- | If true, the generated fact table is actually created; if false, the generated
    --   fact table is just used for intermediate computations and is not actually created
  , factTablePersistent :: !Bool
    -- | Names of the parent facts
  , factParentNames :: ![TableName]
    -- | A list of fact columns in the fact
  , factColumns :: ![FactColumn]
  } deriving (Show)
-- | Returns the name of the optional source column of a fact column.
--
-- * 'DimTime', 'NoDimId', 'TenantId', 'DimId' and 'DimVal' yield the fact
--   column's own target column name.
-- * 'FactCount' and 'FactCountDistinct' yield their optional source column,
--   which may be 'Nothing'.
-- * 'FactSum', 'FactAverage', 'FactMax' and 'FactMin' yield their source column.
factSourceColumnName :: FactColumn -> Maybe ColumnName
factSourceColumnName (FactColumn targetColumn colType) = case colType of
  DimTime                       -> ownColumn
  NoDimId                       -> ownColumn
  TenantId                      -> ownColumn
  DimId _                       -> ownColumn
  DimVal _                      -> ownColumn
  FactCount maybeSource         -> maybeSource
  FactCountDistinct maybeSource -> maybeSource
  FactSum source                -> Just source
  FactAverage source            -> Just source
  FactMax source                -> Just source
  FactMin source                -> Just source
  where
    -- The fact column's own name, used when the type names no source column.
    ownColumn = Just targetColumn
-- | Units of time, listed from the shortest to the longest
-- (see 'timeUnitToSeconds' for their lengths in seconds).
data TimeUnit = Second | Minute | Hour | Day | Week
  deriving (Eq, Enum, Show, Read)
-- | Returns the name of a time unit: its constructor name, lower-cased,
-- as 'Text' (e.g. @minute@ for 'Minute').
timeUnitName :: TimeUnit -> Text
timeUnitName unit = Text.toLower (Text.pack (show unit))
-- | Returns the number of seconds in a time unit.
--
-- Each unit is expressed as a multiple of the next smaller one:
-- 60 seconds, 60 minutes, 24 hours and 7 days.
timeUnitToSeconds :: TimeUnit -> Int
timeUnitToSeconds unit = case unit of
  Second -> second
  Minute -> minute
  Hour   -> hour
  Day    -> day
  Week   -> week
  where
    second = 1
    minute = 60 * second
    hour   = 60 * minute
    day    = 24 * hour
    week   = 7 * day
-- | Global settings for the library.
--
--   'defSettings' holds the default value quoted in each field's documentation.
data Settings = Settings
  { -- | Prefix for the names of the generated dimension tables. Default: "dim_".
    settingDimPrefix :: !Text
    -- | Prefix for the names of the generated fact tables. Default: "fact_".
  , settingFactPrefix :: !Text
    -- | Infix for the names of the generated fact tables. Default: "_by_".
  , settingFactInfix :: !Text
    -- | Time unit used to summarize the fact table data. Default: 'Minute'.
  , settingTimeUnit :: !TimeUnit
    -- | Suffix for the names of the generated average-count fact columns. Default: "_count".
  , settingAvgCountColumnSuffix :: !Text
    -- | Suffix for the names of the generated average-sum fact columns. Default: "_sum".
  , settingAvgSumColumnSuffix :: !Text
    -- | Name of the id columns of the generated dimension tables. Default: "id".
  , settingDimTableIdColumnName :: !Text
    -- | Type of the id columns of the generated dimension tables. Default: "serial".
  , settingDimTableIdColumnType :: !Text
    -- | Type of the count fact columns of the generated dimension tables. Default: "integer".
  , settingFactCountColumnType :: !Text
    -- | Maximum error rate for the hyperloglog algorithm for computing
    --   count distinct fact columns of the generated dimension tables. Default: 0.05.
  , settingFactCountDistinctErrorRate :: !Double
    -- | Name of the generated JSON file containing the dependency graph.
    --   Default: "dependencies.json".
  , settingDependenciesJSONFileName :: !Text
    -- | Name of the generated JSON file containing the list of names of the generated
    --   fact tables. Default: "facts.json".
  , settingFactsJSONFileName :: !Text
    -- | Name of the generated JSON file containing the list of names of the generated
    --   dimension tables. Default: "dimensions.json".
  , settingDimensionsJSONFileName :: !Text
    -- | Value to coalesce the missing foreign key id column values to in the generated
    --   fact tables. Default: -1.
  , settingForeignKeyIdCoalesceValue :: !Int
    -- | Suffix template for names of all the generated tables. Default: "{{suff}}".
  , settingTableNameSuffixTemplate :: !Text
  } deriving (Eq, Show)
-- | 'Settings' with every field set to its default value.
defSettings :: Settings
defSettings = Settings
  { -- Naming of the generated tables
    settingDimPrefix                  = "dim_"
  , settingFactPrefix                 = "fact_"
  , settingFactInfix                  = "_by_"
  , settingTableNameSuffixTemplate    = "{{suff}}"
    -- Summarization and value handling
  , settingTimeUnit                   = Minute
  , settingFactCountDistinctErrorRate = 0.05
  , settingForeignKeyIdCoalesceValue  = -1
    -- Names and types of the generated columns
  , settingAvgCountColumnSuffix       = "_count"
  , settingAvgSumColumnSuffix         = "_sum"
  , settingDimTableIdColumnName       = "id"
  , settingDimTableIdColumnType       = "serial"
  , settingFactCountColumnType        = "integer"
    -- Names of the generated JSON files
  , settingDependenciesJSONFileName   = "dependencies.json"
  , settingFactsJSONFileName          = "facts.json"
  , settingDimensionsJSONFileName     = "dimensions.json"
  }
-- | Errors possible while validating the config.
--
--   Each constructor carries the name(s) of the offending item.
data ValidationError =
    -- | When referencing a table which is missing from the config
    MissingTable !TableName
    -- | When referencing a fact which is missing from the config
  | MissingFact !TableName
    -- | When referencing a column which is missing from the config
  | MissingColumn !TableName !ColumnName
    -- | When a fact has no 'DimTime' columns
  | MissingTimeColumn !TableName
    -- | When a 'DimTime' fact column of a fact is nullable
  | MissingNotNullConstraint !TableName !ColumnName
    -- | When the default value of a type is missing from the config
  | MissingTypeDefault !Text
    -- | When there are multiple tables with the same name in the config
  | DuplicateTable !TableName
    -- | When there are multiple facts with the same name in the config
  | DuplicateFact !TableName
    -- | When there are multiple columns with the same name in a table in the config
  | DuplicateColumn !TableName !ColumnName
    -- | When there are multiple dimensions with the same name in the config
  | DuplicateDimension !TableName
  deriving (Eq, Show)
-- | A mapping of SQL types to their default values, used to coalesce null column values in
--   the generated dimension and fact tables
type TypeDefaults = Map Text Text
-- | The config for the library.
--
--   The fields are underscore-prefixed; 'configTables', 'configFacts',
--   'configSettings' and 'configTypeDefaults' read them.
data Config = Config
  { _configTables :: ![Table] -- ^ The source tables
  , _configFacts :: ![Fact] -- ^ The facts to be generated
  , _configSettings :: !Settings -- ^ The settings
  , _configTypeDefaults :: !TypeDefaults -- ^ The defaults for the SQL types
  } deriving (Show)
-- | Returns the source tables held in the config.
configTables :: Config -> [Table]
configTables Config { _configTables = tables } = tables
-- | Returns the facts to be generated, as held in the config.
configFacts :: Config -> [Fact]
configFacts Config { _configFacts = facts } = facts
-- | Returns the settings held in the config.
configSettings :: Config -> Settings
configSettings Config { _configSettings = settings } = settings
-- | Returns the defaults for the SQL types held in the config.
configTypeDefaults :: Config -> TypeDefaults
configTypeDefaults Config { _configTypeDefaults = typeDefaults } = typeDefaults
-- | The mode for population of the generated tables; used to switch the SQL for table population
data TablePopulationMode = FullPopulation -- ^ Populating the tables fully, starting with empty ones
                         | IncrementalPopulation -- ^ Populating the tables incrementally
                         deriving (Eq, Show)
-- | The dependency graph of the generated tables describing the order in which they have to be populated.
--
--   Maps a table name to a list of table names.
--   NOTE(review): presumably each key maps to the tables it depends on — confirm the direction
--   against the code that builds this map.
type Dependencies = Map TableName [TableName]