Commit

feat: data representations allow custom parsing and formatting of API fields.

See PR #2523. Most notable code changes:

- Load data representation casts into schema cache.
- Data representations for reads, filters, inserts, updates, views, over joins.
- `CoercibleField` represents name references in queries where coercion may be needed.
- `ResolverContext` facilitates field resolution during planning.
- The planner 'resolves' names in the API query and pairs them with any implicit conversions to be used in the query-builder stage (see the sketch after this list).
- Tests for all of the above.
- More consistent naming (TypedX -> CoercibleX).
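
For illustration, a minimal sketch of what that resolution produces (the `items` table and `col` column are hypothetical; `test.color` is the parser function from the `IN` example further down):

```sql
-- GET /items?col=eq.CAFE12
-- With a text -> color cast registered for col's type, the planner resolves
-- `col` to a CoercibleField carrying that parser, and the query builder
-- emits roughly:
SELECT "test"."items".*
  FROM "test"."items"
 WHERE "test"."items"."col" = test.color('CAFE12');
```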

New: unit tests for more data representation use cases; helpful as examples as well.

New: update CHANGELOG with data representations feature description.

Fixed failing idempotence test.

New: replace date formatter test with one that does something.

Fixup: inadvertent CHANGELOG change after rebase.

Cleanup: `tfName` -> `cfName` and related.

Document what IRType means.

Formatting.

New: use a subquery to interpret `IN` literals requiring data rep transformation.

- With the previous method, very long queries such as `ANY (ARRAY[test.color('000100'), test.color('CAFE12'), test.color('01E240'), ...` could be generated. Consider the case where the parser function name is 45 characters and there are a hundred literals: that's 4.5kB of SQL for the function name alone!
- The new version uses `unnest`: `ANY (SELECT test.color(unnest('{000100,CAFE12,01E240,...}'::text[])))` produces a much shorter query.
- This is likely more performant and, either way, much more readable and debuggable in the logs.
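
Side by side, a hedged sketch of the two query shapes (the hex literals and the `test.color` parser are from the example above; the `items` table and `col` column are hypothetical):

```sql
-- Previous method: one parser call per literal, so the SQL grows linearly
-- with the literal count times the parser's name length.
SELECT * FROM test.items
 WHERE col = ANY (ARRAY[test.color('000100'), test.color('CAFE12'), test.color('01E240')]);

-- New method: a single array literal, unnested and parsed in a subquery.
SELECT * FROM test.items
 WHERE col = ANY (SELECT test.color(unnest('{000100,CAFE12,01E240}'::text[])));
```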
aljungberg authored and steve-chavez committed Jun 29, 2023
1 parent 078c6ec commit 0a1564b
Showing 17 changed files with 1,046 additions and 164 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,22 @@ This project adheres to [Semantic Versioning](http://semver.org/).

## Unreleased

### Added

- #2523, Data representations - @aljungberg
+ Allows for flexible API output formatting and input parsing on a per-column type basis using regular SQL functions configured in the database
+ Enables greater flexibility in the form and shape of your APIs, both for output and input, making PostgREST a more versatile general-purpose API server
+ Examples include base64 encoding/decoding your binary data (like a `bytea` column containing an image), choosing whether to present a timestamp column as seconds since the Unix epoch or as an ISO 8601 string, or representing fixed-precision decimals as strings, not doubles, to preserve precision
+ ...and accept the same in `POST/PUT/PATCH` by configuring the reverse transformation(s)
+ Other use-cases include custom representation of enums, arrays, nested objects, CSS hex colour strings, gzip compressed fields, metric to imperial conversions, and much more
+ Works when using the `select` parameter to select only a subset of columns, embedding through complex joins, renaming fields, with views and computed columns
+ Works when filtering on a formatted column without extra indexes by parsing to the canonical representation
+ Works for data `RETURNING` operations, such as requesting the full body in a POST/PUT/PATCH with `Prefer: return=representation`
+ Works for batch updates and inserts
+ Completely optional: define the functions in the database and they will be used automatically everywhere
+ Data representations preserve the ability to write to the original column and require no extra storage or complex triggers (compared to using `GENERATED ALWAYS` columns)
+ Note: data representations require Postgres 10 (Postgres 11 if using `IN` predicates); data representations are not implemented for RPC

### Fixed

- #2821, Fix OPTIONS not accepting all available media types - @steve-chavez
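For context, a minimal sketch of the configuration style the data representations entry above describes (all names are illustrative, not taken from this commit): a domain marks which columns participate, and an implicit cast defines the output mapping.

```sql
-- Canonical storage: a 6-digit hex string such as 'CAFE12'.
create domain color as text check (value ~ '^[0-9a-fA-F]{6}$');

-- Output formatting: how a color value appears in JSON responses.
create or replace function json(color) returns json as $$
  select to_json('#' || $1);
$$ language sql immutable;

create cast (color as json) with function json(color) as implicit;
```

Postgres itself never applies implicit casts on domains; the cast simply lands in the catalog, where PostgREST's schema cache can find it (per "Load data representation casts into schema cache" above) and apply it while building queries.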
1 change: 1 addition & 0 deletions postgrest.cabal
@@ -49,6 +49,7 @@ library
PostgREST.SchemaCache.Identifiers
PostgREST.SchemaCache.Routine
PostgREST.SchemaCache.Relationship
PostgREST.SchemaCache.Representations
PostgREST.SchemaCache.Table
PostgREST.Error
PostgREST.Logger
Expand Down
267 changes: 195 additions & 72 deletions src/PostgREST/Plan.hs

Large diffs are not rendered by default.

15 changes: 8 additions & 7 deletions src/PostgREST/Plan/MutatePlan.hs
@@ -6,8 +6,9 @@ where
import qualified Data.ByteString.Lazy as LBS

import PostgREST.ApiRequest.Preferences (PreferResolution)
import PostgREST.ApiRequest.Types (LogicTree, OrderTerm)
import PostgREST.Plan.Types (TypedField)
import PostgREST.ApiRequest.Types (OrderTerm)
import PostgREST.Plan.Types (CoercibleField,
CoercibleLogicTree)
import PostgREST.RangeQuery (NonnegRange)
import PostgREST.SchemaCache.Identifiers (FieldName,
QualifiedIdentifier)
@@ -18,27 +19,27 @@ import Protolude
data MutatePlan
= Insert
{ in_ :: QualifiedIdentifier
, insCols :: [TypedField]
, insCols :: [CoercibleField]
, insBody :: Maybe LBS.ByteString
, onConflict :: Maybe (PreferResolution, [FieldName])
, where_ :: [LogicTree]
, where_ :: [CoercibleLogicTree]
, returning :: [FieldName]
, insPkCols :: [FieldName]
, applyDefs :: Bool
}
| Update
{ in_ :: QualifiedIdentifier
, updCols :: [TypedField]
, updCols :: [CoercibleField]
, updBody :: Maybe LBS.ByteString
, where_ :: [LogicTree]
, where_ :: [CoercibleLogicTree]
, mutRange :: NonnegRange
, mutOrder :: [OrderTerm]
, returning :: [FieldName]
, applyDefs :: Bool
}
| Delete
{ in_ :: QualifiedIdentifier
, where_ :: [LogicTree]
, where_ :: [CoercibleLogicTree]
, mutRange :: NonnegRange
, mutOrder :: [OrderTerm]
, returning :: [FieldName]
12 changes: 7 additions & 5 deletions src/PostgREST/Plan/ReadPlan.hs
@@ -6,9 +6,11 @@ module PostgREST.Plan.ReadPlan

import Data.Tree (Tree (..))

import PostgREST.ApiRequest.Types (Alias, Cast, Depth, Field,
Hint, JoinType, LogicTree,
NodeName, OrderTerm)
import PostgREST.ApiRequest.Types (Alias, Cast, Depth, Hint,
JoinType, NodeName,
OrderTerm)
import PostgREST.Plan.Types (CoercibleField (..),
CoercibleLogicTree)
import PostgREST.RangeQuery (NonnegRange)
import PostgREST.SchemaCache.Identifiers (FieldName,
QualifiedIdentifier)
@@ -26,10 +28,10 @@ data JoinCondition =
deriving (Eq)

data ReadPlan = ReadPlan
{ select :: [(Field, Maybe Cast, Maybe Alias)]
{ select :: [(CoercibleField, Maybe Cast, Maybe Alias)]
, from :: QualifiedIdentifier
, fromAlias :: Maybe Alias
, where_ :: [LogicTree]
, where_ :: [CoercibleLogicTree]
, order :: [OrderTerm]
, range_ :: NonnegRange
, relName :: NodeName
58 changes: 42 additions & 16 deletions src/PostgREST/Plan/Types.hs
@@ -1,24 +1,50 @@
module PostgREST.Plan.Types
( TypedField(..)
, resolveTableField
( CoercibleField(..)
, unknownField
, CoercibleLogicTree(..)
, CoercibleFilter(..)
, TransformerProc
) where

import qualified Data.HashMap.Strict.InsOrd as HMI
import PostgREST.ApiRequest.Types (JsonPath, LogicOperator, OpExpr)

import PostgREST.SchemaCache.Identifiers (FieldName)
import PostgREST.SchemaCache.Table (Column (..), Table (..))

import Protolude

-- | A TypedField is a field with sufficient information to be read from JSON with `json_to_recordset`.
data TypedField = TypedField
{ tfName :: FieldName
, tfIRType :: Text -- ^ The initial type of the field, before any casting.
, tfDefault :: Maybe Text
} deriving (Eq)

resolveTableField :: Table -> FieldName -> Maybe TypedField
resolveTableField table fieldName =
case HMI.lookup fieldName (tableColumns table) of
Just column -> Just $ TypedField (colName column) (colNominalType column) (colDefault column)
Nothing -> Nothing
type TransformerProc = Text

-- | A CoercibleField pairs the name of a query element with any type coercion information we need for some specific use case.
-- |
-- | As suggested by the name, it's often a reference to a field in a table but really it can be any nameable element (function parameter, calculation with an alias, etc) with a knowable type.
-- |
-- | In the simplest case, it allows us to parse JSON payloads with `json_to_recordset`, for which we need to know both the name and the type of each thing we'd like to extract. At a higher level, CoercibleField generalises to reflect that any value we work with in a query may need type specific handling.
-- |
-- | CoercibleField is the foundation for the Data Representations feature. This feature allows user-definable mappings between database types so that the same data can be presented or interpreted in various ways as needed. Sometimes the way Postgres coerces data implicitly isn't right for the job. Different mappings might be appropriate for different situations: parsing a filter from a query string requires one function (text -> field type) while parsing a payload from JSON takes another (json -> field type). And the reverse, outputting a field as JSON, requires yet a third (field type -> json). CoercibleField is that "job specific" reference to an element paired with the type we desire for that particular purpose and the function we'll use to get there, if any.
-- |
-- | In the planning phase, we "resolve" generic named elements into these specialised CoercibleFields. Again this is context specific: two different CoercibleFields both representing the exact same table column in the database, even in the same query, might have two different target types and mapping functions. For example, one might represent a column in a filter, and another the very same column in an output role to be sent in the response body.
-- |
-- | The type value is allowed to be the empty string. The analog here is soft type checking in programming languages: sometimes we don't need a variable to have a specified type and things will work anyhow. So the empty type variant is valid when we don't know and *don't need to know* about the specific type in some context. Note that this variation should not be used if it guarantees failure: in that case you should instead raise an error at the planning stage and bail out. For example, we can't parse JSON with `json_to_recordset` without knowing the types of each recipient field, and so error out. Using the empty string for the type would be incorrect and futile. On the other hand we use the empty type for RPC calls since type resolution isn't implemented for RPC, but it's fine because the query still works with Postgres' implicit coercion. In the future, hopefully we will support data representations across the board and then the empty type may be permanently retired.
data CoercibleField = CoercibleField
{ cfName :: FieldName
, cfJsonPath :: JsonPath
, cfIRType :: Text -- ^ The native Postgres type of the field, the intermediate (IR) type before mapping.
, cfTransform :: Maybe TransformerProc -- ^ The optional mapping from irType -> targetType.
, cfDefault :: Maybe Text
} deriving Eq

unknownField :: FieldName -> JsonPath -> CoercibleField
unknownField name path = CoercibleField name path "" Nothing Nothing

-- | Like an API request LogicTree, but with coercible field information.
data CoercibleLogicTree
= CoercibleExpr Bool LogicOperator [CoercibleLogicTree]
| CoercibleStmnt CoercibleFilter
deriving (Eq)

data CoercibleFilter = CoercibleFilter
{ field :: CoercibleField
, opExpr :: OpExpr
}
| CoercibleFilterNullEmbed Bool FieldName
deriving (Eq)
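
As a companion to the CoercibleField comment above, a hedged sketch of the two parse directions it names, reusing the illustrative `color` domain from the CHANGELOG sketch earlier (the third direction, field type -> json, is shown there):

```sql
-- Filters: a query-string literal arrives as text, so parsing is text -> color.
create or replace function color(text) returns color as $$
  select upper(replace($1, '#', ''))::color;
$$ language sql immutable;

create cast (text as color) with function color(text) as implicit;

-- Payloads: POST/PUT/PATCH bodies arrive as JSON, so parsing is json -> color.
create or replace function color(json) returns color as $$
  select upper(replace($1 #>> '{}', '#', ''))::color;
$$ language sql immutable;

create cast (json as color) with function color(json) as implicit;
```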
12 changes: 6 additions & 6 deletions src/PostgREST/Query/QueryBuilder.hs
@@ -55,7 +55,7 @@ readPlanToQuery (Node ReadPlan{select,from=mainQi,fromAlias,where_=logicForest,o
where
fromFrag = fromF relToParent mainQi fromAlias
qi = getQualifiedIdentifier relToParent mainQi fromAlias
defSelect = [(("*", []), Nothing, Nothing)] -- gets all the columns in case of an empty select, ignoring/obtaining these columns is done at the aggregation stage
defSelect = [(unknownField "*" [], Nothing, Nothing)] -- gets all the columns in case of an empty select, ignoring/obtaining these columns is done at the aggregation stage
(selects, joins) = foldr getSelectsJoins ([],[]) forest

getSelectsJoins :: ReadPlanTree -> ([SQL.Snippet], [SQL.Snippet]) -> ([SQL.Snippet], [SQL.Snippet])
@@ -98,11 +98,11 @@ mutatePlanToQuery (Insert mainQi iCols body onConflct putConditions returnings _
MergeDuplicates ->
if null iCols
then "DO NOTHING"
else "DO UPDATE SET " <> intercalateSnippet ", " ((pgFmtIdent . tfName) <> const " = EXCLUDED." <> (pgFmtIdent . tfName) <$> iCols)
else "DO UPDATE SET " <> intercalateSnippet ", " ((pgFmtIdent . cfName) <> const " = EXCLUDED." <> (pgFmtIdent . cfName) <$> iCols)
) onConflct <> " " <>
returningF mainQi returnings
where
cols = intercalateSnippet ", " $ pgFmtIdent . tfName <$> iCols
cols = intercalateSnippet ", " $ pgFmtIdent . cfName <$> iCols

-- An update without a limit is always filtered with a WHERE
mutatePlanToQuery (Update mainQi uCols body logicForest range ordts returnings applyDefaults)
@@ -136,8 +136,8 @@ mutatePlanToQuery (Update mainQi uCols body logicForest range ordts returnings a
whereLogic = if null logicForest then mempty else " WHERE " <> intercalateSnippet " AND " (pgFmtLogicTree mainQi <$> logicForest)
mainTbl = fromQi mainQi
emptyBodyReturnedColumns = if null returnings then "NULL" else intercalateSnippet ", " (pgFmtColumn (QualifiedIdentifier mempty $ qiName mainQi) <$> returnings)
nonRangeCols = intercalateSnippet ", " (pgFmtIdent . tfName <> const " = " <> pgFmtColumn (QualifiedIdentifier mempty "pgrst_body") . tfName <$> uCols)
rangeCols = intercalateSnippet ", " ((\col -> pgFmtIdent (tfName col) <> " = (SELECT " <> pgFmtIdent (tfName col) <> " FROM pgrst_update_body) ") <$> uCols)
nonRangeCols = intercalateSnippet ", " (pgFmtIdent . cfName <> const " = " <> pgFmtColumn (QualifiedIdentifier mempty "pgrst_body") . cfName <$> uCols)
rangeCols = intercalateSnippet ", " ((\col -> pgFmtIdent (cfName col) <> " = (SELECT " <> pgFmtIdent (cfName col) <> " FROM pgrst_update_body) ") <$> uCols)
(whereRangeIdF, rangeIdF) = mutRangeF mainQi (fst . otTerm <$> ordts)

mutatePlanToQuery (Delete mainQi logicForest range ordts returnings)
@@ -171,7 +171,7 @@ callPlanToQuery (FunctionCall qi params args returnsScalar returnsSetOfScalar re
fromCall = case params of
OnePosParam prm -> "FROM " <> callIt (singleParameter args $ encodeUtf8 $ ppType prm)
KeyParams [] -> "FROM " <> callIt mempty
KeyParams prms -> fromJsonBodyF args ((\p -> TypedField (ppName p) (ppType p) Nothing) <$> prms) False True False <> ", " <>
KeyParams prms -> fromJsonBodyF args ((\p -> CoercibleField (ppName p) mempty (ppType p) Nothing Nothing) <$> prms) False True False <> ", " <>
"LATERAL " <> callIt (fmtParams prms)

callIt :: SQL.Snippet -> SQL.Snippet