vignette

gluc · Aug 4, 2016 · ae92414 · ae92414
1 parent 1f22bf0
commit ae92414
Show file tree

Hide file tree

Showing 8 changed files with 160 additions and 31 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -19,6 +19,7 @@ export(SetNames)
 export(Tap)
 export(Today)
 import(magrittr)
+import(methods)
 import(utils)
 importFrom(DiagrammeR,create_edges)
 importFrom(DiagrammeR,create_graph)

diff --git a/R/functionParser.R b/R/functionParser.R
@@ -30,6 +30,7 @@ ParseExpression <- function(expressionString) {
 }
 
 #' @import utils
+#' @import methods
 .ParseExpression <- function(expressionNode) {
   #is this named?
   idx <- ParseFindArgumentName(expressionNode)
@@ -50,7 +51,7 @@ ParseExpression <- function(expressionString) {
     expressionNode$.variableName <- ParseVariableName(expressionNode, idx)
   } else if (expressionNode$.type == "R") {
     expressionNode$expression <- paste0(expressionNode$.expressionV, collapse = "") %>% trimws %>% type.convert(as.is = TRUE)
-    if (is.character(expressionNode$expression) && ! grepl("['\"].*['\"]$", expressionNode$expression, perl = TRUE)) expressionNode$expression <- paste0("'", expressionNode$expression, "'")
+    #if (is.character(expressionNode$expression) && ! grepl("['\"].*['\"]$", expressionNode$expression, perl = TRUE)) expressionNode$expression <- paste0("'", expressionNode$expression, "'")
   } else stop (paste0("Unkown expressionNode type ", expressionNode$.type))
 
 }

diff --git a/Rprof.out b/Rprof.out
diff --git a/inst/extdata/context1.yaml b/inst/extdata/context1.yaml
@@ -56,7 +56,7 @@ modules:
 Closing Prices:
   type: structure
   variables:
-    series: Close
+    series: "'Close'"
     maxNaRatioDefault: 0.25
   Indices:
     type: structure
@@ -76,8 +76,8 @@ Closing Prices:
       variables:
         #variableName: value
         maxNaRatio: $maxNaRatioDefault
-        yahooSymbol: '^GSPC'
-        quandlCode: 'YAHOO/INDEX_GSPC'
+        yahooSymbol: "'^GSPC'"
+        quandlCode: "'YAHOO/INDEX_GSPC'"
       Pipe:
         type: pipe
         DateRange:
@@ -96,22 +96,22 @@ Closing Prices:
           Apple stock price
       variables:
         maxNaRatio: '$maxNaRatioDefault'
-        yahooSymbol: "AAPL"
-        quandlCode: "YAHOO/AAPL"
+        yahooSymbol: "'AAPL'"
+        quandlCode: "'YAHOO/AAPL'"
       pipe: *QYPipe
     MSFT:
       type: tap
       attributes:
-        longname: "Microsoft"
+        longname: "'Microsoft'"
       variables:
         maxNaRatio: 0.0
-        yahooSymbol: "MSFT"
-        quandlCode: "YAHOO/MSFT"
+        yahooSymbol: "'MSFT'"
+        quandlCode: "'YAHOO/MSFT'"
       pipe: *QYPipe
   Fabricated:
     type: structure
     variables:
-      startDateDefault: 1990-01-01
+      startDateDefault: "'1990-01-01'"
     Ones:
       type: tap
       parameters:

diff --git a/inst/extdata/context2.yaml b/inst/extdata/context2.yaml
@@ -0,0 +1,21 @@
+#simple context
+stocks:
+  type: structure
+  Apple:
+    type: tap
+    download:
+      type: processor
+      function: Quandl::Quandl(code = 'YAHOO/AAPL', type = 'xts')
+  Tesla:
+    type: tap
+    download:
+      type: processor
+      function: Quandl::Quandl(code = 'YAHOO/TSLA', type = 'xts')
+indices:
+  type: structure
+  S&P500:
+    type: tap
+    download:
+      type: processor
+      function: quantmod::getSymbols(Symbols = '^GSPC', auto.assign = FALSE)
+
diff --git a/tests/testthat/test-execution.R b/tests/testthat/test-execution.R
@@ -8,7 +8,7 @@ test_that("GetVariableVal", {
 Tap:
   type: tap
   variables:
-    x: a
+    x: \"'a'\"
     'y': 3
     z: .sum(3, 5)
   Processor:

diff --git a/tests/testthat/test-parsing.R b/tests/testthat/test-parsing.R
@@ -105,4 +105,50 @@ RandomCache:
 
 })
 
+test_that("variable", {
+  contextString <- "
+cars:
+  type: tap
+  load:
+    type: processor
+    function: identity(mtcars)
+"
+
+  context <- Load(textConnection(contextString))
+  res <- context$cars$tap()
+  expect_equal(res, mtcars)
+
+})
+
+
+test_that("string", {
+  contextString <- "
+cars:
+  type: tap
+  load:
+    type: processor
+    function: identity('mtcars')
+"
+
+  context <- Load(textConnection(contextString))
+  res <- context$cars$tap()
+  expect_equal(res, "mtcars")
+
+})
+
+
+test_that("complex", {
+  contextString <- "
+cars:
+  type: tap
+  load:
+    type: processor
+    function: identity(mtcars[1])
+"
+
+  context <- Load(textConnection(contextString))
+  res <- context$cars$tap()
+  expect_equal(res, mtcars[1])
+
+})
 
diff --git a/vignettes/datap_specification.Rmd b/vignettes/datap_specification.Rmd
@@ -15,6 +15,7 @@ vignette: |
 
 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE)
+options("getSymbols.warning4.0"=FALSE)
 ```
 
 > “If I could do it all again, I'd be a plumber.”
@@ -32,25 +33,74 @@ knitr::opts_chunk$set(echo = TRUE)
 
 ## Scope
 
-datap is a lightweight DSL (Domain Specific Language) to define configurable, modular, and re-usable data processes in the R programming language. datap contexts can be used to acquire, pre-process, quality-assure, and merge data.
+datap is a lightweight DSL (Domain Specific Language) to define configurable, modular, and re-usable data processes for use in the R programming language. datap contexts can be used to acquire, pre-process, quality-assure, and merge data.
 
 In practice, each datap setup will consist of the following elements:
 
-1. One or more **datap contexts**. Each context is defined in in a yaml file and contains a series of **taps**. Each tap represents 
-a specific dataset, as well as their source and pre-processing steps.
+1. One or more **datap contexts**. Each context is defined in a YAML file and contains a series of hierarchically organised **taps**. Each tap represents a specific dataset, together with its source and pre-processing steps.
 2. One or more **R packages**, or your own R scripts containing functions. The R functions perform the actual units of work of the pre-processing steps defined in (1), e.g. downloading data from the internet, data cleaning, merging, etc. The packages are typically datap-agnostic.
-3. The **datap interpreter**, i.e. the R datap package. The interpreter parses the datap context, and maps pre-processing steps defined in (1) to actual library functions available in (2).
+3. The **datap interpreter**, i.e. the R datap package. The interpreter parses the datap context, and maps pre-processing steps defined in (1) to actual library functions available in (2), so as to provide - for each tap - an R function that can be called by the user of the library.
 
 This document is about the first part only: the datap context file definition.
 
+## Short Example
+
+### datap context
+
+Consider the following, very simple context that is provided with the datap package as *context2.yaml*:
+
+
+```{r, echo = FALSE, comment = ""}
+filePath <- system.file("extdata", "context2.yaml", package="datap")
+yamlString <- paste0(readLines(filePath), collapse = "\n")
+cat(yamlString)
+
+```
+
+It defines three taps (Apple, Tesla, and S&P500), and organises stocks and indices neatly in a hierarchical structure.
+
+### datap interpreter
+
+If you have the datap package installed, you can load the context into memory using the `datap::Load` function:
+
+```{r}
+library(datap)
+filePath <- system.file("extdata", "context2.yaml", package="datap")
+context <- Load(filePath)
+```
+
+The context looks like this:
+
+```{r}
+context
+```
+
+And you can directly navigate to a tap and tap into the data:
+
+```{r}
+teslaBars <- context$stocks$Tesla$tap()
+head(teslaBars)
+```
+
+For the user of the context, it is completely transparent where the data is coming from and how it is pre-processed. For example, the S&P500 index is downloaded from Yahoo Finance and not from Quandl. Yet, the user accesses the dataset in exactly the same way:
+
+```{r}
+spx <- context$indices$`S&P500`$tap()
+head(spx$GSPC.Open)
+```
+
+However, in a real-world scenario, additional pre-processing steps are necessary to make sure that the structure of the data is indeed the same for datasets from different sources.
+
+In general, however, your data can be anything, and it can come from any source (the internet, a file, from memory, by calling an R function, or generated on the fly by your context, etc.).
+
 ## Syntax Description Conventions
 
 In this document, the datap syntax is described using the following conventions:
 
 * `>`: a reference to a specific datap element
 * `[]`: optional elements
 * `@`: replace the following string with an appropriate name
-* `n*`: repeat the element n times
+* `n*`: repeat the element n times, where n can be any positive number
 * `|`: or
 
 # datap Syntax
@@ -62,25 +112,26 @@ A datap [`>context`](#context) is defined in a single YAML document. A YAML docu
 A [`>context`](#context) spans a tree whose nodes are each one of the following types of *joints*:
 
 * [`>tap`](#tap): entry point to data, can have parameters
-* [`>structure`](#structure): organise taps into hierarchies
 * flow control:
 	* [`>pipe`](#pipe): combine joints serially
 	* [`>junction`](#junction): merge multiple sub-joints into one
 * [`>processor`](#processor): unit of work (data acquisition and pre-processing)
 * error handling:
 	* [`>warning`](#warning)
 	* [`>error`](#error)
+* [`>module`](#module): define re-usable pipes
+* [`>structure`](#structure): organise taps into hierarchies
 
 Each joint consists of the following:
 
 1. a mandatory *type* (tap, structure, pipe, junction, processor, warning, error)
-2. depending on the type, named elements, namely;
+2. named elements, namely (depending on the [`>joint`](#Joints) type):
     + [`>attributes`](#attributes)
     + [`>variables`](#variables)
     + [`>parameters`](#parameters)
     + [`>function`](#function)
     + [`>condition`](#condition)
-3. other, nested *joints*
+3. other, nested [`>joints`](#Joints)
 
 The flow of data is from leaves towards the root, and ends at a [`>tap`](#tap). Thus, each sub-tree below a [`>tap`](#tap) defines the processing steps of a [`>tap`](#tap). In line with data flow, we use the term *upstream* to denote joints that are processed before a given joint. We use *downstream* to denote joints that are processed after a given joint.
 
@@ -96,7 +147,7 @@ A [`>variables`](#variables) section is an *associative list*, called "variables
     n* $variableName: $value
 ```
 
-The names of *[special references](#special-references)* cannot be used as variable name (namely: "inflow", "joint", "context").
+The names of *[special references](#special-reference)* cannot be used as variable name (namely: "inflow", "joint", "context").
 
 The *scope* of a variable is the sub-tree spanned by the joint in which the variable is defined. A variable value can be overwritten by an upstream joint.
 
@@ -117,10 +168,10 @@ Closing Prices:
 
 A [`>reference`](#reference) has a *`$`* prefix, and refers to a downstream [`>variable`](#variables), a [`>parameter`](#parameters), or a [`>special reference`](#special-reference).
 
-You can use a [`>reference`](#reference) in a [`>parameter`](#parameters), a [`>function`](#function), or in another [`>variable`](#variables).
+You can use a [`>reference`](#reference) in a [`>parameter`](#parameters) or in a [`>variable`](#variables).
 
 ```
->parameters|>function|>variables:
+>parameters|>variables:
   $name: $@variableReferenceName
 ```
 
@@ -135,6 +186,22 @@ AAPL:
     maxNaRatio: $maxNaRatioDefault
 ```
 
+You can also use a [`>reference`](#reference) in a [`>function`](#function).
+
+For example:
+
+```{YAML}
+AAPL:
+  type: tap
+  variables:
+    ticker: "'YAHOO/AAPL'"
+  download:
+    type: processor
+    function: Quandl::Quandl(code = $ticker, type = 'xts')
+```
+
+>function|
+
 ### special reference
 
 The following variable references can be used without defining the variables downstream:
@@ -230,9 +297,8 @@ and
 
 The function syntax is similar to R, with a few differences:
 
-1. variables can be used, but they must be referenced using '$'. e.g. `sum(2, $param1)`
-2. string literals are interpreted as is, without the necessity of quotes, e.g. `paste(a, b)
-3. eclipsis (three dots / ...) are not supported
+1. datap variables can be used, but they must be referenced using '$'. e.g. `sum(2, $param1)`
+2. ellipsis (three dots / ...) is not supported
 
 However, you may nest functions, e.g. `sum(2, sum(3, 5))`, or `sum(seq(1, 10))`.
 You may also use named parameters, e.g. `sum(2, 3, na.rm = TRUE)`.