Merge pull request #3 from aabbasi-hbo/main
merge main to feature
aabbasi-hbo authored Nov 4, 2022
2 parents b71b415 + 5d724f1 commit 0668b51
Showing 150 changed files with 6,564 additions and 1,831 deletions.
37 changes: 37 additions & 0 deletions .github/workflows/devskim-security-linter.yml
@@ -0,0 +1,37 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party (Microsoft) and are governed by
# separate terms of service, privacy policy, and support
# documentation.
# For more details about Devskim, visit https://github.com/marketplace/actions/devskim

name: DevSkim

on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
schedule:
- cron: '00 4 * * *'

jobs:
lint:
name: DevSkim
runs-on: ubuntu-20.04
permissions:
actions: read
contents: read
security-events: write
steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Run DevSkim scanner
uses: microsoft/DevSkim-Action@v1
with:
ignore-globs: "**/.git/**,**/test/**"

- name: Upload DevSkim scan results to GitHub Security tab
uses: github/codeql-action/upload-sarif@v2
with:
sarif_file: devskim-results.sarif
31 changes: 12 additions & 19 deletions .github/workflows/docker-publish.yml
@@ -52,27 +52,20 @@ jobs:


steps:
- name: Deploy to Feathr SQL Registry Azure Web App
id: deploy-to-sql-webapp
uses: azure/webapps-deploy@v2
with:
app-name: 'feathr-sql-registry'
publish-profile: ${{ secrets.AZURE_WEBAPP_PUBLISH_PROFILE_FEATHR_SQL_REGISTRY }}
images: 'feathrfeaturestore/feathr-registry:nightly'

- name: Deploy to Feathr Purview Registry Azure Web App
id: deploy-to-purview-webapp
uses: azure/webapps-deploy@v2
with:
app-name: 'feathr-purview-registry'
publish-profile: ${{ secrets.AZURE_WEBAPP_PUBLISH_PROFILE_FEATHR_PURVIEW_REGISTRY }}
images: 'feathrfeaturestore/feathr-registry:nightly'
uses: distributhor/[email protected]
env:
webhook_url: ${{ secrets.AZURE_WEBAPP_FEATHR_PURVIEW_REGISTRY_WEBHOOK }}

- name: Deploy to Feathr RBAC Registry Azure Web App
id: deploy-to-rbac-webapp
uses: azure/webapps-deploy@v2
with:
app-name: 'feathr-rbac-registry'
publish-profile: ${{ secrets.AZURE_WEBAPP_PUBLISH_PROFILE_FEATHR_RBAC_REGISTRY }}
images: 'feathrfeaturestore/feathr-registry:nightly'

uses: distributhor/[email protected]
env:
webhook_url: ${{ secrets.AZURE_WEBAPP_FEATHR_RBAC_REGISTRY_WEBHOOK }}

- name: Deploy to Feathr SQL Registry Azure Web App
id: deploy-to-sql-webapp
uses: distributhor/[email protected]
env:
webhook_url: ${{ secrets.AZURE_WEBAPP_FEATHR_SQL_REGISTRY_WEBHOOK }}
5 changes: 4 additions & 1 deletion .github/workflows/document-scan.yml
@@ -1,6 +1,9 @@
name: Feathr Documents' Broken Link Check

on: [push]
on:
push:
branches: [main]

jobs:
check-links:
runs-on: ubuntu-latest
49 changes: 36 additions & 13 deletions .github/workflows/pull_request_push_test.yml
@@ -22,11 +22,15 @@ on:
- "docs/**"
- "ui/**"
- "**/README.md"

schedule:
# Runs daily at 1 PM UTC (9 PM CST) and sends a notification to TEAMS_WEBHOOK
- cron: '00 13 * * *'

jobs:
sbt_test:
runs-on: ubuntu-latest
if: github.event_name == 'push' || github.event_name == 'pull_request' || (github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'safe to test'))
if: github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'pull_request' || (github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'safe to test'))
steps:
- uses: actions/checkout@v2
with:
@@ -41,7 +45,7 @@ jobs:

python_lint:
runs-on: ubuntu-latest
if: github.event_name == 'push' || github.event_name == 'pull_request' || (github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'safe to test'))
if: github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'pull_request' || (github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'safe to test'))
steps:
- name: Set up Python 3.8
uses: actions/setup-python@v2
@@ -61,7 +65,7 @@ jobs:
databricks_test:
runs-on: ubuntu-latest
if: github.event_name == 'push' || github.event_name == 'pull_request' || (github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'safe to test'))
if: github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'pull_request' || (github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'safe to test'))
steps:
- uses: actions/checkout@v2
with:
@@ -87,8 +91,7 @@ jobs:
- name: Install Feathr Package
run: |
python -m pip install --upgrade pip
python -m pip install pytest pytest-xdist databricks-cli
python -m pip install -e ./feathr_project/
python -m pip install -e ./feathr_project/[all]
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Set env variable and upload jars
env:
@@ -132,7 +135,7 @@ jobs:
azure_synapse_test:
# it may be a bit of duplication to set up both the azure_synapse test and the databricks test, but for now we keep both to speed up the tests
runs-on: ubuntu-latest
if: github.event_name == 'push' || github.event_name == 'pull_request' || (github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'safe to test'))
if: github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'pull_request' || (github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'safe to test'))
steps:
- uses: actions/checkout@v2
with:
@@ -166,8 +169,7 @@ jobs:
- name: Install Feathr Package
run: |
python -m pip install --upgrade pip
python -m pip install pytest pytest-xdist
python -m pip install -e ./feathr_project/
python -m pip install -e ./feathr_project/[all]
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Run Feathr with Azure Synapse
env:
@@ -203,7 +205,7 @@ jobs:
local_spark_test:
runs-on: ubuntu-latest
if: github.event_name == 'push' || github.event_name == 'pull_request' || (github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'safe to test'))
if: github.event_name == 'schedule' || github.event_name == 'push' || github.event_name == 'pull_request' || (github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'safe to test'))
steps:
- uses: actions/checkout@v2
with:
@@ -229,9 +231,8 @@ jobs:
- name: Install Feathr Package
run: |
python -m pip install --upgrade pip
python -m pip install pytest pytest-xdist
python -m pip install -e ./feathr_project/
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
python -m pip install -e ./feathr_project/[all]
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Run Feathr with Local Spark
env:
PROJECT_CONFIG__PROJECT_NAME: "feathr_github_ci_local"
@@ -258,4 +259,26 @@ jobs:
SQL1_PASSWORD: ${{secrets.SQL1_PASSWORD}}
run: |
# skip cloud related tests
pytest feathr_project/test/test_local_spark_e2e.py
pytest feathr_project/test/test_local_spark_e2e.py
failure_notification:
# If any job fails, a warning message will be sent
needs: [sbt_test, python_lint, databricks_test, azure_synapse_test, local_spark_test]
runs-on: ubuntu-latest
if: failure() && github.event_name == 'schedule'
steps:
- name: Warning
run: |
curl -H 'Content-Type: application/json' -d '{"text": "[WARNING] Daily CI has failure, please check: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"}' ${{ secrets.TEAMS_WEBHOOK }}
notification:
# Final daily report with all job statuses
needs: [sbt_test, python_lint, databricks_test, azure_synapse_test, local_spark_test]
runs-on: ubuntu-latest
if: always() && github.event_name == 'schedule'
steps:
- name: Get Date
run: echo "NOW=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
- name: Notification
run: |
curl -H 'Content-Type: application/json' -d '{"text": "${{env.NOW}} Daily Report: 1. SBT Test ${{needs.sbt_test.result}}, 2. Python Lint Test ${{needs.python_lint.result}}, 3. Databricks Test ${{needs.databricks_test.result}}, 4. Synapse Test ${{needs.azure_synapse_test.result}} , 5. LOCAL SPARK TEST ${{needs.local_spark_test.result}}. Link: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"}' ${{ secrets.TEAMS_WEBHOOK }}
4 changes: 4 additions & 0 deletions CONTRIBUTING.md
@@ -40,7 +40,11 @@ Our open source community strives to:
- **Be respectful**: We are a world-wide community of professionals, and we conduct ourselves professionally. Disagreement is no excuse for poor behavior and poor manners.
- **Understand disagreements**: Disagreements, both social and technical, are useful learning opportunities. Seek to understand the other viewpoints and resolve differences constructively.
- **Remember that we’re different**. The strength of our community comes from its diversity, people from a wide range of backgrounds. Different people have different perspectives on issues. Being unable to understand why someone holds a viewpoint doesn’t mean that they’re wrong. Focus on helping to resolve issues and learning from mistakes.
-

## Attribution & Acknowledgements

This code of conduct is based on the Open Code of Conduct from the [TODOGroup](https://todogroup.org/blog/open-code-of-conduct/).

# Committers
Benjamin Le, David Stein, Edwin Cheung, Hangfei Lin, Jimmy Guo, Jinghui Mo, Li Lu, Rama Ramani, Ray Zhang, Xiaoyong Zhu
15 changes: 15 additions & 0 deletions azure-pipelines.yml
@@ -0,0 +1,15 @@
# Component Governance Pipeline
# Runs the Feathr code through Component Governance Detection tool and publishes the result under compliance tab.

trigger:
- main

pool:
vmImage: ubuntu-latest

steps:
- task: ComponentGovernanceComponentDetection@0
inputs:
scanType: 'Register'
verbosity: 'Verbose'
alertWarningLevel: 'High'
2 changes: 1 addition & 1 deletion build.sbt
@@ -2,7 +2,7 @@ import sbt.Keys.publishLocalConfiguration

ThisBuild / resolvers += Resolver.mavenLocal
ThisBuild / scalaVersion := "2.12.15"
ThisBuild / version := "0.8.0"
ThisBuild / version := "0.9.0-rc2"
ThisBuild / organization := "com.linkedin.feathr"
ThisBuild / organizationName := "linkedin"
val sparkVersion = "3.1.3"
2 changes: 1 addition & 1 deletion docs/README.md
@@ -159,7 +159,7 @@ Read [Point-in-time Correctness and Point-in-time Join in Feathr](https://feathr

### Running Feathr Examples

Follow the [quick start Jupyter Notebook](./samples/product_recommendation_demo.ipynb) to try it out. There is also a companion [quick start guide](https://feathr-ai.github.io/feathr/quickstart_synapse.html) containing a bit more explanation on the notebook.
Follow the [quick start Jupyter Notebook](https://github.com/feathr-ai/feathr/blob/main/docs/samples/azure_synapse/product_recommendation_demo.ipynb) to try it out. There is also a companion [quick start guide](https://feathr-ai.github.io/feathr/quickstart_synapse.html) containing a bit more explanation on the notebook.

## 🗣️ Tech Talks on Feathr

4 changes: 3 additions & 1 deletion docs/concepts/feature-registry.md
@@ -74,11 +74,13 @@ client.register_features()
all_features = client.list_registered_features(project_name=client.project_name)
```

Please avoid giving the same name to different features within a project, since it will be treated as updating an existing entry, which is not supported by Feathr and will cause errors.

### Reuse Features from Existing Registry

Feature producers can simply let feature consumers know which features exist so the consumers can reuse them. Feature consumers can retrieve a whole project into their local environment by calling the `client.get_features_from_registry` API with a project name. This encourages feature reuse across organizations: end users of a feature only need to read the feature definitions from existing projects, pick the features they need, and join those features with their own new dataset.

For example, in the [product recommendation demo notebook](./../samples/product_recommendation_demo.ipynb), some other team members have already defined a few features, such as `feature_user_gift_card_balance` and `feature_user_has_valid_credit_card`. If we want to reuse those features for anti-abuse purpose in a new dataset, what you can do is like this, i.e. just call `get_features_from_registry` to get the features, then put the features you want to query to the anti-abuse dataset you have.
For example, in the [product recommendation demo notebook](https://github.com/feathr-ai/feathr/blob/main/docs/samples/azure_synapse/product_recommendation_demo.ipynb), some other team members have already defined a few features, such as `feature_user_gift_card_balance` and `feature_user_has_valid_credit_card`. If we want to reuse those features for anti-abuse purposes on a new dataset, we can simply call `get_features_from_registry` to get the features, then join the features we want to query with the anti-abuse dataset:

```python
registered_features_dict = client.get_features_from_registry(client.project_name)
62 changes: 62 additions & 0 deletions docs/concepts/materializing-features.md
@@ -31,6 +31,18 @@ More reference on the APIs:

In the above example, we define a Redis table called `nycTaxiDemoFeature` and materialize two features called `f_location_avg_fare` and `f_location_max_fare` to Redis.

## Incremental Aggregation
Using incremental aggregation significantly speeds up `WindowAggTransformation` feature calculation. For example, the aggregation sum of a feature F within a 180-day window at day T can be expressed as F(T) = F(T-1) + DirectAgg(T-1) - DirectAgg(T-181). Once a snapshot of the first day is generated, the calculation for each following day can leverage it.

A storeName is required if incremental aggregation is enabled. There can be multiple output datasets, and each of them needs to be stored in a separate folder. The storeName is used as the name of the folder created under the base "path".

Incremental aggregation is enabled by default when using `HdfsSink`; a usage sketch follows the API reference below.

More reference on the APIs:
- [HdfsSink API doc](https://feathr.readthedocs.io/en/latest/feathr.html#feathr.HdfsSink)
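
A minimal sketch (the `store_name` keyword, the output path, and the job name below are illustrative assumptions; see the HdfsSink API doc above for the exact signature):

```python
from feathr import HdfsSink, MaterializationSettings

# Offline sink: incremental aggregation is enabled by default for HdfsSink.
# store_name (assumed parameter name) is the folder created under the base path.
offline_sink = HdfsSink(output_path="abfss://[email protected]/materialized",
                        store_name="nycTaxiDemoFeature")

settings = MaterializationSettings(
    name="nycTaxiIncrementalAggJob",
    sinks=[offline_sink],
    feature_names=["f_location_avg_fare", "f_location_max_fare"])
client.materialize_features(settings)
```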

## Feature Backfill

It is also possible to backfill the features up to a particular time, like below. If the `BackfillTime` part is not specified, it defaults to `now()` (i.e., it is equivalent to `BackfillTime(start=now, end=now, step=timedelta(days=1))`).
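
A minimal sketch of a backfill, reusing the Redis table and feature names from the earlier example (assumes a `FeathrClient` named `client`):

```python
from datetime import datetime, timedelta
from feathr import BackfillTime, MaterializationSettings, RedisSink

# Backfill daily feature values for the first 20 days of February 2022.
backfill_time = BackfillTime(start=datetime(2022, 2, 1),
                             end=datetime(2022, 2, 20),
                             step=timedelta(days=1))
settings = MaterializationSettings(
    name="nycTaxiBackfillJob",
    sinks=[RedisSink(table_name="nycTaxiDemoFeature")],
    feature_names=["f_location_avg_fare", "f_location_max_fare"],
    backfill_time=backfill_time)
client.materialize_features(settings)
```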
@@ -149,3 +161,53 @@ More reference on the APIs:

- [MaterializationSettings API](https://feathr.readthedocs.io/en/latest/feathr.html#feathr.MaterializationSettings)
- [HdfsSink API](https://feathr.readthedocs.io/en/latest/feathr.html#feathr.HdfsSource)

## Expected behavior on Feature Materialization

When end users materialize features to a sink, what is the expected behavior?

It seems like a straightforward question, but it actually is not. When end users want to materialize a feature, Feathr expects that for a certain entity key (say a `user_id`) there will be multiple features (say `user_total_gift_card_balance` and `user_purchase_in_last_week`). So two checks are performed:

1. The features must share the same entity key (say a `user_id`). You cannot materialize features for two entity keys in the same materialization job (although you can do it in different jobs), for example materializing `user_total_purchase` and `product_sold_in_last_week` in the same Feathr materialization job.
2. The features must all be "aggregated" features, i.e., features whose transform is of type `WindowAggTransformation`, such as `product_sold_in_last_week` or `user_latest_total_gift_card_balance`.

The first constraint is straightforward to explain: when Feathr materializes features, they are used to describe certain aspects of a given entity, such as a user. Describing `product_sold_in_last_week` would not make sense for users (see the sketch below).
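
To illustrate the first check, here is a sketch of a valid job versus an invalid one (the feature names are the hypothetical ones used above, and it assumes a `FeathrClient` named `client`):

```python
from feathr import MaterializationSettings, RedisSink

# Valid: both features are keyed on the same entity key (user_id).
settings = MaterializationSettings(
    name="userFeatureJob",
    sinks=[RedisSink(table_name="userFeatures")],
    feature_names=["user_total_gift_card_balance", "user_purchase_in_last_week"])
client.materialize_features(settings)

# Invalid: user_total_purchase is keyed on user_id while
# product_sold_in_last_week is keyed on product_id, so these two
# features must be materialized in two separate jobs.
```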

The second constraint is a bit more interesting. For example, suppose you have defined `user_total_gift_card_balance` and it has a different value for the same user at different times, say 40, 30, 20, and 20 for the last 4 days. Original data:

| UserId | user_total_gift_card_balance | Date |
| ------ | ---------------------------- | ---------- |
| 1 | 40 | 2022/01/01 |
| 1 | 30 | 2022/01/02 |
| 1 | 20 | 2022/01/03 |
| 1 | 20 | 2022/01/04 |
| 2 | 40 | 2022/01/01 |
| 2 | 30 | 2022/01/02 |
| 2 | 20 | 2022/01/03 |
| 2 | 20 | 2022/01/04 |
| 3 | 40 | 2022/01/01 |
| 3 | 30 | 2022/01/02 |
| 3 | 20 | 2022/01/03 |
| 3 | 20 | 2022/01/04 |

However, the materialized features have no dates associated with them, i.e., the materialized result should be something like this:

| UserId | user_total_gift_card_balance |
| ------ | ---------------------------- |
| 1 | ? |
| 2 | ? |
| 3 | ? |

When you ask Feathr to "materialize" `user_total_gift_card_balance` for you, only one value can be materialized, since the materialized feature does not have a date associated with it. So the problem is: for a given `user_id`, only one `user_total_gift_card_balance` value can be its feature. Which value do you choose out of the four? A random value? The latest value?

It might be natural to think "we should materialize the latest value", and that behavior is, by definition, an "aggregation" operation, since we have 4 values for a given `user_id` but we materialize and use only one of them. In that case, Feathr asks you to state explicitly that you want to materialize the latest value (i.e., by using a [Point-in-time Join](./point-in-time-join.md)):

```python
feature = Feature(name="user_total_gift_card_balance",
key=UserId,
feature_type=FLOAT,
transform=WindowAggTransformation(agg_expr="gift_card_balance",
agg_func="LATEST",
window="7d"))
```
@@ -76,8 +76,4 @@ docker push feathrfeaturestore/feathr-registry

## Published Feathr Registry Image

The published feathr feature registry is located in [DockerHub here](https://hub.docker.com/r/feathrfeaturestore/feathr-registry).

## Include detailed traceback info in registry API HTTP error responses

Setting the environment variable REGISTRY_DEBUGGING to any non-empty string enables detailed traceback info in registry API HTTP responses. This variable is helpful for Python client debugging and should only be used for debugging purposes.
The published feathr feature registry is located in [DockerHub here](https://hub.docker.com/r/feathrfeaturestore/feathr-registry).