- store the dataset (.csv) in a Google Cloud Storage bucket.
- create a Dataflow batch job that reads and processes the csv file.
- in the Dataflow job, apply a "Group By" transform to get the count of listings by the "neighbourhood" field.
- store both the original csv data and the transformed data into their own separate BigQuery tables.
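
A minimal sketch of what that pipeline could look like in Apache Beam. The CSV column layout, the `PROJECT` placeholder, and the BigQuery dataset/table names (`listings.raw`, `listings.count_by_neighbourhood`) are assumptions for illustration, not the exact contents of `dataflow_pipeline.py`:

```python
import csv

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

PROJECT = 'your-project-id'  # same value as $PROJECT below (assumed)


def parse_line(line):
    """Parse one CSV line into a dict (assumed column order)."""
    fields = next(csv.reader([line]))
    return {
        'id': fields[0],
        'name': fields[1],
        'neighbourhood': fields[2],
    }


def run(argv=None):
    options = PipelineOptions(argv)
    with beam.Pipeline(options=options) as p:
        rows = (
            p
            | 'ReadCSV' >> beam.io.ReadFromText(
                'gs://%s/datafilename.csv' % PROJECT, skip_header_lines=1)
            | 'ParseCSV' >> beam.Map(parse_line)
        )

        # Original rows go to their own BigQuery table.
        rows | 'WriteRaw' >> beam.io.WriteToBigQuery(
            '%s:listings.raw' % PROJECT,
            schema='id:STRING,name:STRING,neighbourhood:STRING',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

        # "Group By" neighbourhood: count listings per neighbourhood and
        # write the aggregate to a second table.
        (
            rows
            | 'KeyByNeighbourhood' >> beam.Map(lambda r: (r['neighbourhood'], 1))
            | 'CountListings' >> beam.CombinePerKey(sum)
            | 'ToTableRow' >> beam.Map(
                lambda kv: {'neighbourhood': kv[0], 'listing_count': kv[1]})
            | 'WriteCounts' >> beam.io.WriteToBigQuery(
                '%s:listings.count_by_neighbourhood' % PROJECT,
                schema='neighbourhood:STRING,listing_count:INTEGER',
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
        )


if __name__ == '__main__':
    run()
```

The commands below set up the project, bucket, dataset, and Python environment, then run the pipeline locally and on Dataflow.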
- gcloud auth list
- gcloud config list project
- export PROJECT=""
- gcloud config set project $PROJECT
- gsutil mb -c regional -l us-east4 gs://$PROJECT
- gsutil cp ./datafilename.csv gs://$PROJECT/
- bq mk <dataset_name> (create the BigQuery dataset that will hold the output tables; a Python equivalent is sketched below)
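
If you prefer to create the dataset from Python instead of `bq mk`, the BigQuery client library can do the same thing. The project id and the dataset name `listings` below are assumptions chosen to match the pipeline sketch above:

```python
from google.cloud import bigquery

PROJECT = 'your-project-id'  # same value as $PROJECT above (assumed)

# The client picks up the service account from GOOGLE_APPLICATION_CREDENTIALS.
client = bigquery.Client(project=PROJECT)

dataset = bigquery.Dataset('{}.listings'.format(PROJECT))  # assumed dataset name
dataset.location = 'US'
client.create_dataset(dataset, exists_ok=True)  # no-op if it already exists
```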
- export GOOGLE_APPLICATION_CREDENTIALS="/filename.json"
- bq show -j --project_id=<project_id> <job_id>
- python2.7 -m virtualenv env
- source env/bin/activate
- deactivate (after job)
- pip install -r requirements.txt (environment setup)
- pip install apache-beam[gcp]
- python local_directrunner_pipeline.py
- python dataflow_pipeline.py \
    --project=$PROJECT \
    --runner=DataflowRunner \
    --staging_location=gs://$PROJECT/temp \
    --temp_location=gs://$PROJECT/temp \
    --input=gs://$PROJECT/datafilename.csv \
    --save_main_session
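
For reference, this is roughly how `dataflow_pipeline.py` could pick up those flags: the custom `--input` flag via argparse, with `--project`, `--runner`, `--staging_location`, `--temp_location`, and `--save_main_session` passed straight through to Beam's `PipelineOptions`. The exact layout of the script is an assumption:

```python
import argparse

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


def run(argv=None):
    parser = argparse.ArgumentParser()
    # Only --input is a custom flag here (assumed); the remaining flags are
    # standard Beam/Dataflow options handled by PipelineOptions.
    parser.add_argument('--input', required=True,
                        help='GCS path of the CSV, e.g. gs://<bucket>/datafilename.csv')
    known_args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args)
    with beam.Pipeline(options=options) as p:
        p | 'ReadCSV' >> beam.io.ReadFromText(known_args.input, skip_header_lines=1)
        # ...parse, count by neighbourhood, write to BigQuery as in the sketch above...


if __name__ == '__main__':
    run()
```

Note that `--save_main_session` matters when functions defined in the main module (like the parsing helper in the sketch above) have to be pickled and shipped to the Dataflow workers.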