-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathreinvent_hudi_queries.hql
82 lines (79 loc) · 2.13 KB
/
reinvent_hudi_queries.hql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# This is the source data (publicly available)
CREATE EXTERNAL TABLE amazon_reviews_parquet(
marketplace string,
customer_id string,
review_id string,
product_id string,
product_parent string,
product_title string,
star_rating int,
helpful_votes int,
total_votes int,
vine string,
verified_purchase string,
review_headline string,
review_body string,
review_date date,
year int)
PARTITIONED BY (product_category string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
's3://amazon-reviews-pds/parquet/'
# Load partitions.
MSCK REPAIR TABLE amazon_reviews_parquet;
# This is the output table that will be used to feed the demo
CREATE EXTERNAL TABLE amazon_reviews_parquet_for_demo (
marketplace string,
customer_id string,
review_id string,
product_id string,
product_parent string,
product_title string,
star_rating int,
helpful_votes int,
total_votes int,
vine string,
verified_purchase string,
review_headline string,
review_body string,
review_date string,
year int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
's3://<REPLACE WITH YOUR OWN BUCKET>/parquet/product_category=Home_Improvement/'
;
INSERT OVERWRITE TABLE amazon_reviews_parquet_for_demo
SELECT
marketplace,
customer_id,
review_id,
product_id,
product_parent,
product_title,
#This is to simulate some bad data.
case when customer_id = "42351637" then
100
else
star_rating
end as star_rating,
helpful_votes,
total_votes,
vine,
verified_purchase,
review_headline,
review_body,
review_date,
year
FROM amazon_reviews_parquet
#Select only one year and one category of data.
WHERE product_category='Home_Improvement' and year = 2015;