-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTMDB_movies_query.sql
401 lines (321 loc) · 8.88 KB
/
TMDB_movies_query.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
## 1/ Identify trends in movie release dates and analyze their impact on revenue.
---
SELECT
EXTRACT(YEAR FROM release_date) AS release_year,
COUNT(*) AS movie_count,
AVG(revenue) AS average_revenue
FROM
movies
WHERE
EXTRACT(YEAR FROM release_date) BETWEEN 1865 AND 2023
GROUP BY
release_year
ORDER BY
release_year;
---
SELECT
r.release_year,
r.movie_count,
t.average_revenue
FROM
(
SELECT
EXTRACT(YEAR FROM release_date) AS release_year,
COUNT(*) AS movie_count
FROM
movies
GROUP BY
release_year
) r
LEFT JOIN (
SELECT
EXTRACT(YEAR FROM release_date) AS release_year,
AVG(revenue) AS average_revenue
FROM
movies
GROUP BY
release_year
) t ON r.release_year = t.release_year
ORDER BY
r.release_year;
## 2/ Analyze the realationship between budget, revenue, and popularity to determine factors
that contribute to a movies success. (solution_1)
--- Calculate Basic Statistics:
SELECT
COUNT(*) AS total_movies,
AVG(budget) AS avg_budget,
AVG(revenue) AS avg_revenue,
AVG(popularity) AS avg_popularity,
MIN(budget) AS min_budget,
MAX(budget) AS max_budget,
MIN(revenue) AS min_revenue,
MAX(revenue) AS max_revenue,
MIN(popularity) AS min_popularity,
MAX(popularity) AS max_popularity
FROM movies;
--- Correlation Analysis:
SELECT
CORR(budget, revenue) AS budget_revenue_correlation,
CORR(budget, popularity) AS budget_popularity_correlation,
CORR(revenue, popularity) AS revenue_popularity_correlation
FROM movies;
--- Determine Success Factors:
WITH successful_movies AS (
SELECT *
FROM movies
WHERE revenue > (SELECT PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY revenue) FROM movies)
)
SELECT
AVG(budget) AS avg_budget_successful,
AVG(revenue) AS avg_revenue_successful,
AVG(popularity) AS avg_popularity_successful
FROM successful_movies;
WITH unsuccessful_movies AS (
SELECT *
FROM movies
WHERE revenue <= (SELECT PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY revenue) FROM movies)
)
SELECT
AVG(budget) AS avg_budget_unsuccessful,
AVG(revenue) AS avg_revenue_unsuccessful,
AVG(popularity) AS avg_popularity_unsuccessful
FROM unsuccessful_movies;
--- (solution_2)
SELECT
AVG(revenue) AS avg_revenue,
AVG(budget) AS avg_budget,
AVG(popularity) AS avg_popularity
FROM movies
WHERE status = 'Released' -- Consider only released movies (you can adjust this condition as needed);
## 3/ Explopre the impact of movie genres on popularity and revenue.
WITH genre_split AS (
SELECT id, unnest(string_to_array(genres, ', ')) AS genre
FROM movies
)
SELECT
genre,
AVG(popularity) AS average_popularity,
AVG(revenue) AS average_revenue
FROM
genre_split
JOIN
movies ON genre_split.id = movies.id
GROUP BY
genre
ORDER BY
average_popularity DESC, average_revenue DESC;
--- Explopre the impact of movie genres on popularity and revenue and average_vote.
SELECT
g.genre_name AS genre,
AVG(m.vote_average) AS average_vote,
SUM(m.revenue) AS total_revenue,
AVG(m.popularity) AS average_popularity
FROM
movies m
JOIN (
SELECT DISTINCT id, unnest(string_to_array(genres, ', ')) AS genre_name
FROM movies
) g ON g.id = m.id
GROUP BY g.genre_name
ORDER BY total_revenue DESC;
## 4/ Identify successful production companies and analyze their strategies.
--- Calculate the total revenue for each production company
SELECT
production_companies,
SUM(revenue) AS total_revenue
FROM
movies
where
production_companies is not null
GROUP BY
production_companies
ORDER BY
total_revenue DESC;
--- identifies production companies with an average vote greater than 7
and a revenue greater than $100 million.
SELECT
production_companies,
AVG(vote_average) AS average_vote,
SUM(revenue) AS total_revenue,
COUNT(*) AS movie_count
FROM movies
WHERE vote_average > 7 AND revenue > 100000000
GROUP BY production_companies
HAVING COUNT(*) > 5 -- You can adjust this threshold based on your definition of success
ORDER BY total_revenue DESC;
--- Identify (top 30/20/50) Successful Production Companies.
SELECT
production_companies,
SUM(revenue) AS total_revenue
FROM movies
where production_companies is not null
GROUP BY production_companies
ORDER BY total_revenue DESC
LIMIT 50;
--- Analyze Their Strategies.
WITH top_production_companies AS (
SELECT
production_companies AS company,
SUM(revenue) AS total_revenue
FROM movies
GROUP BY company
ORDER BY total_revenue DESC
LIMIT 50
)
SELECT
tpc.company,
COUNT(*) AS movie_count,
AVG(vote_average) AS average_vote,
AVG(popularity) AS average_popularity,
MAX(tpc.total_revenue) AS total_revenue
FROM movies AS m
JOIN top_production_companies AS tpc
ON m.production_companies = tpc.company
GROUP BY tpc.company
ORDER BY total_revenue DESC;
--- Identify Successful Production Companies (only released movies).
WITH successful_movies AS (
SELECT
id,
title,
vote_average,
revenue,
production_companies
FROM
movies
WHERE
status = 'Released'
AND revenue > 0
)
SELECT
pc.company_name,
COUNT(sm.id) AS total_movies,
AVG(sm.vote_average) AS average_vote,
SUM(sm.revenue) AS total_revenue
FROM
successful_movies AS sm
CROSS JOIN LATERAL unnest(string_to_array(sm.production_companies, '|')) AS pc(company_name)
GROUP BY
pc.company_name
ORDER BY
total_revenue DESC;
--- Analyze Their Strategies (only released movies).
WITH ProductionCompanyRevenue AS (
SELECT
production_companies AS company,
SUM(revenue) AS total_revenue
FROM
movies
WHERE
status = 'Released' -- Consider only released movies
GROUP BY
production_companies
)
SELECT
company,
SUM(total_revenue) AS total_revenue,
COUNT(DISTINCT id) AS total_movies_produced,
AVG(vote_average) AS average_vote,
AVG(runtime) AS average_runtime
FROM
movies
JOIN ProductionCompanyRevenue ON movies.production_companies = ProductionCompanyRevenue.company
WHERE
status = 'Released' -- Consider only released movies
GROUP BY
company
ORDER BY
total_revenue DESC
LIMIT 30; --- you can change LIMIT value.
## 5/ Investigate the correlation between runtime and audience engagement
--- correlation between runtime and vote average:
SELECT CORR(runtime, vote_average) AS correlation
FROM movies;
--- correlation between runtime and vote_count:
SELECT CORR(runtime, vote_count) AS correlation
FROM movies;
## 6/ utilize naturle language processing techniques to extract meaningful insights from movie overviews.
SELECT id, regexp_split_to_table(overview, E'\\s+') AS token
FROM movies;
SELECT token, COUNT(*) AS frequency
FROM (
SELECT id, regexp_split_to_table(overview, E'\\s+') AS token
FROM movies
) AS tokenized
WHERE token != ''
GROUP BY token
ORDER BY frequency DESC;
## 7/ Visualize movie popularity over time and identify popular genres in different periods.
-- Step 2: Extract the year from the release_date
SELECT
EXTRACT(YEAR FROM release_date) AS release_year,
popularity
INTO
temp_popularity
FROM
movies;
-- Step 3: Group by year and calculate popularity
SELECT
release_year,
SUM(popularity) AS total_popularity
FROM
temp_popularity
where
release_year is not null
GROUP BY
release_year
ORDER BY
release_year;
-- Step 4: Identify popular genres
WITH genre_popularity AS (
SELECT
EXTRACT(YEAR FROM m.release_date) AS release_year,
genre AS genres,
SUM(m.popularity) AS genre_popularity
FROM
movies m
CROSS JOIN LATERAL unnest(string_to_array(m.genres, ',')) AS genre
GROUP BY
release_year, genre
)
-- Step 5: Data Visualization
SELECT
release_year,
genres,
SUM(genre_popularity) AS total_genre_popularity
FROM
genre_popularity
where release_year is not null
GROUP BY
release_year, genres
ORDER BY
release_year, total_genre_popularity DESC;
## 8/ top 50 movies by revenue up to 1000000, and vote_average (10, 9, 8)
SELECT
title,
vote_average,
status,
release_date,
revenue,
runtime,
adult,
tagline,
genres,
production_companies
FROM movies
WHERE
title IS NOT NULL
AND popularity IS NOT NULL
AND runtime IS NOT NULL
AND status IS NOT NULL
AND release_date IS NOT NULL
AND revenue IS NOT NULL
AND adult IS NOT NULL
AND tagline IS NOT NULL
AND genres IS NOT NULL
AND production_companies IS NOT NULL
AND revenue >= 1000000
AND vote_average IN (10, 9, 8)
AND runtime >= 100
ORDER BY vote_average DESC, popularity DESC, runtime DESC
LIMIT 50;