-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbikeshare.py
272 lines (211 loc) · 10.5 KB
/
bikeshare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
import time
import pandas as pd
import numpy as np
#Lookup table from each accepted spelling of a city name to the CSV file
#that holds that city's trip data
CITY_DATA = {
    'chicago': 'chicago.csv',
    'Chicago': 'chicago.csv',
    'New York City': 'new_york_city.csv',
    'New york city': 'new_york_city.csv',
    'new york city': 'new_york_city.csv',
    'washington': 'washington.csv',
    'Washington': 'washington.csv',
}
#Function to figure out the filtering requirements of the user
def _prompt_choice(intro_lines, valid_choices, retry_lines):
    """Ask on stdin until the user types one of valid_choices.

    Args:
        intro_lines: strings printed, in order, before every read.
        valid_choices: container of accepted lower-case answers.
        retry_lines: strings printed, in order, after a rejected answer.

    Returns:
        The accepted answer, stripped of surrounding whitespace and
        lower-cased.
    """
    while True:
        for line in intro_lines:
            print(line)
        #Surrounding whitespace is dropped and the text lower-cased so that
        #e.g. ' Chicago ' is treated the same as 'chicago'
        choice = input().strip().lower()
        if choice in valid_choices:
            return choice
        for line in retry_lines:
            print(line)


def get_filters():
    """Interactively ask which city, month and day the user wants to analyse.

    Each question is repeated until an accepted answer is typed.

    Returns:
        Tuple (city, month, day) of lower-case strings: city is a key of
        CITY_DATA, month is a month name from january to june or 'all',
        day is a weekday name or 'all'.
    """
    print('Hello! I am Essam Let\'s explore some US bikeshare data!')

    #Answers are lower-cased before the lookup, so only the lower-case keys
    #of CITY_DATA can ever match here
    city = _prompt_choice(
        (
            "\nWelcome to my program. Please choose your city:",
            "\n1. Chicago 2. New York City 3. Washington",
            "\nAccepted input:\nFull name of city; not case sensitive (e.g. chicago or CHICAGO).\nFull name in title case (e.g. Chicago).",
        ),
        CITY_DATA,
        (
            "\nPlease check your input, it doesn\'t appear to be conforming to any of the accepted input formats.",
            "\nRestarting...",
        ),
    )
    print(f"\nYou have chosen {city.title()} as your city.")

    #All accepted month answers, including the 'all' option
    month_choices = ('january', 'february', 'march', 'april', 'may', 'june', 'all')
    month = _prompt_choice(
        (
            "\nPlease enter the month, between January to June, for which you're seeking the data:",
            "\nAccepted input:\nFull month name; not case sensitive (e.g. january or JANUARY).\nFull month name in title case (e.g. April).",
            "\n(You may also opt to view data for all months, please type 'all' or 'All' or 'ALL' for that.)",
        ),
        month_choices,
        (
            "\nInvalid input. Please try again in the accepted input format.",
            "\nRestarting...",
        ),
    )
    print(f"\nYou have chosen {month.title()} as your month.")

    #All accepted day answers, including the 'all' option
    day_choices = ('all', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday')
    day = _prompt_choice(
        (
            "\nPlease enter a day in the week of your choice for which you're seeking the data:",
            "\nAccepted input:\nDay name; not case sensitive (e.g. monday or MONDAY).\nDay name in title case (e.g. Monday).",
            "\n(You can also put 'all' or 'All' to view data for all days in a week.)",
        ),
        day_choices,
        (
            "\nInvalid input. Please try again in one of the accepted input formats.",
            "\nRestarting...",
        ),
    )
    print(f"\nYou have chosen {day.title()} as your day.")

    print(f"\nYou have chosen to view data for city: {city.upper()}, month/s: {month.upper()} and day/s: {day.upper()}.")
    print('-'*80)
    #Returning the city, month and day selections
    return city, month, day
#Function to load data from .csv files
def load_data(city, month, day):
    """Read one city's CSV file and filter its trips by month and weekday.

    Args:
        city: key of CITY_DATA selecting the CSV file to read.
        month: 'all' for no month filter, otherwise a lower-case month name
            from 'january' to 'june'.
        day: 'all' for no day filter, otherwise a weekday name; it is
            title-cased before being compared.

    Returns:
        DataFrame of the matching rows, with 'Start Time' converted to
        datetime and two extra columns: 'month' (number) and
        'day_of_week' (weekday name).

    Raises:
        KeyError: if city is not a key of CITY_DATA.
        ValueError: if month is neither 'all' nor one of the six month names.
    """
    #Load data for city
    print("\nLoading data...")
    df = pd.read_csv(CITY_DATA[city])

    #Convert the Start Time column to datetime
    df['Start Time'] = pd.to_datetime(df['Start Time'])

    #Extract month and day of week from Start Time to create new columns
    df['month'] = df['Start Time'].dt.month
    #Series.dt.weekday_name was removed in pandas 1.0; day_name() is the
    #supported accessor and yields the same names (e.g. 'Monday')
    df['day_of_week'] = df['Start Time'].dt.day_name()

    #Filter by month if applicable
    if month != 'all':
        #Position in this list + 1 is the calendar month number
        months = ['january', 'february', 'march', 'april', 'may', 'june']
        month_number = months.index(month) + 1
        df = df[df['month'] == month_number]

    #Filter by day of week if applicable
    if day != 'all':
        df = df[df['day_of_week'] == day.title()]

    #Returns the selected file as a dataframe (df) with relevant columns
    return df
#Function to calculate all the time-related statistics for the chosen data
def time_stats(df):
    """Print the most common month, weekday and start hour of the trips.

    Args:
        df: DataFrame with 'month', 'day_of_week' and datetime
            'Start Time' columns. It is only read, never modified.

    Returns:
        None; all results are printed.
    """
    print('\nCalculating The Most Frequent Times of Travel...\n')
    start_time = time.time()

    if df.empty:
        #mode() of an empty column has no first element to report
        print("No trips match the selected filters, so there are no time statistics to show.")
    else:
        #Uses mode method to find the most popular month
        popular_month = df['month'].mode()[0]
        print(f"Most Popular Month (1 = January,...,6 = June): {popular_month}")

        #Uses mode method to find the most popular day
        popular_day = df['day_of_week'].mode()[0]
        print(f"\nMost Popular Day: {popular_day}")

        #The hour is derived on the fly rather than stored as a new column,
        #so the caller's frame is left untouched
        popular_hour = df['Start Time'].dt.hour.mode()[0]
        print(f"\nMost Popular Start Hour: {popular_hour}")

    #Prints the time taken to perform the calculation
    print(f"\nThis took {(time.time() - start_time)} seconds.")
    print('-'*80)
#Function to calculate station related statistics
def station_stats(df):
    """Print the most used start station, end station and start-to-end trip.

    Args:
        df: DataFrame with 'Start Station' and 'End Station' columns.
            It is only read, never modified.

    Returns:
        None; all results are printed.
    """
    print('\nCalculating The Most Popular Stations and Trip...\n')
    start_time = time.time()

    if df.empty:
        #mode() of an empty column has no first element to report
        print("No trips match the selected filters, so there are no station statistics to show.")
    else:
        #Uses mode method to find the most common start station
        common_start_station = df['Start Station'].mode()[0]
        print(f"The most commonly used start station: {common_start_station}")

        #Uses mode method to find the most common end station
        common_end_station = df['End Station'].mode()[0]
        print(f"\nThe most commonly used end station: {common_end_station}")

        #Joins the two station columns into one 'start to end' label per
        #trip; the labels live in a local Series instead of a new df column
        #so the caller's frame is left untouched
        trips = df['Start Station'].str.cat(df['End Station'], sep=' to ')
        combo = trips.mode()[0]
        print(f"\nThe most frequent combination of trips are from {combo}.")

    print(f"\nThis took {(time.time() - start_time)} seconds.")
    print('-'*80)
#Function for trip duration related statistics
def trip_duration_stats(df):
    """Print the total and the average trip duration.

    Args:
        df: DataFrame with a numeric 'Trip Duration' column; the values are
            treated as seconds when split into hours, minutes and seconds.

    Returns:
        None; all results are printed.
    """
    print('\nCalculating Trip Duration...\n')
    start_time = time.time()

    durations = df['Trip Duration']
    if durations.count() == 0:
        #With no usable values the mean is NaN and cannot be rounded
        print("No trip durations are available for the selected filters.")
    else:
        #Whole seconds, so float columns do not print as e.g. '2.0 hours'
        total_duration = int(round(durations.sum()))
        #Finds out the duration in minutes and seconds format
        minute, second = divmod(total_duration, 60)
        #Finds out the duration in hour and minutes format
        hour, minute = divmod(minute, 60)
        print(f"The total trip duration is {hour} hours, {minute} minutes and {second} seconds.")

        #Calculating the average trip duration using mean method
        average_duration = int(round(durations.mean()))
        #Finds the average duration in minutes and seconds format
        mins, sec = divmod(average_duration, 60)
        #From 60 minutes upwards (inclusive) the hours are split out, so an
        #exact hour is not reported as '60 minutes'
        if mins >= 60:
            hrs, mins = divmod(mins, 60)
            print(f"\nThe average trip duration is {hrs} hours, {mins} minutes and {sec} seconds.")
        else:
            print(f"\nThe average trip duration is {mins} minutes and {sec} seconds.")

    print(f"\nThis took {(time.time() - start_time)} seconds.")
    print('-'*80)
#Function to calculate user statistics
def user_stats(df):
    """Print user counts by type and, when available, gender and birth years.

    Args:
        df: DataFrame with a 'User Type' column. The 'Gender' and
            'Birth Year' columns are optional; a notice is printed for each
            one that is missing.

    Returns:
        None; all results are printed.

    Raises:
        KeyError: if df has no 'User Type' column.
    """
    print('\nCalculating User Stats...\n')
    start_time = time.time()

    #The total users are counted using value_counts method
    #They are then displayed by their types (e.g. Subscriber or Customer)
    user_type = df['User Type'].value_counts()
    print(f"The types of users by number are given below:\n\n{user_type}")

    #Not every df has a Gender column, so its presence is checked
    #explicitly instead of catching every possible exception
    if 'Gender' in df.columns:
        gender = df['Gender'].value_counts()
        print(f"\nThe types of users by gender are given below:\n\n{gender}")
    else:
        print("\nThere is no 'Gender' column in this file.")

    #Birth years are reported only when the column exists and holds at
    #least one non-missing value
    if 'Birth Year' in df.columns:
        birth_years = df['Birth Year'].dropna()
    else:
        birth_years = None
    if birth_years is None or birth_years.empty:
        print("There are no birth year details in this file.")
    else:
        earliest = int(birth_years.min())
        recent = int(birth_years.max())
        common_year = int(birth_years.mode()[0])
        print(f"\nThe earliest year of birth: {earliest}\n\nThe most recent year of birth: {recent}\n\nThe most common year of birth: {common_year}")

    print(f"\nThis took {(time.time() - start_time)} seconds.")
    print('-'*80)
#Function to display the data frame itself as per user request
def display_data(df):
    """Show the raw rows of df five at a time for as long as the user asks.

    The first question is repeated until 'yes' or 'no' is typed. After
    that, any answer other than 'yes' ends the paging, and paging also
    ends once every row has been shown.

    Args:
        df: DataFrame whose rows are printed.

    Returns:
        None; rows and prompts are printed.
    """
    rows_per_page = 5
    accepted = ('yes', 'no')
    answer = ''

    while answer not in accepted:
        print("\nDo you wish to view the raw data?")
        print("\nAccepted responses:\nYes or yes\nNo or no")
        #Surrounding whitespace is ignored so 'yes ' still counts as 'yes'
        answer = input().strip().lower()
        #the raw data from the df is displayed if user opts for it
        if answer == "yes":
            print(df.head(rows_per_page))
        elif answer not in accepted:
            print("\nPlease check your input.")
            print("Input does not seem to match any of the accepted responses.")
            print("\nRestarting...\n")

    #Position of the first row that has not been shown yet
    offset = rows_per_page
    #Keep offering more rows only while some are actually left
    while answer == 'yes' and offset < len(df):
        print("Do you wish to view more raw data?")
        answer = input().strip().lower()
        #If user opts for it, this displays next 5 rows of data
        if answer == "yes":
            print(df.iloc[offset:offset + rows_per_page])
            offset += rows_per_page

    #Still 'yes' here means the loop stopped because the rows ran out
    if answer == 'yes':
        print("\nThere is no more raw data to display.")

    print('-'*80)
#Main function to call all the previous functions
def main():
    """Run the interactive session: ask for filters, load the data, print
    every report, and repeat for as long as the user answers 'yes'.
    """
    #Reports are run in this order on each freshly loaded frame
    reports = (
        display_data,
        time_stats,
        station_stats,
        trip_duration_stats,
        user_stats,
    )

    answer = 'yes'
    while answer == 'yes':
        city, month, day = get_filters()
        df = load_data(city, month, day)
        for report in reports:
            report(df)

        #Any reply other than 'yes' (in any letter case) ends the session
        answer = input('\nWould you like to restart? Enter yes or no.\n').lower()


if __name__ == "__main__":
    main()