-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWeatherLinearRegression.py
148 lines (118 loc) · 5.71 KB
/
WeatherLinearRegression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from datetime import datetime
from pandas.plotting import register_matplotlib_converters
#import globals
from model5 import temp
# Widen pandas' console output so printed frames (e.g. DataFrame.head())
# show up to 30 columns on lines up to 2000 characters wide.
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 2000)
class Reg:
    """Linear-regression pipeline over the Austin weather CSV.

    The three stages must be called in order, because each one stores its
    results on the instance for the next one to read:

    1. ``data_preprocessing``  -> sets ``self.data`` and ``self.events``
    2. ``Feature_Extraction``  -> sets ``self.X``, ``self.Y``, ``self.days``
       and ``self.day_index``
    3. ``regression_model``    -> fits the model, prints a prediction and
       shows the plots (also sets ``self.x_vis``)

    NOTE(review): the regression target ``self.Y`` is the numeric weather
    *event code*, not the precipitation amount, although the printed
    messages and plot labels talk about precipitation. The strings are kept
    unchanged so existing output stays the same; confirm which one is meant.
    """

    # Row highlighted in red on every plot. Clamped to the number of rows in
    # Feature_Extraction so short datasets do not raise IndexError.
    HIGHLIGHT_DAY = 798

    # Numeric codes for the raw 'Events' strings. ``replace`` matches cell
    # values exactly, so the space-padded spellings are listed explicitly.
    # Any value not listed here is left untouched.
    EVENT_CODES = {
        ' ': 0,
        'None': 0,
        'Thunderstorm': 1,
        ' Thunderstorm': 1,
        'Snow': 2,
        'Rain': 3,
        'Rain ': 3,
        'Fog': 4,
        'Fog ': 4,
    }

    # Columns used as model inputs, in the order the model is trained on.
    FEATURE_COLUMNS = (
        'TempAvgF',
        'DewPointAvgF',
        'HumidityAvgPercent',
        'SeaLevelPressureAvgInches',
        'VisibilityAvgMiles',
        'WindAvgMPH',
        'PrecipitationSumInches',
    )

    def data_preprocessing(self, source_path='./austin_weather2.csv',
                           cleaned_path='./austin_final.csv'):
        """Load the raw CSV, encode the events and clean the feature columns.

        Args:
            source_path: CSV file to read the raw weather data from.
            cleaned_path: CSV file the cleaned data is written to and then
                read back from.

        Side effects:
            Prints the first rows of the raw data, writes ``cleaned_path``,
            and sets ``self.events`` (one-column DataFrame of event codes)
            and ``self.data`` (cleaned frame re-read from ``cleaned_path``).
        """
        self.data = pd.read_csv(source_path)
        print("\n", self.data.head())

        # Encode the weather events as numbers in a single pass.
        self.events = self.data[['Events']].replace(self.EVENT_CODES)

        # Drop the columns that are not used by the model.
        self.data = self.data.drop(
            ['Events', 'Date', 'SeaLevelPressureHighInches',
             'SeaLevelPressureLowInches'],
            axis=1,
        )

        # 'T' denotes trace rainfall and '-' denotes a missing value; both
        # are replaced with 0.0 so the columns can be used in the model.
        self.data = self.data.replace(['T', '-'], 0.0)

        # Save the cleaned data and read it back, so pandas re-parses the
        # columns now that the placeholder strings are gone. The index is
        # written too, so the re-read frame has one extra leading column;
        # it is ignored later because features are selected by name.
        self.data.to_csv(cleaned_path)
        self.data = pd.read_csv(cleaned_path)

    def Feature_Extraction(self):
        """Build the feature matrix, the label vector and the plot helpers.

        Reads ``self.data`` and ``self.events`` (set by
        ``data_preprocessing``) and sets:

        * ``self.X``: the ``FEATURE_COLUMNS`` of ``self.data``
        * ``self.Y``: the event codes reshaped to a 2-D ``(n, 1)`` array
        * ``self.days``: ``[0, 1, ..., n - 1]``, the x-axis of the plots
        * ``self.day_index``: position of the day highlighted in the plots
        """
        # The features, i.e. the 'x' values used to train the model.
        self.X = self.data[list(self.FEATURE_COLUMNS)]

        # The label, reshaped into a 2-D column vector.
        self.Y = self.events.values.reshape(-1, 1)

        self.days = list(range(self.Y.size))

        # One fixed day is tracked across all plots; clamp it so datasets
        # with fewer rows than HIGHLIGHT_DAY + 1 still work.
        self.day_index = min(self.HIGHLIGHT_DAY, max(self.Y.size - 1, 0))

    def regression_model(self):
        """Fit the linear model, print one prediction and show the plots.

        Reads ``self.X``, ``self.Y``, ``self.days`` and ``self.day_index``
        (set by ``Feature_Extraction``) and the module-level ``temp``
        imported from ``model5``. Sets ``self.x_vis`` to the plotted feature
        columns. Blocks on ``plt.show()`` twice.
        """
        clf = LinearRegression()
        print("lenght of Self.x", len(self.X))
        print("lenght of Self.y", len(self.Y))
        clf.fit(self.X, self.Y)

        # np.delete without an axis flattens `temp` and drops its first
        # element; the rest is fed to the model as a single sample, so it
        # must hold one value per feature column.
        # NOTE(review): presumably `temp` is a forecast vector whose first
        # entry is not a feature - confirm against model5.
        new_yhat = np.delete(temp, 0).reshape(1, -1)
        print('The precipitation in inches for the input is:',
              clf.predict(new_yhat))

        # Plot the label for every day; the tracked day is drawn in red.
        print("the precipitation trend graph: ")
        plt.scatter(self.days, self.Y, color='g')
        plt.scatter(self.days[self.day_index], self.Y[self.day_index],
                    color='r')
        plt.title("Precipitation level")
        plt.xlabel("Days")
        plt.ylabel("Precipitation in inches")
        plt.show()

        self.x_vis = self.X.filter(
            ['TempAvgF', 'DewPointAvgF', 'HumidityAvgPercent',
             'SeaLevelPressureAvgInches', 'VisibilityAvgMiles',
             'WindAvgMPH'],
            axis=1,
        )

        # Plot each selected feature against the days in a 3x2 grid, again
        # marking the tracked day in red.
        print("Precipitation vs selected attributes graph: ")
        for position, column in enumerate(self.x_vis.columns, start=1):
            series = self.x_vis[column]
            plt.subplot(3, 2, position)
            plt.scatter(self.days, series, color='g')
            # Positional lookup, to match the positional `days` axis.
            plt.scatter(self.days[self.day_index],
                        series.iloc[self.day_index], color='r')
            plt.title(column)
        plt.show()
# A day (in red) having precipitation of about 2 inches is tracked across multiple parameters
# (the same day is tracked across multiple features such as temperature, pressure, etc.).
# The x-axis denotes the days and the y-axis denotes the magnitude of the feature such as temperature, pressure, etc.
# From the graph, it can be observed that rainfall can be expected to be high when the temperature is high and humidity is high.
# Run the full pipeline: load and clean the CSV, build the features and
# labels, then fit the model and show its plots. The instance gets its own
# name so the class `Reg` is not overwritten by its instance.
reg = Reg()
reg.data_preprocessing()
reg.Feature_Extraction()
reg.regression_model()

# Generate a time-series graph: the first CSV column is parsed as dates and
# used as the index, and every numeric column is plotted against it.
# NOTE(review): this reads austin_weather.csv, not the austin_weather2.csv
# used by the pipeline above, and the y-axis is labelled "Temperature"
# although all columns are plotted - confirm both are intended.
df = pd.read_csv('./austin_weather.csv', header=0, index_col=0,
                 parse_dates=True)
df.plot()
plt.title("Time Stamp")
plt.ylabel("Temperature")
plt.show()