Chapter 05 prep completed.

liuyubobobo · Dec 7, 2017 · 69c6ab4 · 69c6ab4
1 parent c2c3345
commit 69c6ab4
Show file tree

Hide file tree

Showing 12 changed files with 1,077 additions and 1 deletion.
diff --git a/05-Linear-Regression/05-R-Squared/05-R-Squared.ipynb b/05-Linear-Regression/05-R-Squared/05-R-Squared.ipynb
@@ -29,7 +29,7 @@
    "outputs": [],
    "source": [
     "boston = datasets.load_boston()\n",
-    "x = boston.data[:,5] # 只使用房间数量这个维度\n",
+    "x = boston.data[:,5] # 只使用房间数量这个特征\n",
     "y = boston.target\n",
     "\n",
     "x = x[y < 50.0]\n",

diff --git a/...egression/07-Linear-Regression-in-scikit-learn/07-Linear-Regression-in-scikit-learn.ipynb b/...egression/07-Linear-Regression-in-scikit-learn/07-Linear-Regression-in-scikit-learn.ipynb
@@ -0,0 +1,283 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## scikit-learn 中的 Linear Regression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn import datasets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "boston = datasets.load_boston()\n",
+    "\n",
+    "X = boston.data\n",
+    "y = boston.target\n",
+    "\n",
+    "X = X[y < 50.0]\n",
+    "y = y[y < 50.0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(490, 13)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from playML.model_selection import train_test_split\n",
+    "\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 使用我们自己制作的 Linear Regression\n",
+    "\n",
+    "代码参见 [这里](playML/LinearRegression.py)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "LinearRegression()"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from playML.LinearRegression import LinearRegression\n",
+    "\n",
+    "reg = LinearRegression()\n",
+    "reg.fit_normal(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ -1.18919477e-01,   3.63991462e-02,  -3.56494193e-02,\n",
+       "         5.66737830e-02,  -1.16195486e+01,   3.42022185e+00,\n",
+       "        -2.31470282e-02,  -1.19509560e+00,   2.59339091e-01,\n",
+       "        -1.40112724e-02,  -8.36521175e-01,   7.92283639e-03,\n",
+       "        -3.81966137e-01])"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "reg.coef_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "34.161435496224712"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "reg.intercept_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.81298026026584658"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "reg.score(X_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### scikit-learn中的线性回归"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.linear_model import LinearRegression\n",
+    "\n",
+    "sk_reg = LinearRegression()\n",
+    "sk_reg.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ -1.18919477e-01,   3.63991462e-02,  -3.56494193e-02,\n",
+       "         5.66737830e-02,  -1.16195486e+01,   3.42022185e+00,\n",
+       "        -2.31470282e-02,  -1.19509560e+00,   2.59339091e-01,\n",
+       "        -1.40112724e-02,  -8.36521175e-01,   7.92283639e-03,\n",
+       "        -3.81966137e-01])"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sk_reg.coef_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "34.161435496246924"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sk_reg.intercept_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.81298026026584758"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sk_reg.score(X_test, y_test)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/05-Linear-Regression/07-Linear-Regression-in-scikit-learn/playML/LinearRegression.py b/05-Linear-Regression/07-Linear-Regression-in-scikit-learn/playML/LinearRegression.py
@@ -0,0 +1,43 @@
+import numpy as np
+from .metrics import r2_score
+
+
+class LinearRegression:
+    """Linear regression over several features, fitted with the normal equation.
+
+    After ``fit_normal`` the solved parameter vector is kept in ``_theta``;
+    ``intercept_`` is its first element and ``coef_`` is the remainder.
+    """
+
+    def __init__(self):
+        """Create an unfitted Linear Regression model (all attributes None)."""
+        self.coef_ = None
+        self.intercept_ = None
+        # Full parameter vector; fit_normal stores the intercept at index 0.
+        self._theta = None
+
+    def fit_normal(self, X_train, y_train):
+        """Train the model on X_train, y_train using the normal equation.
+
+        Solves theta = inv(X_b^T X_b) X_b^T y, where X_b is X_train with a
+        column of ones prepended, then splits theta into ``intercept_`` and
+        ``coef_``.
+
+        Returns:
+            self, so the call can be chained.
+
+        Raises:
+            AssertionError: if X_train and y_train differ in their first
+                dimension (only while assertions are enabled).
+        """
+        assert X_train.shape[0] == y_train.shape[0], \
+            "the size of X_train must be equal to the size of y_train"
+
+        # Prepend a column of ones so the intercept is learned as theta[0].
+        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
+        # NOTE(review): np.linalg.inv raises LinAlgError when X_b^T X_b is
+        # singular (e.g. perfectly collinear features). np.linalg.pinv or
+        # np.linalg.lstsq may be more robust -- confirm before changing, as
+        # the low-order digits of the result would differ.
+        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
+
+        self.intercept_ = self._theta[0]
+        self.coef_ = self._theta[1:]
+
+        return self
+
+    def predict(self, X_predict):
+        """Return the vector of predictions for the samples in X_predict.
+
+        X_predict is indexed with ``shape[1]``, so it must be 2-D with one
+        column per fitted coefficient.
+
+        Raises:
+            AssertionError: if the model has not been fitted, or if the
+                column count of X_predict differs from ``len(self.coef_)``
+                (only while assertions are enabled).
+        """
+        assert self.intercept_ is not None and self.coef_ is not None, \
+            "must fit before predict!"
+        assert X_predict.shape[1] == len(self.coef_), \
+            "the feature number of X_predict must be equal to X_train"
+
+        # Same ones-column augmentation as in fit_normal, so that _theta[0]
+        # contributes the intercept term.
+        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
+        return X_b.dot(self._theta)
+
+    def score(self, X_test, y_test):
+        """Score the model on X_test, y_test.
+
+        Returns ``r2_score(y_test, y_predict)`` using the helper imported
+        from ``.metrics`` (presumably the R^2 coefficient of determination;
+        the helper is defined outside this file).
+        """
+
+        y_predict = self.predict(X_test)
+        return r2_score(y_test, y_predict)
+
+    def __repr__(self):
+        # Fixed text; the model has no constructor parameters to display.
+        return "LinearRegression()"
diff --git a/05-Linear-Regression/07-Linear-Regression-in-scikit-learn/playML/SimpleLinearRegression.py b/05-Linear-Regression/07-Linear-Regression-in-scikit-learn/playML/SimpleLinearRegression.py
@@ -0,0 +1,47 @@
+import numpy as np
+from .metrics import r2_score
+
+
+class SimpleLinearRegression:
+    """Single-feature linear regression y = a_ * x + b_, fitted in closed form."""
+
+    def __init__(self):
+        """Create an unfitted Simple Linear Regression model."""
+        # Slope (a_) and intercept (b_); both stay None until fit() is called.
+        self.a_ = None
+        self.b_ = None
+
+    def fit(self, x_train, y_train):
+        """Train the model on the 1-D arrays x_train and y_train.
+
+        Sets ``a_`` to dot(x - mean(x), y - mean(y)) / dot(x - mean(x),
+        x - mean(x)) and ``b_`` to mean(y) - a_ * mean(x).
+
+        Returns:
+            self, so the call can be chained.
+
+        Raises:
+            AssertionError: if x_train is not 1-D or the two inputs differ
+                in length (only while assertions are enabled).
+        """
+        assert x_train.ndim == 1, \
+            "Simple Linear Regressor can only solve single feature training data."
+        assert len(x_train) == len(y_train), \
+            "the size of x_train must be equal to the size of y_train"
+
+        x_mean = np.mean(x_train)
+        y_mean = np.mean(y_train)
+
+        # NOTE(review): the denominator is zero when every x_train value is
+        # identical, which would presumably leave a_ as nan/inf instead of
+        # raising -- confirm whether callers need a guard.
+        self.a_ = (x_train - x_mean).dot(y_train - y_mean) / (x_train - x_mean).dot(x_train - x_mean)
+        self.b_ = y_mean - self.a_ * x_mean
+
+        return self
+
+    def predict(self, x_predict):
+        """Return the vector of predictions for the 1-D array x_predict.
+
+        Raises:
+            AssertionError: if x_predict is not 1-D or the model has not
+                been fitted (only while assertions are enabled).
+        """
+        # NOTE(review): this message says "training data" although the check
+        # is on prediction input; it is left unchanged because it is runtime text.
+        assert x_predict.ndim == 1, \
+            "Simple Linear Regressor can only solve single feature training data."
+        assert self.a_ is not None and self.b_ is not None, \
+            "must fit before predict!"
+
+        # Applies _predict to each element in turn.
+        # NOTE(review): looks equivalent to the vectorised expression
+        # self.a_ * x_predict + self.b_ -- confirm before replacing.
+        return np.array([self._predict(x) for x in x_predict])
+
+    def _predict(self, x_single):
+        """Return the prediction for the single value x_single."""
+        return self.a_ * x_single + self.b_
+
+    def score(self, X_test, y_test):
+        """Score the model on X_test, y_test.
+
+        Returns ``r2_score(y_test, y_predict)`` using the helper imported
+        from ``.metrics`` (presumably the R^2 coefficient of determination;
+        the helper is defined outside this file). Despite the capitalised
+        name, X_test is passed straight to ``predict`` and so must be 1-D.
+        """
+
+        y_predict = self.predict(X_test)
+        return r2_score(y_test, y_predict)
+
+    def __repr__(self):
+        # Fixed text; the model has no constructor parameters to display.
+        return "SimpleLinearRegression()"
diff --git a/05-Linear-Regression/07-Linear-Regression-in-scikit-learn/playML/__init__.py b/05-Linear-Regression/07-Linear-Regression-in-scikit-learn/playML/__init__.py