Chapter 04 section 03 completed.

98hq · Dec 5, 2017 · 72fda63 · 72fda63
1 parent 23967fa
commit 72fda63
Show file tree

Hide file tree

Showing 10 changed files with 1,657 additions and 223 deletions.
diff --git a/04-kNN/03-Test-Our-Algorithm/03-Test-Our-Algorithm.ipynb b/04-kNN/03-Test-Our-Algorithm/03-Test-Our-Algorithm.ipynb
diff --git a/04-kNN/03-Test-Our-Algorithm/playML/kNN.py b/04-kNN/03-Test-Our-Algorithm/playML/kNN.py
@@ -25,7 +25,7 @@ def fit(self, X_train, y_train):
 
     def predict(self, X_predict):
         """给定待预测数据集X_predict，返回表示X_predict的结果向量"""
-        assert self._X_train is not None and self._X_train is not None, \
+        assert self._X_train is not None and self._y_train is not None, \
                 "must fit before predict!"
         assert X_predict.shape[1] == self._X_train.shape[1], \
                 "the feature number of X_predict must be equal to X_train"
@@ -34,16 +34,17 @@ def predict(self, X_predict):
         return np.array(y_predict)
 
     def _predict(self, x):
-        """给定单个待预测数据x，返回x_predict的预测结果值"""
+        """给定单个待预测数据x，返回x的预测结果值"""
         assert x.shape[0] == self._X_train.shape[1], \
             "the feature number of x must be equal to X_train"
-        distances = [(sqrt(((x_train - x) ** 2).sum()), self._y_train[i])
-                     for i, x_train in enumerate(self._X_train)]
-        distances.sort()
 
-        topK_y = [pair[1] for pair in distances[:self.k]]
+        distances = [sqrt(np.sum((x_train - x) ** 2))
+                     for x_train in self._X_train]
+        nearest = np.argsort(distances)
 
+        topK_y = [self._y_train[i] for i in nearest[:self.k]]
         votes = Counter(topK_y)
+
         return votes.most_common(1)[0][0]
 
     def __repr__(self):

diff --git a/04-kNN/04-Hyper-Parameter-K/04-Hyper-Parameter-K.ipynb b/04-kNN/04-Hyper-Parameter-K/04-Hyper-Parameter-K.ipynb
@@ -9,7 +9,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 21,
    "metadata": {
     "collapsed": true
    },
@@ -22,7 +22,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
@@ -31,7 +31,7 @@
        "dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])"
       ]
      },
-     "execution_count": 2,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -43,7 +43,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
@@ -52,7 +52,7 @@
        "(1797, 64)"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -64,7 +64,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
@@ -73,7 +73,7 @@
        "(1797,)"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -85,7 +85,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 25,
    "metadata": {
     "collapsed": true
    },
@@ -98,16 +98,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "array([6, 8, 5, 8, 7, 6, 4, 0, 2, 6])"
+       "array([1, 4, 9, 2, 0, 9, 0, 6, 1, 1])"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -118,7 +118,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 27,
    "metadata": {
     "collapsed": true
    },
@@ -133,16 +133,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "0.9916434540389972"
+       "0.9805013927576601"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 28,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -160,16 +160,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "0.9916434540389972"
+       "0.9805013927576601"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 29,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -180,28 +180,6 @@
     "accuracy_score(y_test, y_predict)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0.99164345403899723"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from sklearn.metrics import accuracy_score\n",
-    "\n",
-    "accuracy_score(y_test, y_predict)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -211,16 +189,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "array([4, 6, 5, 8, 9, 2, 7, 1, 6, 4])"
+       "array([3, 9, 5, 1, 2, 9, 2, 4, 1, 1])"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 30,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -235,7 +213,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 31,
    "metadata": {
     "collapsed": true
    },
@@ -250,7 +228,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
@@ -259,7 +237,7 @@
        "0.98888888888888893"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 32,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -272,7 +250,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 33,
    "metadata": {},
    "outputs": [
     {
@@ -281,7 +259,7 @@
        "0.98888888888888893"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 33,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -299,15 +277,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 34,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "best_k = 1\n",
-      "best_score = 0.991666666667\n"
+      "best_k = 3\n",
+      "best_score = 0.988888888889\n"
      ]
     }
    ],

diff --git a/04-kNN/04-Hyper-Parameter-K/playML/kNN.py b/04-kNN/04-Hyper-Parameter-K/playML/kNN.py
@@ -25,7 +25,7 @@ def fit(self, X_train, y_train):
 
     def predict(self, X_predict):
         """给定待预测数据集X_predict，返回表示X_predict的结果向量"""
-        assert self._X_train is not None and self._X_train is not None, \
+        assert self._X_train is not None and self._y_train is not None, \
                 "must fit before predict!"
         assert X_predict.shape[1] == self._X_train.shape[1], \
                 "the feature number of X_predict must be equal to X_train"
@@ -34,16 +34,17 @@ def predict(self, X_predict):
         return np.array(y_predict)
 
     def _predict(self, x):
-        """给定单个待预测数据x，返回x_predict的预测结果值"""
+        """给定单个待预测数据x，返回x的预测结果值"""
         assert x.shape[0] == self._X_train.shape[1], \
             "the feature number of x must be equal to X_train"
-        distances = [(sqrt(((x_train - x) ** 2).sum()), self._y_train[i])
-                     for i, x_train in enumerate(self._X_train)]
-        distances.sort()
 
-        topK_y = [pair[1] for pair in distances[:self.k]]
+        distances = [sqrt(np.sum((x_train - x) ** 2))
+                     for x_train in self._X_train]
+        nearest = np.argsort(distances)
 
+        topK_y = [self._y_train[i] for i in nearest[:self.k]]
         votes = Counter(topK_y)
+
         return votes.most_common(1)[0][0]
 
     def __repr__(self):