correct typos #6

Open
wants to merge 6 commits into master
5 changes: 4 additions & 1 deletion .gitignore
@@ -1,3 +1,6 @@
# Pycharm Project
.idea/

mxnet-bot/.DS_Store
# OSX Stuff
.DS_Store
mxnet-bot/.DS_Store
135 changes: 135 additions & 0 deletions mxnet-bot/PredictLabels/DataFetcher.py
@@ -0,0 +1,135 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# This script fetches GitHub issues into a json file
from __future__ import print_function
import os
import requests
import json
import re
import pandas as pd
import logging


class DataFetcher:

def __init__(self,
github_user=os.environ.get("github_user"),
github_oauth_token=os.environ.get("github_oauth_token"),
repo=os.environ.get("repo")):
"""
This DataFetcher serves to fetch issue data
Args:
github_user(str): the GitHub ID, e.g. "CathyZhang0822"
github_oauth_token(str): the GitHub OAuth token, used together with github_user for authentication
repo(str): the repo name
"""
self.github_user = github_user
self.github_oauth_token = github_oauth_token
self.repo = repo
self.auth = (self.github_user, self.github_oauth_token)
self.json_data = None

def cleanstr(self, raw_string, sub_string):
"""
This method replaces all non-alphanumeric characters in
raw_string with sub_string and lowercases the result
"""
clean = re.sub("[^0-9a-zA-Z]", sub_string, raw_string)
return clean.lower()

def count_pages(self, state):
"""
This method counts the total number of pages of issues/labels
state can be "open"/"closed"/"all"
"""
url = 'https://api.github.com/repos/%s/issues' % self.repo
response = requests.get(url, {'state': state},
auth=self.auth)
assert response.status_code == 200, "Authorization failed"
if "link" not in response.headers:
return 1
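# The GitHub 'link' response header contains the URL of the last page; after replacing
# punctuation with spaces, the third-to-last token of that header is the last page number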
return int(self.cleanstr(response.headers['link'], " ").split()[-3])

def fetch_issues(self, issue_nums):
"""
This method fetches issue data
issue_nums: a list of issue ids
Returns the issues' data as a pandas DataFrame
"""
assert issue_nums != [], "Empty Input!"
logging.info("Reading issues:{}".format(", ".join([str(num) for num in issue_nums])))
data = []
for number in issue_nums:
url = 'https://api.github.com/repos/' + self.repo + '/issues/' + str(number)
response = requests.get(url, auth=self.auth)
item = response.json()
assert 'title' in item, "Issue {} doesn't exist!".format(str(number))
data += [{'id': str(number), 'title': item['title'], 'body': item['body']}]
return pd.DataFrame(data)

def data2json(self, state, labels=None, other_labels=False):
"""
This method stores issues' data into a json file and returns the json file's name
state can be either "open"/"closed"/"all"
labels is a list of target labels we are interested in
other_labels can be either True or False
"""
assert state in set(['all', 'open', 'closed']), "Invalid State!"
logging.info("Reading {} issues..".format(state))
pages = self.count_pages(state)
data = []
for x in range(1, pages+1):
url = 'https://api.github.com/repos/' + self.repo + '/issues?page=' + str(x) \
+ '&per_page=30'
response = requests.get(url,
{'state': state,
'base': 'master',
'sort': 'created'},
auth=self.auth)
for item in response.json():
if "pull_request" in item:
continue
if "labels" in item:
issue_labels = list({label['name'] for label in item['labels']})
else:
continue
if labels is not None:
# fetch issue which has at least one target label
for label in labels:
if label in issue_labels:
if other_labels:
# besides target labels, we still want other labels
data += [{'id': item['number'],'title': item['title'], 'body': item['body'], 'labels': issue_labels}]
else:
# only record target labels
if label in {"Feature", "Call for Contribution", "Feature request"}:
label = "Feature"
data += [{'id': item['number'], 'title': item['title'], 'body': item['body'], 'labels': label}]
# with this break, we only pick up the first target label
break
else:
# fetch all issues
data += [{'id': item['number'], 'title': item['title'], 'body': item['body'], 'labels': issue_labels}]
self.json_data = data
s_labels = "_".join(labels) if labels is not None else "all_labels"
filename = "{}_data.json_{}".format(state, s_labels)
logging.info("Writing json file..")
with open(filename, 'w') as write_file:
json.dump(data, write_file)
logging.info("{} json file is ready!".format(filename))
return filename
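
For orientation, here is a minimal usage sketch of this class (the issue numbers and labels are hypothetical examples; `github_user`, `github_oauth_token` and `repo` are read from the environment):

```python
# Minimal sketch, assuming valid GitHub credentials are set in the environment
from DataFetcher import DataFetcher

fetcher = DataFetcher()
df = fetcher.fetch_issues([11919, 11924])     # hypothetical issue numbers -> DataFrame of id/title/body
json_file = fetcher.data2json("open", labels=["Bug", "Feature"], other_labels=False)
print(json_file)                              # e.g. "open_data.json_Bug_Feature"
```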
26 changes: 26 additions & 0 deletions mxnet-bot/PredictLabels/Dockerfile
@@ -0,0 +1,26 @@
FROM python:3.6.6

# Update packages
RUN apt-get update

# Install prerequisite for matplotlib
RUN apt-get -y install libxft-dev libfreetype6 libfreetype6-dev

# Bundle app source
COPY . /src

EXPOSE 8000
WORKDIR /src

# Install Python modules
RUN pip install -r requirements.txt

# Environment Variables
ENV github_user your_github_id
ENV github_oauth_token your_github_read_only_token
ENV repo repo_name

# Run it
ENTRYPOINT ["python", "application.py"]
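
For a local smoke test (the image name and credential values below are placeholders, not part of this PR, and it is assumed that application.py listens on the exposed port 8000), the image could be built and run with the environment variables overridden at run time:

```bash
# Hypothetical local run; replace the placeholder values with real ones
docker build -t label-predictor .
docker run -p 8000:8000 \
    -e github_user=your_github_id \
    -e github_oauth_token=your_read_only_token \
    -e repo=owner/repo_name \
    label-predictor
```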


7 changes: 7 additions & 0 deletions mxnet-bot/PredictLabels/Dockerrun.aws.json
@@ -0,0 +1,7 @@
{
"AWSEBDockerrunVersion": "1",
"Logging": "/tmp/sample-app",
"Image": {
"Update": "false"
}
}
137 changes: 137 additions & 0 deletions mxnet-bot/PredictLabels/Predictor.py
@@ -0,0 +1,137 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from sklearn.preprocessing import LabelEncoder
from SentenceParser import SentenceParser
from DataFetcher import DataFetcher
import numpy as np
import pickle
import re
import logging
import os


class Predictor:
# keywords will be used to apply rule-based algorithms
keywords = {"CI": ["ci", "ccache", "jenkins"],
"Flaky": ["flaky"],
"Gluon": ["gluon"],
"Cuda": ["cuda", "cudnn"],
"Scala": ["scala"],
"mkldnn": ["mkldnn, mkl"],
"ONNX": ["onnx"]}

def __init__(self):
"""
Predictor serves to apply rule-based and ML algorithms to predict labels
"""
self.tv = None
self.labels = None
self.clf = None

def reload(self, tmp_dir):
"""
This method loads the pickled vectorizer, classifier and labels from tmp_dir
"""
with open(os.path.join(tmp_dir.name,'Vectorizer.p'), "rb") as tv:
self.tv = pickle.load(tv)
with open(os.path.join(tmp_dir.name,'Classifier.p'), "rb") as clf:
self.clf = pickle.load(clf)
with open(os.path.join(tmp_dir.name,'Labels.p'), "rb") as labels:
self.labels = pickle.load(labels)

def tokenize(self, row):
"""
This method tokenizes a sentence into a set of unique words
Args:
row(string): a sentence
Return:
words(set): a set of lowercased words
"""
row = re.sub('[^a-zA-Z0-9]', ' ', row).lower()
words = set(row.split())
return words

def rule_based(self, issues):
"""
This method applies rule-based algorithms to predict labels
Args:
issues(list): a list of issue numbers
Return:
rule_based_predictions(list of lists): labels which satisfy rules
"""
DF = DataFetcher()
df_test = DF.fetch_issues(issues)
rule_based_predictions = []
for i in range(len(issues)):
# extract every issue's title
row = df_test.loc[i, 'title']
# apply rule-based algorithms
single_issue_predictions = []
if "feature request" in row.lower():
single_issue_predictions.append("Feature")
if "c++" in row.lower():
single_issue_predictions.append("C++")
tokens = self.tokenize(row)
for k, v in self.keywords.items():
for keyword in v:
if keyword in tokens:
single_issue_predictions.append(k)
rule_based_predictions.append(single_issue_predictions)
return rule_based_predictions

def ml_predict(self, issues, threshold=0.3):
"""
This method applies machine learning algorithms to predict labels
Args:
issues(list): a list of issue numbers
threshold(float): threshold of probability
Return:
ml_predictions(list of lists): predictions
"""
# step1: fetch data
DF = DataFetcher()
df_test = DF.fetch_issues(issues)
# step2: data cleaning
SP = SentenceParser()
SP.data = df_test
SP.clean_body('body', True, True)
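# 'title' is listed three times, presumably to weight the title more heavily than the body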
SP.merge_column(['title', 'title', 'title', 'body'], 'train')
test_text = SP.process_text('train', True, False, True)
# step3: word embedding
test_data_tfidf = self.tv.transform(test_text).toarray()
le = LabelEncoder()
le.fit_transform(self.labels)
# step4: classification
probs = self.clf.predict_proba(test_data_tfidf)
# pick the top 2 predictions that exceed the threshold
best_n = np.argsort(probs, axis=1)[:, -2:]
ml_predictions = []
for i in range(len(best_n)):
# INFO:Predictor:issue:11919,Performance:0.47353076240017744,Question:0.2440056213336274
logging.info("issue:{}, {}:{}, {}:{}".format(str(issues[i]), str(le.classes_[best_n[i][-1]]), str(probs[i][best_n[i][-1]]),
str(le.classes_[best_n[i][-2]]), str(probs[i][best_n[i][-2]])))
single_issue_predictions = [le.classes_[best_n[i][j]] for j in range(-1, -3, -1) if probs[i][best_n[i][j]] > threshold]
ml_predictions.append(single_issue_predictions)
return ml_predictions

def predict(self, issues):
# return predictions of both rule_base algorithms and machine learning methods
rule_based_predictions = self.rule_based(issues)
ml_predictions = self.ml_predict(issues)
predictions = [list(set(rule_based_predictions[i]+ml_predictions[i])) for i in range(len(ml_predictions))]
return predictions
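
For orientation, a minimal usage sketch (the issue number is hypothetical; GitHub credentials are assumed to be set in the environment, as for DataFetcher):

```python
# Minimal sketch, assuming valid GitHub credentials in the environment
from Predictor import Predictor

predictor = Predictor()
# Rule-based predictions need no trained model
print(predictor.rule_based([11919]))          # hypothetical issue number
# Full predict() additionally requires reload(tmp_dir) to be called first, where
# tmp_dir.name holds the pickled Vectorizer.p, Classifier.p and Labels.p
```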
25 changes: 25 additions & 0 deletions mxnet-bot/PredictLabels/README.md
@@ -0,0 +1,25 @@
# Elastic Beanstalk Web Server

A web server built on [AWS Elastic Beanstalk](https://aws.amazon.com/elasticbeanstalk/) that responds to GET/POST requests and maintains itself. It has 2 main features:
* Train models: it automatically retrains the machine learning models every 24 hours using the latest data.
* Predict labels: once it receives a GET/POST request with issue IDs, it sends predictions back (see the illustrative request below).
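
For illustration only (application.py is not part of this diff, so the route and payload shape below are assumptions rather than the actual API), a request might look like:

```bash
# Hypothetical request; the real route and payload are defined in application.py
curl -X POST http://<your-eb-environment-url>/predict \
     -H "Content-Type: application/json" \
     -d '{"issues": [11919, 11924]}'
```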

## Set up
*Make sure you are in this directory (`mxnet-bot/PredictLabels`).*
* Configure the Dockerfile: in `Dockerfile`, set the environment variables (last 3 lines) to your real `github_user`, `github_oauth_token` (read-only token) and `repo`.
* Open a terminal and run:
```bash
zip eb.zip application.py cron.yaml DataFetcher.py \
Dockerfile Dockerrun.aws.json plot_piechart.py Predictor.py SentenceParser.py Trainer.py \
requirements.txt stopwords.txt
```
This zips all needed files into `eb.zip`.
* Manually create a new Elastic Beanstalk application.
1. Go to AWS Elastic Beanstalk console, click ***Create New Application***. Fill in *Application Name* and *Description*, click ***Create***.
2. Under ***Select environment tier***, select ***Web server environment***, click ***Select***.
3. Under **Base configuration**, select **Preconfigured platform**. In its dropdown, select **Docker**. Then select ***Upload your code***, upload `eb.zip`.
4. Click ***Configure more options***. Under Instances, open the Instance type dropdown and select t2.large. Click ***Create Environment***. (No need to select a security group; EB will create one.)
5. It will take about 10 minutes to set up the environment.
6. Once the environment is set up, it will take 5-10 minutes to generate models.
7. Write down the URL (e.g. http://labelbot-env.pgc55xzpte.us-east-1.elasticbeanstalk.com).
