From 5804b02efca5646c0161ab5bff8a95fa820d269b Mon Sep 17 00:00:00 2001 From: ted chang Date: Mon, 30 Dec 2019 06:59:37 -0800 Subject: [PATCH 1/7] Initial commit for Text-sentiment-classifier --- bert-text-classifier/README.md | 121 ++++++++++++ bert-text-classifier/examples/test.html | 14 ++ bert-text-classifier/examples/test.js | 2 + bert-text-classifier/package.json | 82 ++++++++ bert-text-classifier/rollup.config.js | 78 ++++++++ .../src/js/sentimentanalysis.ts | 51 +++++ .../src/server/sentimentanalysis.ts | 62 ++++++ bert-text-classifier/src/server/util.ts | 29 +++ bert-text-classifier/src/test.ts | 79 ++++++++ .../src/text-sentiment-classifier.ts | 62 ++++++ bert-text-classifier/src/tokenization.ts | 184 ++++++++++++++++++ bert-text-classifier/tsconfig.json | 27 +++ 12 files changed, 791 insertions(+) create mode 100644 bert-text-classifier/README.md create mode 100644 bert-text-classifier/examples/test.html create mode 100644 bert-text-classifier/examples/test.js create mode 100644 bert-text-classifier/package.json create mode 100644 bert-text-classifier/rollup.config.js create mode 100644 bert-text-classifier/src/js/sentimentanalysis.ts create mode 100644 bert-text-classifier/src/server/sentimentanalysis.ts create mode 100644 bert-text-classifier/src/server/util.ts create mode 100644 bert-text-classifier/src/test.ts create mode 100644 bert-text-classifier/src/text-sentiment-classifier.ts create mode 100644 bert-text-classifier/src/tokenization.ts create mode 100644 bert-text-classifier/tsconfig.json diff --git a/bert-text-classifier/README.md b/bert-text-classifier/README.md new file mode 100644 index 0000000..637cc64 --- /dev/null +++ b/bert-text-classifier/README.md @@ -0,0 +1,121 @@ +# MAX for TensorFlow.js: Text Sentiment Classifier + +This is a TensorFlow.js port of the [MAX Text Sentiment Classifier](https://developer.ibm.com/exchanges/models/all/max-text-sentiment-classifier/). This model is able to detect whether a text fragment leans towards a positive
or a negative sentiment. + +## Install + +### Browser + +```html + + +``` + +### Node.js + +``` +npm install --save @codait/text-sentiment-classifier +``` + +## Usage + +The complete examples for browser and Node.js environments are in the [`/examples`](https://github.com/CODAIT/max-tfjs-models/tree/master/text-sentiment-classifier/examples) directory. + +### Browser + +> **Note**: _When loaded in a browser, the global variable `textSentimentClassifier` will be available to access the API._ + +```javascript + +textSentimentClassifier + .predict("i like strawberries") + .then(prediction => { + console.log(prediction) + }); +``` + +### Node.js + +```javascript +const tc = require('@codait/text-sentiment-classifier'); +tc.predict("i like strawberries").then(res=>console.log(res)); //{ pos: 0.9981953501701355, neg: 0.0018045296892523766 } + +``` + +### API + +- **loadModel()** + + Loads the model files. + + Running in Node.js the first time will download the model assets locally under `/model` directory. The subsequent calls will load the model from the directory. + + Returns the TensorFlow.js model. + +- **processInput(text)** + + Processes the input text to the shape and format expected by the model. + + `text` - sentence to be processed. It should be a sentence with a period although this is not necessary. + + Returns a named tensor map that contains: + `{'segment_ids_1': Tensor of shape [128], + 'input_ids_1': Tensor of shape [128], + 'input_mask_1': Tensor of shape [128]}` + +- **runInference(inputFeatures)** + + Runs inference on the named tensor map passed. The output is a tensor that contains softmax of positive and negative percentages. + + `inputFeature` - a named tensor map representation of a text. + + Returns the inference results as a 1D tensor. + +- **processOutput(tensor)** + + Transform the inference output to a Json object. + + `tensor` - the model output from running inference. 
+ + Returns an object containing: `{neg: number, pos: number}` + + +- **predict(text)** + + Loads the model, processes the input text, runs inference, processes the inference output, and returns a prediction object. This is a convenience function to avoid having to call each of the functions (`loadModel`, `processInput`, `runInference`, `processOutput`) individually. + + `text` - sentence to be analyzed. It should be a sentence with a period although this is not necessary. + + Returns an object containing: `{neg: number, pos: number}` + +- **encode(text)** + + Tokenize the text as token ids using the BERT 32k vocabularies. + + `text` - sentence to be encoded. + + Returns an array of BERT token ids. + +- **idsToTokens(ids)** + + Transform the BERT token ids into tokens. + + `ids` - BERT token ids. + + Returns an array of BERT tokens. + +- **version** + + Returns the version + +## Model + +The model assets produced by converting the pre-trained model to the TensorFlow.js format can be found in the `/model` directory after loadModel is called in Node.js. 
+ + ## Resources + + - [MAX Text Sentiment Classifier](https://developer.ibm.com/exchanges/models/all/max-text-sentiment-classifier/) + + ## License + + [Apache-2.0](https://github.com/CODAIT/max-tfjs-models/blob/master/LICENSE) diff --git a/bert-text-classifier/examples/test.html b/bert-text-classifier/examples/test.html new file mode 100644 index 0000000..3a15c18 --- /dev/null +++ b/bert-text-classifier/examples/test.html @@ -0,0 +1,14 @@ + + + + text classifier + + + + + + + diff --git a/bert-text-classifier/examples/test.js b/bert-text-classifier/examples/test.js new file mode 100644 index 0000000..c53dfa1 --- /dev/null +++ b/bert-text-classifier/examples/test.js @@ -0,0 +1,2 @@ +const tc = require("../dist/src/max.sentimentclass.cjs.js"); +tc.predict("i like strawberries").then(res=>console.log(res)); diff --git a/bert-text-classifier/package.json b/bert-text-classifier/package.json new file mode 100644 index 0000000..f2c6545 --- /dev/null +++ b/bert-text-classifier/package.json @@ -0,0 +1,82 @@ +{ + "name": "@codait/text-sentiment-classifier", + "version": "0.1.0", + "description": "This model is able to detect whether a text fragment leans towards a positive or a negative sentiment.", + "main": "dist/src/max.sentimentclass.cjs.js", + "module": "dist/src/max.sentimentclass.es.js", + "jsdelivr": "dist/src/max.sentimentclass.js", + "unpkg": "dist/max.sentimentclass.js", + "scripts": { + "clean": "rimraf dist && mkdirp dist", + "rollup": "rollup --config rollup.config.js", + "uglify": "uglifyjs dist/max.sentimentclass.js -mc --source-map --output dist/max.sentimentclass.min.js", + "copy": "ncp model dist/model", + "build": "npm run clean && npm run rollup", + "test": "standard && npm run build && jasmine test/test.js", + "watch:js": "rollup --config rollup.config.js --watch", + "watch:ts": "tsc --watch", + "dev": "concurrently -c \"bgBlue.bold,bgMagenta.bold\" \"npm:watch:js\" \"npm:watch:ts\"" + }, + "files": [ + "dist", + "model", + "src" + ], + "author": "Ted Chang
(https://github.com/tedhtchang/)", + "license": "Apache-2.0", + "keywords": [ + "bert text sentiment classifier", + "model asset exchange", + "max", + "tensorflow.js", + "tensorflowjs", + "tensorflow", + "tf.js", + "tfjs", + "machine learning", + "Wordpiece tokenizer" + ], + "repository": { + "type": "git", + "url": "git+https://github.com/CODAIT/max-tfjs-models.git" + }, + "bugs": { + "url": "https://github.com/CODAIT/max-tfjs-models/issues" + }, + "homepage": "https://github.com/CODAIT/max-tfjs-models/tree/master/bert-text-classifier#readme", + "dependencies": { + "@tensorflow-models/universal-sentence-encoder": "^1.2.1", + "@tensorflow/tfjs": "^1.5.1", + "@tensorflow/tfjs-node": "^1.4.0", + "@types/express": "^4.16.1", + "@types/tar": "^4.0.3", + "express": "^4.17.1", + "node-fetch": "^2.6.0", + "numpy": "0.0.1", + "tar": "^5.0.5" + }, + "devDependencies": { + "@types/jasmine": "^3.5.0", + "@types/rollup-plugin-json": "^3.0.2", + "builtin-modules": "^3.1.0", + "concurrently": "^4.1.0", + "jasmine": "^3.5.0", + "jasmine-ts": "^0.3.0", + "mkdirp": "^0.5.1", + "ncp": "^2.0.0", + "rimraf": "^2.6.3", + "rollup": "^1.13.1", + "rollup-plugin-json": "^4.0.0", + "rollup-plugin-node-resolve": "^5.0.1", + "rollup-plugin-replace": "^2.2.0", + "rollup-plugin-typescript2": "^0.21.1", + "ts-node": "^8.5.4", + "typescript": "^3.7.3", + "unicode-12.1.0": "^0.8.0" + }, + "standard": { + "ignore": [ + "dist" + ] + } +} diff --git a/bert-text-classifier/rollup.config.js b/bert-text-classifier/rollup.config.js new file mode 100644 index 0000000..f1fbda1 --- /dev/null +++ b/bert-text-classifier/rollup.config.js @@ -0,0 +1,78 @@ +import node from 'rollup-plugin-node-resolve'; +import typescript from 'rollup-plugin-typescript2'; +import json from 'rollup-plugin-json'; +import builtins from 'builtin-modules'; +import replace from 'rollup-plugin-replace'; + +const jsonPlugin = json({ + include: './package.json', + preferConst: true, + indent: ' ', + compact: true, + namedExports: 
['version'] +}) + +export default[ + { + input: 'src/text-sentiment-classifier.ts', + output: [ + { + name: 'textSentimentClassifier', + file: 'dist/src/max.sentimentclass.js', + format: 'iife', + sourcemap: true + }, + { + name: 'textSentimentClassifier', + file: 'dist/src/max.sentimentclass.es.js', + format: 'es', + sourcemap: true + } + ], + plugins: [ + typescript({ + clean: true, + tsconfigOverride: { + compilerOptions: { + module: 'ES2015', + noUnusedLocals: false, + inlineSourceMap: false + } + } + }), + replace({ + 'server/sentimentanalysis': 'js/sentimentanalysis', + include: ['src/text-sentiment-classifier.ts'] + }), + jsonPlugin, + node(), + ], + external: builtins + }, + { + input: 'src/text-sentiment-classifier.ts', + output:[ + { + name: 'textSentimentClassifier', + file: 'dist/src/max.sentimentclass.cjs.js', + format: 'cjs', + sourcemap: true + }, + ], + plugins: [ + typescript({ + clean: true, + tsconfigOverride: { + compilerOptions: { + module: 'ES2015', + noUnusedLocals: false, + inlineSourceMap: false + } + } + }), + jsonPlugin, + node() + ], + external: builtins + } +] diff --git a/bert-text-classifier/src/js/sentimentanalysis.ts b/bert-text-classifier/src/js/sentimentanalysis.ts new file mode 100644 index 0000000..2d7f5b6 --- /dev/null +++ b/bert-text-classifier/src/js/sentimentanalysis.ts @@ -0,0 +1,51 @@ +import WordPieceTokenizer from "../tokenization"; +import * as tf from '@tensorflow/tfjs'; + +const vocabUrl = 'http://s3.us.cloud-object-storage.appdomain.cloud/bert-sentiment-tfjs/model/vocab.json' +const modelUrl = 'http://s3.us.cloud-object-storage.appdomain.cloud/bert-sentiment-tfjs/model/model.json'; + + +export default class SentimentAnalysis { + private _model: tf.GraphModel; + private _tokenizer: WordPieceTokenizer; + + public get tokenizer(): WordPieceTokenizer { + return this._tokenizer; + } + + public get model() : tf.GraphModel { + return this._model; + } + + async init(){ + if(! this.model) await this.loadModel(); + if(! 
this.tokenizer) await this.loadTokenizer(); + } + + async loadModel(){ + this._model = await tf.loadGraphModel( + modelUrl, {requestInit: {headers: {"origin": "localhost"}}}) + // console.log(`Model loaded from ${modelUrl}.`); + } + + async loadTokenizer(){ + this._tokenizer = new WordPieceTokenizer(true); + await this.tokenizer.init(vocabUrl); + // console.log("Tokenizer loaded.") + } + /** + * Classify a text input and return a json object with pos and neg + * sentiment percentages + */ + async analyzeText(text: string){ + return await this.inference(await this.tokenizer.inputFeature(text)); + } + + async inference(feature: tf.NamedTensorMap){ + if (! this.model) await this.loadModel(); + return tf.tidy(() => { + let pred: tf.Tensor = this.model.execute({...feature}, 'loss/Softmax') as tf.Tensor; + return pred.squeeze([0]); + }); + } +} diff --git a/bert-text-classifier/src/server/sentimentanalysis.ts b/bert-text-classifier/src/server/sentimentanalysis.ts new file mode 100644 index 0000000..0d10970 --- /dev/null +++ b/bert-text-classifier/src/server/sentimentanalysis.ts @@ -0,0 +1,62 @@ +import WordPieceTokenizer from "../tokenization"; +import * as tf from '@tensorflow/tfjs'; +import {fetchModel, untar} from './util'; + +const vocabUrl = 'http://s3.us.cloud-object-storage.appdomain.cloud/bert-sentiment-tfjs/model/vocab.json' +const modelArch = 'http://s3.us.cloud-object-storage.appdomain.cloud/bert-sentiment-tfjs/model.tgz'; + +export default class SentimentAnalysis { + private _model: tf.GraphModel; + private _tokenizer: WordPieceTokenizer; + + public get tokenizer(): WordPieceTokenizer { + return this._tokenizer; + } + + public get model() : tf.GraphModel { + return this._model; + } + + async init(){ + if(! this.model) await this.loadModel(); + if(! 
this.tokenizer) await this.loadTokenizer(); + } + + async loadModel(){ + const tfn = require('@tensorflow/tfjs-node'); + const fs = require('fs'); + const path = require('path'); + const modelTgz = path.join(`${__dirname}`, '..', '..', '/model/model.tgz'); + const modelJson = path.join(`${__dirname}`,'..', '..', '/model/model.json'); + // console.log(modelJson); + const modelDir = path.join(`${__dirname}`, '..', '..','/model'); + if(!fs.existsSync(modelJson)){ + await fetchModel(modelArch, modelDir); + await untar(modelTgz,modelDir); + } + const fileSystem = require('@tensorflow/tfjs-node/dist/io/file_system'); + this._model = await tfn.loadGraphModel(fileSystem.fileSystem(modelJson)); + // console.log(`Model loaded from ${modelJson}.`); + } + + async loadTokenizer(){ + this._tokenizer = new WordPieceTokenizer(true); + await this.tokenizer.init(vocabUrl); + // console.log("Tokenizer loaded.") + } + /** + * Classify a text input and return a json object with pos and neg + * sentiment percentages + */ + async analyzeText(text: string){ + return await this.inference(await this.tokenizer.inputFeature(text)); + } + + async inference(feature: tf.NamedTensorMap){ + if (! 
this.model) await this.loadModel(); + return tf.tidy(() => { + let pred: tf.Tensor = this.model.execute({...feature}, 'loss/Softmax') as tf.Tensor; + return pred.squeeze([0]); + }); + } +} diff --git a/bert-text-classifier/src/server/util.ts b/bert-text-classifier/src/server/util.ts new file mode 100644 index 0000000..674376a --- /dev/null +++ b/bert-text-classifier/src/server/util.ts @@ -0,0 +1,29 @@ +import fetch from 'node-fetch'; +import fs from 'fs'; +import path from 'path'; + + +export async function fetchModel(url: string, dir: string){ + const target = path.join(dir, 'model.tgz'); + const res = await fetch(url); + // console.log("in fetching mode model"); + const fileStream = fs.createWriteStream(target); + return new Promise((resolve, reject) => { + res.body.pipe(fileStream); + res.body.on("error", (err)=>{ + reject(err); + }); + fileStream.on("finish", ()=>{ + resolve(); + }) + }); +} +export async function untar(source: string, dest: string){ + const {extract} = require('tar'); + await extract({ + file: source, + cwd: dest, + strip: 1 + }) +} + diff --git a/bert-text-classifier/src/test.ts b/bert-text-classifier/src/test.ts new file mode 100644 index 0000000..036ddf7 --- /dev/null +++ b/bert-text-classifier/src/test.ts @@ -0,0 +1,79 @@ +import sa from '../src/text-sentiment-classifier'; +import * as tf from '@tensorflow/tfjs'; + +const posText = "i like strawberries"; +const negText = "i hate strawberries"; +const posTokenids = [ 1045, 2066, 13137, 20968 ]; +const posTokens = ['▁i', '▁like', '▁straw', 'berries']; +const resTensor = tf.tensor1d([0.4, 0.6], 'float32'); + +function genInput(){ + // Novel input lengths force recompilation which slows down inference, so it's a good idea to enforce a max length. + const EXAMPLE_INPUT_LENGTH = 15; + const MAX_INPUT_LENGTH = 128; + // This is the tokenization of '[CLS] Hello, my dog is cute. [SEP]'. 
+ const input_ids = tf.tensor1d([101, 7592, 1010, 2026, 3899, 2003, 10140, 1012, 102, 0, 0, 0, 0, 0, 0], 'int32') + .pad([[0,MAX_INPUT_LENGTH - EXAMPLE_INPUT_LENGTH]]).expandDims(); + const segment_ids = tf.tensor1d( + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'int32') + .pad([[0, MAX_INPUT_LENGTH - EXAMPLE_INPUT_LENGTH]]).expandDims(); + const input_mask = tf.tensor1d( + [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], 'int32') + .pad([[0, MAX_INPUT_LENGTH - EXAMPLE_INPUT_LENGTH]]).expandDims(0); + return {'segment_ids_1': segment_ids, + 'input_ids_1': input_ids, + 'input_mask_1': input_mask} +} + +describe('Bert Sentiment Classifier', () => { + it('version returns a valid version number', () => { + expect(sa.version).toMatch(/(\d+)\.(\d+)\.(\d+)/); + }); + + it('encode returns correct the coresponding token ids', async () => { + const res = await sa.encode(posText); + expect(res).toEqual(posTokenids); + }); + + it('runInference takes proper named tensor map and returns a tensor', async () => { + const res = await sa.runInference(genInput()); + expect(res).toBeInstanceOf(tf.Tensor); + }); + + it('idsToTokens convert token ids back to tokens', async () => { + const res = await sa.idsToTokens(posTokenids); + expect(res).toEqual(posTokens); + }); + + it('processOutput convert a result Tensor to a Json object', () => { + const res = sa.processOutput(resTensor); + expect(res.neg).toBeInstanceOf(Number); + expect(res.pos).toBeInstanceOf(Number); + }) + + it('processInput convert text to input for inference', async () => { + const res = await sa.processInput(posText); + expect(res.input_ids_1.shape).toEqual([1,128]); + expect(res.segment_ids_1.shape).toEqual([1,128]); + expect(res.input_mask_1.shape).toEqual([1,128]); + }); + beforeEach(function() { + // var originalTimeout = jasmine.DEFAULT_TIMEOUT_INTERVAL; + jasmine.DEFAULT_TIMEOUT_INTERVAL = 60000; + }); + + it('predict() leans towards pos prediction', async () => { + const res = await sa.predict(posText); + 
expect(res.pos).toBeGreaterThan(res.neg); + }); + + it('predict() leans towards neg prediction', async () => { + const res = await sa.predict(negText); + expect(res.neg).toBeGreaterThan(res.pos); + }); + + it('The sum of neg and pos of prediction should be close to 1', async () => { + const res = await sa.predict(negText); + expect(res.neg + res.pos).toBeCloseTo(1); + }); +}); diff --git a/bert-text-classifier/src/text-sentiment-classifier.ts b/bert-text-classifier/src/text-sentiment-classifier.ts new file mode 100644 index 0000000..dc36a31 --- /dev/null +++ b/bert-text-classifier/src/text-sentiment-classifier.ts @@ -0,0 +1,62 @@ +import * as tf from "@tensorflow/tfjs"; +import SentimentAnalysis from "./server/sentimentanalysis"; +import packageJson from "../package.json"; + +export interface SentRes{ + neg: number, + pos: number +} + +async function processInput(text: string): Promise{ + const sa = new SentimentAnalysis(); + await sa.loadTokenizer() + return await sa.tokenizer.inputFeature(text); +} + +async function loadModel(): Promise { + const sa = new SentimentAnalysis(); + await sa.loadModel() + return sa.model; +} + +async function runInference(feature: tf.NamedTensorMap): Promise{ + const sa = new SentimentAnalysis(); + await sa.init() + return await sa.inference(feature); +} + +function processOutput(res: tf.Tensor): SentRes{ + const arr = res.arraySync() as number[]; + return {"pos": arr[0], "neg": arr[1]}; +} + +async function predict(text: string): Promise{ + const sa = new SentimentAnalysis(); + await sa.init(); + let res = await sa.analyzeText(text); + return processOutput(res); +} + +async function encode(text: string): Promise{ + const sa = new SentimentAnalysis(); + await sa.loadTokenizer() + return await sa.tokenizer.tokenize(text); +} + +async function idsToTokens(ids: number[]){ + const sa = new SentimentAnalysis(); + await sa.loadTokenizer(); + return sa.tokenizer.convertIdsToTokens(ids); +} + + +export default { + processInput, + loadModel, + 
runInference, + processOutput, + predict, + encode, + idsToTokens, + version: packageJson.version +} diff --git a/bert-text-classifier/src/tokenization.ts b/bert-text-classifier/src/tokenization.ts new file mode 100644 index 0000000..9680c81 --- /dev/null +++ b/bert-text-classifier/src/tokenization.ts @@ -0,0 +1,184 @@ +import * as use from '@tensorflow-models/universal-sentence-encoder'; +import * as tf from '@tensorflow/tfjs'; + +function isPunctuation(cp: number):boolean { + // Checks a cp is a punctuation character or not. + return (punctuations.indexOf(cp) !== -1); +} + +function runStripAccents(text: string){ + // strips accent marks from text + text = text.normalize("NFD"); + nsmarks.forEach((cp: number) => {text = text.replace(String.fromCodePoint(cp), "")}); + return text; +} + +function isWhiteSpace(cp: number): boolean{ + // \t, \n, and \r are technically control characters but we treat them + // as whitespace since they are generally considered as such. + if (cp === 32 || cp === 9 || cp === 10 || cp === 13) { + return true; + } + if (spcsep.indexOf(cp) != -1){ + return true; + } + return false; +} + +function isControl(cp: number): boolean{ + // "\t" "\n" "\r" are technically control characters but we count them as whitespace + // characters. + if (ctlchar.indexOf(cp) !== -1 && cp !== 9 && cp !== 10 && cp !== 13){ + return true; + } + return false; +} + +function runSplitOnPunctuation(text: string){ + // Splits punctuation with a space on a piece of text + // e.g.: + // "abc?" -> "abc ?" + // "abc?def" -> "abc ? def" + // "abc??def" -> abc ? ? def" + const output = []; + let preCodePoint = -1; + let preCodePointIsPunc = false; + for (let i = 0; i < text.length; i++){ + // current cp is a punctuation AND previous codePoint is not a space + // e.g abc? 
+ const codePoint = text.charCodeAt(i); + const isPuncCurrent = isPunctuation(codePoint); + if (isPuncCurrent && preCodePoint !== 32) { + output.push(32); + // previous cp is not space AND previous codePoint is punctuation + // e.g. abc?? + } else if (preCodePointIsPunc && codePoint !== 32){ + output.push(32); + } + output.push(codePoint); + preCodePoint = codePoint; + preCodePointIsPunc = isPuncCurrent; + } + return String.fromCodePoint(...output); +} + +function cleanText(text: string){ + //Performs invalid character removal and whitespace cleanup on text. + const output = []; + for (let i = 0; i < text.length; i++){ + let cp = text.charCodeAt(i); + if (cp === 0 || cp === 65533 || isControl(cp)){ + continue; + } + if (isWhiteSpace(cp)){ + output.push(32); + } + else { + output.push(cp); + } + } + return String.fromCharCode(...output); +} + +export default class WordPieceTokenizer{ + //Runs basic tokenization (punctuation splitting, lower casing, etc.). + private doLowerCase: boolean; + tokenizer: use.Tokenizer; + clsId: number; + sepId: number; + + constructor(doLowerCase: boolean){ + this.doLowerCase = doLowerCase; + } + + async init(pathToVocabulary: string, ) { + await this.loadTokenizer(pathToVocabulary); + this.clsId = this.convertTokenToId('[CLS]')[0]; + this.sepId = this.convertTokenToId('[SEP]')[0]; + } + + async tokenize(text: string){ + text = cleanText(text); + text = runSplitOnPunctuation(text); + text = runStripAccents(text); + if (this.doLowerCase){ + text = text.toLowerCase() + } + return this.tokenizer.encode(text); + } + async loadTokenizer(pathToVocabulary: string){ + this.tokenizer = await use.loadTokenizer(pathToVocabulary); + //console.log("Loaded Tokenizer."); + } + + convertIdsToTokens(ids: number[]){ + let tokens: string[] = []; + ids.forEach( id => {tokens.push(this.tokenizer.vocabulary[id][0])}); + return tokens; + } + + convertTokenToId(token: string){ + //convert a token directly to token ID without any pre processing + return 
this.tokenizer.encode(token); + } + + async inputFeature(text: string){ + const singleInput = await this.convertSingleExample(text); + let inputIds = tf.tensor1d(singleInput.inputIds, 'int32').expandDims(0); + let inputMask = tf.tensor1d(singleInput.inputMask, 'int32').expandDims(0); + let segmentIds = tf.tensor1d(singleInput.segmentIds, 'int32').expandDims(0); + return {"input_ids_1": inputIds, "input_mask_1": inputMask, "segment_ids_1": segmentIds}; + } + + async convertSingleExample(text: string){ + // converts single example to feature input. This is derived from: + // https://github.com/google-research/bert/blob/88a817c37f788702a363ff935fd173b6dc6ac0d6/run_classifier.py#L377-L476 + + let inputIds: number[] = []; + let inputMask: number[] = []; + let segmentIds: number[] = []; + const tokenIds = await this.tokenize(text); + const maxSeqLength = 128; + + inputIds.push(this.clsId) + inputMask.push(1); + segmentIds.push(0); + + inputIds.push(...tokenIds); + tokenIds.forEach(id => { + inputMask.push(1); + segmentIds.push(0); + }); + + inputIds.push(this.sepId) + inputMask.push(1); + segmentIds.push(0); + + // pad with 0 up to the maxSeqLength + const numTokens = inputIds.length + for (let i = 0; i < maxSeqLength - numTokens; i++){ + inputIds.push(0); + inputMask.push(0); + segmentIds.push(0); + } + // console.log('input_ids: ', inputIds); + // console.log('input_mask: ', inputMask); + // console.log('segmentIds: ', segmentIds); + // console.log('tokens: ', this.convertIdsToTokens(inputIds)); + return {inputIds, segmentIds, inputMask}; + } +} + + +// Unicode code points are extracted from repo below which is intended for node.js not client js. 
+// https://github.com/mathiasbynens/unicode-12.1.0/blob/master/General_Category/Nonspacing_Mark/code-points.js +var nsmarks = [768,769,770,771,772,773,774,775,776,777,778,779,780,781,782,783,784,785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,804,805,806,807,808,809,810,811,812,813,814,815,816,817,818,819,820,821,822,823,824,825,826,827,828,829,830,831,832,833,834,835,836,837,838,839,840,841,842,843,844,845,846,847,848,849,850,851,852,853,854,855,856,857,858,859,860,861,862,863,864,865,866,867,868,869,870,871,872,873,874,875,876,877,878,879,1155,1156,1157,1158,1159,1425,1426,1427,1428,1429,1430,1431,1432,1433,1434,1435,1436,1437,1438,1439,1440,1441,1442,1443,1444,1445,1446,1447,1448,1449,1450,1451,1452,1453,1454,1455,1456,1457,1458,1459,1460,1461,1462,1463,1464,1465,1466,1467,1468,1469,1471,1473,1474,1476,1477,1479,1552,1553,1554,1555,1556,1557,1558,1559,1560,1561,1562,1611,1612,1613,1614,1615,1616,1617,1618,1619,1620,1621,1622,1623,1624,1625,1626,1627,1628,1629,1630,1631,1648,1750,1751,1752,1753,1754,1755,1756,1759,1760,1761,1762,1763,1764,1767,1768,1770,1771,1772,1773,1809,1840,1841,1842,1843,1844,1845,1846,1847,1848,1849,1850,1851,1852,1853,1854,1855,1856,1857,1858,1859,1860,1861,1862,1863,1864,1865,1866,1958,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968,2027,2028,2029,2030,2031,2032,2033,2034,2035,2045,2070,2071,2072,2073,2075,2076,2077,2078,2079,2080,2081,2082,2083,2085,2086,2087,2089,2090,2091,2092,2093,2137,2138,2139,2259,2260,2261,2262,2263,2264,2265,2266,2267,2268,2269,2270,2271,2272,2273,2275,2276,2277,2278,2279,2280,2281,2282,2283,2284,2285,2286,2287,2288,2289,2290,2291,2292,2293,2294,2295,2296,2297,2298,2299,2300,2301,2302,2303,2304,2305,2306,2362,2364,2369,2370,2371,2372,2373,2374,2375,2376,2381,2385,2386,2387,2388,2389,2390,2391,2402,2403,2433,2492,2497,2498,2499,2500,2509,2530,2531,2558,2561,2562,2620,2625,2626,2631,2632,2635,2636,2637,2641,2672,2673,2677,2689,2690,2748,2753,2754,2755,2756,2757,2759,2760,2765,2786,2787
,2810,2811,2812,2813,2814,2815,2817,2876,2879,2881,2882,2883,2884,2893,2902,2914,2915,2946,3008,3021,3072,3076,3134,3135,3136,3142,3143,3144,3146,3147,3148,3149,3157,3158,3170,3171,3201,3260,3263,3270,3276,3277,3298,3299,3328,3329,3387,3388,3393,3394,3395,3396,3405,3426,3427,3530,3538,3539,3540,3542,3633,3636,3637,3638,3639,3640,3641,3642,3655,3656,3657,3658,3659,3660,3661,3662,3761,3764,3765,3766,3767,3768,3769,3770,3771,3772,3784,3785,3786,3787,3788,3789,3864,3865,3893,3895,3897,3953,3954,3955,3956,3957,3958,3959,3960,3961,3962,3963,3964,3965,3966,3968,3969,3970,3971,3972,3974,3975,3981,3982,3983,3984,3985,3986,3987,3988,3989,3990,3991,3993,3994,3995,3996,3997,3998,3999,4000,4001,4002,4003,4004,4005,4006,4007,4008,4009,4010,4011,4012,4013,4014,4015,4016,4017,4018,4019,4020,4021,4022,4023,4024,4025,4026,4027,4028,4038,4141,4142,4143,4144,4146,4147,4148,4149,4150,4151,4153,4154,4157,4158,4184,4185,4190,4191,4192,4209,4210,4211,4212,4226,4229,4230,4237,4253,4957,4958,4959,5906,5907,5908,5938,5939,5940,5970,5971,6002,6003,6068,6069,6071,6072,6073,6074,6075,6076,6077,6086,6089,6090,6091,6092,6093,6094,6095,6096,6097,6098,6099,6109,6155,6156,6157,6277,6278,6313,6432,6433,6434,6439,6440,6450,6457,6458,6459,6679,6680,6683,6742,6744,6745,6746,6747,6748,6749,6750,6752,6754,6757,6758,6759,6760,6761,6762,6763,6764,6771,6772,6773,6774,6775,6776,6777,6778,6779,6780,6783,6832,6833,6834,6835,6836,6837,6838,6839,6840,6841,6842,6843,6844,6845,6912,6913,6914,6915,6964,6966,6967,6968,6969,6970,6972,6978,7019,7020,7021,7022,7023,7024,7025,7026,7027,7040,7041,7074,7075,7076,7077,7080,7081,7083,7084,7085,7142,7144,7145,7149,7151,7152,7153,7212,7213,7214,7215,7216,7217,7218,7219,7222,7223,7376,7377,7378,7380,7381,7382,7383,7384,7385,7386,7387,7388,7389,7390,7391,7392,7394,7395,7396,7397,7398,7399,7400,7405,7412,7416,7417,7616,7617,7618,7619,7620,7621,7622,7623,7624,7625,7626,7627,7628,7629,7630,7631,7632,7633,7634,7635,7636,7637,7638,7639,7640,7641,7642,7643,7644,7645,7646,7647,7648,7649
,7650,7651,7652,7653,7654,7655,7656,7657,7658,7659,7660,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670,7671,7672,7673,7675,7676,7677,7678,7679,8400,8401,8402,8403,8404,8405,8406,8407,8408,8409,8410,8411,8412,8417,8421,8422,8423,8424,8425,8426,8427,8428,8429,8430,8431,8432,11503,11504,11505,11647,11744,11745,11746,11747,11748,11749,11750,11751,11752,11753,11754,11755,11756,11757,11758,11759,11760,11761,11762,11763,11764,11765,11766,11767,11768,11769,11770,11771,11772,11773,11774,11775,12330,12331,12332,12333,12441,12442,42607,42612,42613,42614,42615,42616,42617,42618,42619,42620,42621,42654,42655,42736,42737,43010,43014,43019,43045,43046,43204,43205,43232,43233,43234,43235,43236,43237,43238,43239,43240,43241,43242,43243,43244,43245,43246,43247,43248,43249,43263,43302,43303,43304,43305,43306,43307,43308,43309,43335,43336,43337,43338,43339,43340,43341,43342,43343,43344,43345,43392,43393,43394,43443,43446,43447,43448,43449,43452,43453,43493,43561,43562,43563,43564,43565,43566,43569,43570,43573,43574,43587,43596,43644,43696,43698,43699,43700,43703,43704,43710,43711,43713,43756,43757,43766,44005,44008,44013,64286,65024,65025,65026,65027,65028,65029,65030,65031,65032,65033,65034,65035,65036,65037,65038,65039,65056,65057,65058,65059,65060,65061,65062,65063,65064,65065,65066,65067,65068,65069,65070,65071,66045,66272,66422,66423,66424,66425,66426,68097,68098,68099,68101,68102,68108,68109,68110,68111,68152,68153,68154,68159,68325,68326,68900,68901,68902,68903,69446,69447,69448,69449,69450,69451,69452,69453,69454,69455,69456,69633,69688,69689,69690,69691,69692,69693,69694,69695,69696,69697,69698,69699,69700,69701,69702,69759,69760,69761,69811,69812,69813,69814,69817,69818,69888,69889,69890,69927,69928,69929,69930,69931,69933,69934,69935,69936,69937,69938,69939,69940,70003,70016,70017,70070,70071,70072,70073,70074,70075,70076,70077,70078,70089,70090,70091,70092,70191,70192,70193,70196,70198,70199,70206,70367,70371,70372,70373,70374,70375,70376,70377,70378,70400,70401,70459,70
460,70464,70502,70503,70504,70505,70506,70507,70508,70512,70513,70514,70515,70516,70712,70713,70714,70715,70716,70717,70718,70719,70722,70723,70724,70726,70750,70835,70836,70837,70838,70839,70840,70842,70847,70848,70850,70851,71090,71091,71092,71093,71100,71101,71103,71104,71132,71133,71219,71220,71221,71222,71223,71224,71225,71226,71229,71231,71232,71339,71341,71344,71345,71346,71347,71348,71349,71351,71453,71454,71455,71458,71459,71460,71461,71463,71464,71465,71466,71467,71727,71728,71729,71730,71731,71732,71733,71734,71735,71737,71738,72148,72149,72150,72151,72154,72155,72160,72193,72194,72195,72196,72197,72198,72199,72200,72201,72202,72243,72244,72245,72246,72247,72248,72251,72252,72253,72254,72263,72273,72274,72275,72276,72277,72278,72281,72282,72283,72330,72331,72332,72333,72334,72335,72336,72337,72338,72339,72340,72341,72342,72344,72345,72752,72753,72754,72755,72756,72757,72758,72760,72761,72762,72763,72764,72765,72767,72850,72851,72852,72853,72854,72855,72856,72857,72858,72859,72860,72861,72862,72863,72864,72865,72866,72867,72868,72869,72870,72871,72874,72875,72876,72877,72878,72879,72880,72882,72883,72885,72886,73009,73010,73011,73012,73013,73014,73018,73020,73021,73023,73024,73025,73026,73027,73028,73029,73031,73104,73105,73109,73111,73459,73460,92912,92913,92914,92915,92916,92976,92977,92978,92979,92980,92981,92982,94031,94095,94096,94097,94098,113821,113822,119143,119144,119145,119163,119164,119165,119166,119167,119168,119169,119170,119173,119174,119175,119176,119177,119178,119179,119210,119211,119212,119213,119362,119363,119364,121344,121345,121346,121347,121348,121349,121350,121351,121352,121353,121354,121355,121356,121357,121358,121359,121360,121361,121362,121363,121364,121365,121366,121367,121368,121369,121370,121371,121372,121373,121374,121375,121376,121377,121378,121379,121380,121381,121382,121383,121384,121385,121386,121387,121388,121389,121390,121391,121392,121393,121394,121395,121396,121397,121398,121403,121404,121405,121406,121407,121408,121409
,121410,121411,121412,121413,121414,121415,121416,121417,121418,121419,121420,121421,121422,121423,121424,121425,121426,121427,121428,121429,121430,121431,121432,121433,121434,121435,121436,121437,121438,121439,121440,121441,121442,121443,121444,121445,121446,121447,121448,121449,121450,121451,121452,121461,121476,121499,121500,121501,121502,121503,121505,121506,121507,121508,121509,121510,121511,121512,121513,121514,121515,121516,121517,121518,121519,122880,122881,122882,122883,122884,122885,122886,122888,122889,122890,122891,122892,122893,122894,122895,122896,122897,122898,122899,122900,122901,122902,122903,122904,122907,122908,122909,122910,122911,122912,122913,122915,122916,122918,122919,122920,122921,122922,123184,123185,123186,123187,123188,123189,123190,123628,123629,123630,123631,125136,125137,125138,125139,125140,125141,125142,125252,125253,125254,125255,125256,125257,125258,917760,917761,917762,917763,917764,917765,917766,917767,917768,917769,917770,917771,917772,917773,917774,917775,917776,917777,917778,917779,917780,917781,917782,917783,917784,917785,917786,917787,917788,917789,917790,917791,917792,917793,917794,917795,917796,917797,917798,917799,917800,917801,917802,917803,917804,917805,917806,917807,917808,917809,917810,917811,917812,917813,917814,917815,917816,917817,917818,917819,917820,917821,917822,917823,917824,917825,917826,917827,917828,917829,917830,917831,917832,917833,917834,917835,917836,917837,917838,917839,917840,917841,917842,917843,917844,917845,917846,917847,917848,917849,917850,917851,917852,917853,917854,917855,917856,917857,917858,917859,917860,917861,917862,917863,917864,917865,917866,917867,917868,917869,917870,917871,917872,917873,917874,917875,917876,917877,917878,917879,917880,917881,917882,917883,917884,917885,917886,917887,917888,917889,917890,917891,917892,917893,917894,917895,917896,917897,917898,917899,917900,917901,917902,917903,917904,917905,917906,917907,917908,917909,917910,917911,917912,917913,917914,917915,917916,9179
17,917918,917919,917920,917921,917922,917923,917924,917925,917926,917927,917928,917929,917930,917931,917932,917933,917934,917935,917936,917937,917938,917939,917940,917941,917942,917943,917944,917945,917946,917947,917948,917949,917950,917951,917952,917953,917954,917955,917956,917957,917958,917959,917960,917961,917962,917963,917964,917965,917966,917967,917968,917969,917970,917971,917972,917973,917974,917975,917976,917977,917978,917979,917980,917981,917982,917983,917984,917985,917986,917987,917988,917989,917990,917991,917992,917993,917994,917995,917996,917997,917998,917999]; + +// https://github.com/mathiasbynens/unicode-12.1.0/blob/master/General_Category/Punctuation/code-points.js +var punctuations = [33,34,35,37,38,39,40,41,42,44,45,46,47,58,59,63,64,91,92,93,95,123,125,161,167,171,182,183,187,191,894,903,1370,1371,1372,1373,1374,1375,1417,1418,1470,1472,1475,1478,1523,1524,1545,1546,1548,1549,1563,1566,1567,1642,1643,1644,1645,1748,1792,1793,1794,1795,1796,1797,1798,1799,1800,1801,1802,1803,1804,1805,2039,2040,2041,2096,2097,2098,2099,2100,2101,2102,2103,2104,2105,2106,2107,2108,2109,2110,2142,2404,2405,2416,2557,2678,2800,3191,3204,3572,3663,3674,3675,3844,3845,3846,3847,3848,3849,3850,3851,3852,3853,3854,3855,3856,3857,3858,3860,3898,3899,3900,3901,3973,4048,4049,4050,4051,4052,4057,4058,4170,4171,4172,4173,4174,4175,4347,4960,4961,4962,4963,4964,4965,4966,4967,4968,5120,5742,5787,5788,5867,5868,5869,5941,5942,6100,6101,6102,6104,6105,6106,6144,6145,6146,6147,6148,6149,6150,6151,6152,6153,6154,6468,6469,6686,6687,6816,6817,6818,6819,6820,6821,6822,6824,6825,6826,6827,6828,6829,7002,7003,7004,7005,7006,7007,7008,7164,7165,7166,7167,7227,7228,7229,7230,7231,7294,7295,7360,7361,7362,7363,7364,7365,7366,7367,7379,8208,8209,8210,8211,8212,8213,8214,8215,8216,8217,8218,8219,8220,8221,8222,8223,8224,8225,8226,8227,8228,8229,8230,8231,8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255,8256,8257,8258,8259,8261,8262,8263,8264,8265,8266,8267,826
8,8269,8270,8271,8272,8273,8275,8276,8277,8278,8279,8280,8281,8282,8283,8284,8285,8286,8317,8318,8333,8334,8968,8969,8970,8971,9001,9002,10088,10089,10090,10091,10092,10093,10094,10095,10096,10097,10098,10099,10100,10101,10181,10182,10214,10215,10216,10217,10218,10219,10220,10221,10222,10223,10627,10628,10629,10630,10631,10632,10633,10634,10635,10636,10637,10638,10639,10640,10641,10642,10643,10644,10645,10646,10647,10648,10712,10713,10714,10715,10748,10749,11513,11514,11515,11516,11518,11519,11632,11776,11777,11778,11779,11780,11781,11782,11783,11784,11785,11786,11787,11788,11789,11790,11791,11792,11793,11794,11795,11796,11797,11798,11799,11800,11801,11802,11803,11804,11805,11806,11807,11808,11809,11810,11811,11812,11813,11814,11815,11816,11817,11818,11819,11820,11821,11822,11824,11825,11826,11827,11828,11829,11830,11831,11832,11833,11834,11835,11836,11837,11838,11839,11840,11841,11842,11843,11844,11845,11846,11847,11848,11849,11850,11851,11852,11853,11854,11855,12289,12290,12291,12296,12297,12298,12299,12300,12301,12302,12303,12304,12305,12308,12309,12310,12311,12312,12313,12314,12315,12316,12317,12318,12319,12336,12349,12448,12539,42238,42239,42509,42510,42511,42611,42622,42738,42739,42740,42741,42742,42743,43124,43125,43126,43127,43214,43215,43256,43257,43258,43260,43310,43311,43359,43457,43458,43459,43460,43461,43462,43463,43464,43465,43466,43467,43468,43469,43486,43487,43612,43613,43614,43615,43742,43743,43760,43761,44011,64830,64831,65040,65041,65042,65043,65044,65045,65046,65047,65048,65049,65072,65073,65074,65075,65076,65077,65078,65079,65080,65081,65082,65083,65084,65085,65086,65087,65088,65089,65090,65091,65092,65093,65094,65095,65096,65097,65098,65099,65100,65101,65102,65103,65104,65105,65106,65108,65109,65110,65111,65112,65113,65114,65115,65116,65117,65118,65119,65120,65121,65123,65128,65130,65131,65281,65282,65283,65285,65286,65287,65288,65289,65290,65292,65293,65294,65295,65306,65307,65311,65312,65339,65340,65341,65343,65371,65373,65375,65376,65377,653
78,65379,65380,65381,65792,65793,65794,66463,66512,66927,67671,67871,67903,68176,68177,68178,68179,68180,68181,68182,68183,68184,68223,68336,68337,68338,68339,68340,68341,68342,68409,68410,68411,68412,68413,68414,68415,68505,68506,68507,68508,69461,69462,69463,69464,69465,69703,69704,69705,69706,69707,69708,69709,69819,69820,69822,69823,69824,69825,69952,69953,69954,69955,70004,70005,70085,70086,70087,70088,70093,70107,70109,70110,70111,70200,70201,70202,70203,70204,70205,70313,70731,70732,70733,70734,70735,70747,70749,70854,71105,71106,71107,71108,71109,71110,71111,71112,71113,71114,71115,71116,71117,71118,71119,71120,71121,71122,71123,71124,71125,71126,71127,71233,71234,71235,71264,71265,71266,71267,71268,71269,71270,71271,71272,71273,71274,71275,71276,71484,71485,71486,71739,72162,72255,72256,72257,72258,72259,72260,72261,72262,72346,72347,72348,72350,72351,72352,72353,72354,72769,72770,72771,72772,72773,72816,72817,73463,73464,73727,74864,74865,74866,74867,74868,92782,92783,92917,92983,92984,92985,92986,92987,92996,93847,93848,93849,93850,94178,113823,121479,121480,121481,121482,121483,125278,125279]; + +// https://github.com/mathiasbynens/unicode-12.1.0/blob/master/General_Category/Control/code-points.js +var ctlchar = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159]; + +// https://github.com/mathiasbynens/unicode-12.1.0/blob/master/General_Category/Space_Separator/code-points.js +var spcsep = [32,160,5760,8192,8193,8194,8195,8196,8197,8198,8199,8200,8201,8202,8239,8287,12288]; diff --git a/bert-text-classifier/tsconfig.json b/bert-text-classifier/tsconfig.json new file mode 100644 index 0000000..42bbfe3 --- /dev/null +++ b/bert-text-classifier/tsconfig.json @@ -0,0 +1,27 @@ +{ + "compilerOptions": { + "resolveJsonModule": true, + "esModuleInterop": true, + "module": "commonjs", + 
"moduleResolution": "node", + "noImplicitAny": true, + "sourceMap": true, + "removeComments": true, + "preserveConstEnums": true, + "declaration": true, + "target": "es5", + "lib": ["es2015", "dom"], + "outDir": "./dist", + "noUnusedLocals": true, + "noImplicitReturns": true, + "noImplicitThis": true, + "alwaysStrict": true, + "noUnusedParameters": false, + "pretty": true, + "noFallthroughCasesInSwitch": true, + "allowUnreachableCode": false, + "experimentalDecorators": true, + "skipLibCheck": true + }, + "include": ["src/**/*"] +} From 5c1566023f8f97bfa085e290ef5d1cc16efa8e0f Mon Sep 17 00:00:00 2001 From: Ted Chang Date: Tue, 28 Jan 2020 18:16:53 -0800 Subject: [PATCH 2/7] addressed some reviewer comments --- bert-text-classifier/README.md | 2 +- bert-text-classifier/examples/test.html | 2 +- bert-text-classifier/package.json | 5 ++--- bert-text-classifier/spec/support/jasmine.json | 3 +++ bert-text-classifier/{src => test}/test.ts | 0 5 files changed, 7 insertions(+), 5 deletions(-) create mode 100644 bert-text-classifier/spec/support/jasmine.json rename bert-text-classifier/{src => test}/test.ts (100%) diff --git a/bert-text-classifier/README.md b/bert-text-classifier/README.md index 637cc64..142d559 100644 --- a/bert-text-classifier/README.md +++ b/bert-text-classifier/README.md @@ -1,6 +1,6 @@ # MAX for TensorFlow.js: Text Sentiment Classifier -This is a TensorFlow.js port of the [MAX Human Pose Estimator](https://developer.ibm.com/exchanges/models/all/max-text-sentiment-classifier/) This model is able to detect whether a text fragment leans towards a positive or a negative sentiment. +This is a TensorFlow.js port of the [MAX Text Sentiment Classifier](https://developer.ibm.com/exchanges/models/all/max-text-sentiment-classifier/) This model is able to detect whether a text fragment leans towards a positive or a negative sentiment. 
## Install diff --git a/bert-text-classifier/examples/test.html b/bert-text-classifier/examples/test.html index 3a15c18..d1f3784 100644 --- a/bert-text-classifier/examples/test.html +++ b/bert-text-classifier/examples/test.html @@ -2,7 +2,7 @@ text classifier - + - - - - - + + text classifier + + +

Text Sentiment Classifier

+
+ + +
+

+ + Loading Model... +

+ + + diff --git a/bert-text-classifier/examples/test.js b/bert-text-classifier/examples/test.js index c53dfa1..7674c62 100644 --- a/bert-text-classifier/examples/test.js +++ b/bert-text-classifier/examples/test.js @@ -1,2 +1,3 @@ const tc = require("../dist/src/max.sentimentclass.cjs.js"); tc.predict("i like strawberries").then(res=>console.log(res)); +tc.encode("i like strawberries").then(res=>console.log(res)); \ No newline at end of file diff --git a/bert-text-classifier/src/js/sentimentanalysis.ts b/bert-text-classifier/src/js/sentimentanalysis.ts index 2d7f5b6..b54ddb6 100644 --- a/bert-text-classifier/src/js/sentimentanalysis.ts +++ b/bert-text-classifier/src/js/sentimentanalysis.ts @@ -1,8 +1,8 @@ import WordPieceTokenizer from "../tokenization"; import * as tf from '@tensorflow/tfjs'; -const vocabUrl = 'http://s3.us.cloud-object-storage.appdomain.cloud/bert-sentiment-tfjs/model/vocab.json' -const modelUrl = 'http://s3.us.cloud-object-storage.appdomain.cloud/bert-sentiment-tfjs/model/model.json'; +const vocabUrl = 'https://s3.us-south.cloud-object-storage.appdomain.cloud/max-assets-prod/max-text-sentiment-classifier/tfjs/0.1.0/vocab.json' +const modelUrl = 'https://s3.us-south.cloud-object-storage.appdomain.cloud/max-assets-prod/max-text-sentiment-classifier/tfjs/0.1.0/model.json'; export default class SentimentAnalysis { diff --git a/bert-text-classifier/src/server/sentimentanalysis.ts b/bert-text-classifier/src/server/sentimentanalysis.ts index 0d10970..b3a5869 100644 --- a/bert-text-classifier/src/server/sentimentanalysis.ts +++ b/bert-text-classifier/src/server/sentimentanalysis.ts @@ -1,9 +1,9 @@ import WordPieceTokenizer from "../tokenization"; import * as tf from '@tensorflow/tfjs'; -import {fetchModel, untar} from './util'; +import {IORouter} from '@tensorflow/tfjs-core/dist/io/router_registry'; -const vocabUrl = 'http://s3.us.cloud-object-storage.appdomain.cloud/bert-sentiment-tfjs/model/vocab.json' -const modelArch = 
'http://s3.us.cloud-object-storage.appdomain.cloud/bert-sentiment-tfjs/model.tgz'; +const vocabUrl = 'https://s3.us-south.cloud-object-storage.appdomain.cloud/max-assets-prod/max-text-sentiment-classifier/tfjs/0.1.0/vocab.json' +const modelJsonUrl = 'https://s3.us-south.cloud-object-storage.appdomain.cloud/max-assets-prod/max-text-sentiment-classifier/tfjs/0.1.0/model.json' export default class SentimentAnalysis { private _model: tf.GraphModel; @@ -26,23 +26,19 @@ export default class SentimentAnalysis { const tfn = require('@tensorflow/tfjs-node'); const fs = require('fs'); const path = require('path'); - const modelTgz = path.join(`${__dirname}`, '..', '..', '/model/model.tgz'); const modelJson = path.join(`${__dirname}`,'..', '..', '/model/model.json'); - // console.log(modelJson); const modelDir = path.join(`${__dirname}`, '..', '..','/model'); if(!fs.existsSync(modelJson)){ - await fetchModel(modelArch, modelDir); - await untar(modelTgz,modelDir); + console.log('Downloading Model...'); + tf.io.registerLoadRouter(tfn.io.http as IORouter); + await tf.io.copyModel(modelJsonUrl, 'file://' + modelDir); } - const fileSystem = require('@tensorflow/tfjs-node/dist/io/file_system'); - this._model = await tfn.loadGraphModel(fileSystem.fileSystem(modelJson)); - // console.log(`Model loaded from ${modelJson}.`); + this._model = await tfn.loadGraphModel('file://' + modelJson); } async loadTokenizer(){ this._tokenizer = new WordPieceTokenizer(true); await this.tokenizer.init(vocabUrl); - // console.log("Tokenizer loaded.") } /** * Classify a text input and return a json object with pos and neg diff --git a/bert-text-classifier/src/text-sentiment-classifier.ts b/bert-text-classifier/src/text-sentiment-classifier.ts index dc36a31..0ca355a 100644 --- a/bert-text-classifier/src/text-sentiment-classifier.ts +++ b/bert-text-classifier/src/text-sentiment-classifier.ts @@ -7,20 +7,18 @@ export interface SentRes{ pos: number } +const sa = new SentimentAnalysis(); async function 
processInput(text: string): Promise{ - const sa = new SentimentAnalysis(); - await sa.loadTokenizer() + if(! sa.tokenizer) await sa.loadTokenizer(); return await sa.tokenizer.inputFeature(text); } async function loadModel(): Promise { - const sa = new SentimentAnalysis(); - await sa.loadModel() + if(! sa.model) await sa.loadModel(); return sa.model; } async function runInference(feature: tf.NamedTensorMap): Promise{ - const sa = new SentimentAnalysis(); await sa.init() return await sa.inference(feature); } @@ -31,21 +29,18 @@ function processOutput(res: tf.Tensor): SentRes{ } async function predict(text: string): Promise{ - const sa = new SentimentAnalysis(); await sa.init(); let res = await sa.analyzeText(text); return processOutput(res); } async function encode(text: string): Promise{ - const sa = new SentimentAnalysis(); - await sa.loadTokenizer() + if(! sa.tokenizer) await sa.loadTokenizer(); return await sa.tokenizer.tokenize(text); } async function idsToTokens(ids: number[]){ - const sa = new SentimentAnalysis(); - await sa.loadTokenizer(); + if(! 
sa.tokenizer) await sa.loadTokenizer(); return sa.tokenizer.convertIdsToTokens(ids); } diff --git a/bert-text-classifier/test/test.ts b/bert-text-classifier/test/test.ts index 036ddf7..54d5719 100644 --- a/bert-text-classifier/test/test.ts +++ b/bert-text-classifier/test/test.ts @@ -1,3 +1,5 @@ +/* globals jasmine, describe, it, expect, tf */ + import sa from '../src/text-sentiment-classifier'; import * as tf from '@tensorflow/tfjs'; @@ -59,7 +61,7 @@ describe('Bert Sentiment Classifier', () => { }); beforeEach(function() { // var originalTimeout = jasmine.DEFAULT_TIMEOUT_INTERVAL; - jasmine.DEFAULT_TIMEOUT_INTERVAL = 60000; + jasmine.DEFAULT_TIMEOUT_INTERVAL = 120000; }); it('predict() leans towards pos prediction', async () => { From 61722f5d1efc49cee2cde33a6250e340ee39d096 Mon Sep 17 00:00:00 2001 From: Ted Chang Date: Wed, 12 Feb 2020 14:55:06 -0800 Subject: [PATCH 4/7] add conversion manifest --- .../model/conversion_manifest.json | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 bert-text-classifier/model/conversion_manifest.json diff --git a/bert-text-classifier/model/conversion_manifest.json b/bert-text-classifier/model/conversion_manifest.json new file mode 100644 index 0000000..bf7de50 --- /dev/null +++ b/bert-text-classifier/model/conversion_manifest.json @@ -0,0 +1,124 @@ +{ + "name": "Max-Text-Sentiment-Classifier", + "url": "https://developer.ibm.com/exchanges/models/all/max-text-sentiment-classifier/", + "source": "max-text-sentiment-classifier/1.0/assets.tar.gz/sentiment_BERT_base_uncased/", + "framework": "tf_js", + "converter": { + "tensorflowjs": { + "version": "1.2.9", + "params": { + "output_node_names": "loss/Softmax", + "input_format": "tf_saved_model", + "output_json": true + } + } + }, + "output": [ + "model.json", + "group1-shard1of105.bin", + "group1-shard2of105.bin", + "group1-shard3of105.bin", + "group1-shard4of105.bin", + "group1-shard5of105.bin", + "group1-shard6of105.bin", + "group1-shard7of105.bin", + 
"group1-shard8of105.bin", + "group1-shard9of105.bin", + "group1-shard10of105.bin", + "group1-shard11of105.bin", + "group1-shard12of105.bin", + "group1-shard13of105.bin", + "group1-shard14of105.bin", + "group1-shard15of105.bin", + "group1-shard16of105.bin", + "group1-shard17of105.bin", + "group1-shard18of105.bin", + "group1-shard19of105.bin", + "group1-shard20of105.bin", + "group1-shard21of105.bin", + "group1-shard22of105.bin", + "group1-shard23of105.bin", + "group1-shard24of105.bin", + "group1-shard25of105.bin", + "group1-shard26of105.bin", + "group1-shard27of105.bin", + "group1-shard28of105.bin", + "group1-shard29of105.bin", + "group1-shard30of105.bin", + "group1-shard31of105.bin", + "group1-shard32of105.bin", + "group1-shard33of105.bin", + "group1-shard34of105.bin", + "group1-shard35of105.bin", + "group1-shard36of105.bin", + "group1-shard37of105.bin", + "group1-shard38of105.bin", + "group1-shard39of105.bin", + "group1-shard40of105.bin", + "group1-shard41of105.bin", + "group1-shard42of105.bin", + "group1-shard43of105.bin", + "group1-shard44of105.bin", + "group1-shard45of105.bin", + "group1-shard46of105.bin", + "group1-shard47of105.bin", + "group1-shard48of105.bin", + "group1-shard49of105.bin", + "group1-shard50of105.bin", + "group1-shard51of105.bin", + "group1-shard52of105.bin", + "group1-shard53of105.bin", + "group1-shard54of105.bin", + "group1-shard55of105.bin", + "group1-shard56of105.bin", + "group1-shard57of105.bin", + "group1-shard58of105.bin", + "group1-shard59of105.bin", + "group1-shard60of105.bin", + "group1-shard61of105.bin", + "group1-shard62of105.bin", + "group1-shard63of105.bin", + "group1-shard64of105.bin", + "group1-shard65of105.bin", + "group1-shard66of105.bin", + "group1-shard67of105.bin", + "group1-shard68of105.bin", + "group1-shard69of105.bin", + "group1-shard70of105.bin", + "group1-shard71of105.bin", + "group1-shard72of105.bin", + "group1-shard73of105.bin", + "group1-shard74of105.bin", + "group1-shard75of105.bin", + "group1-shard76of105.bin", + 
"group1-shard77of105.bin", + "group1-shard78of105.bin", + "group1-shard79of105.bin", + "group1-shard80of105.bin", + "group1-shard81of105.bin", + "group1-shard82of105.bin", + "group1-shard83of105.bin", + "group1-shard84of105.bin", + "group1-shard85of105.bin", + "group1-shard86of105.bin", + "group1-shard87of105.bin", + "group1-shard88of105.bin", + "group1-shard89of105.bin", + "group1-shard90of105.bin", + "group1-shard91of105.bin", + "group1-shard92of105.bin", + "group1-shard93of105.bin", + "group1-shard94of105.bin", + "group1-shard95of105.bin", + "group1-shard96of105.bin", + "group1-shard97of105.bin", + "group1-shard98of105.bin", + "group1-shard99of105.bin", + "group1-shard100of105.bin", + "group1-shard101of105.bin", + "group1-shard102of105.bin", + "group1-shard103of105.bin", + "group1-shard104of105.bin", + "group1-shard105of105.bin" + ] +} \ No newline at end of file From 5c1647104d2c08fb7510c2ac7627890313c7e77f Mon Sep 17 00:00:00 2001 From: ted chang Date: Mon, 17 Feb 2020 12:08:32 -0800 Subject: [PATCH 5/7] change relative path --- bert-text-classifier/examples/test.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bert-text-classifier/examples/test.html b/bert-text-classifier/examples/test.html index 20ac6d6..6d31a8d 100644 --- a/bert-text-classifier/examples/test.html +++ b/bert-text-classifier/examples/test.html @@ -14,7 +14,7 @@

Text Sentiment Classifier

Loading Model...

- + diff --git a/bert-text-classifier/examples/test.js b/bert-text-classifier/examples/test.js index 7674c62..86c58f0 100644 --- a/bert-text-classifier/examples/test.js +++ b/bert-text-classifier/examples/test.js @@ -1,3 +1,3 @@ const tc = require("../dist/src/max.sentimentclass.cjs.js"); tc.predict("i like strawberries").then(res=>console.log(res)); -tc.encode("i like strawberries").then(res=>console.log(res)); \ No newline at end of file +tc.encode("i like strawberries").then(res=>console.log(res)); diff --git a/bert-text-classifier/src/text-sentiment-classifier.ts b/bert-text-classifier/src/text-sentiment-classifier.ts index 0ca355a..925bc76 100644 --- a/bert-text-classifier/src/text-sentiment-classifier.ts +++ b/bert-text-classifier/src/text-sentiment-classifier.ts @@ -19,7 +19,7 @@ async function loadModel(): Promise { } async function runInference(feature: tf.NamedTensorMap): Promise{ - await sa.init() + await sa.init(); return await sa.inference(feature); } @@ -30,8 +30,10 @@ function processOutput(res: tf.Tensor): SentRes{ async function predict(text: string): Promise{ await sa.init(); - let res = await sa.analyzeText(text); - return processOutput(res); + return sa.analyzeText(text) + .catch((err) => console.log(err)) + .then(processOutput); + } async function encode(text: string): Promise{ From 1ab70b00278fc6f9e10ce399885ab77823b11e73 Mon Sep 17 00:00:00 2001 From: Ted Chang Date: Tue, 18 Feb 2020 17:59:58 -0800 Subject: [PATCH 7/7] fix duplicate file handler error --- bert-text-classifier/package.json | 1 - bert-text-classifier/src/server/sentimentanalysis.ts | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/bert-text-classifier/package.json b/bert-text-classifier/package.json index 750090b..1defa6a 100644 --- a/bert-text-classifier/package.json +++ b/bert-text-classifier/package.json @@ -45,7 +45,6 @@ "homepage": "https://github.com/CODAIT/max-tfjs-models/tree/master/bert-text-classifier#readme", "dependencies": { 
"@tensorflow-models/universal-sentence-encoder": "^1.2.1", - "@tensorflow/tfjs": "^1.5.1", "@tensorflow/tfjs-node": "^1.5.1", "@types/express": "^4.16.1", "@types/tar": "^4.0.3", diff --git a/bert-text-classifier/src/server/sentimentanalysis.ts b/bert-text-classifier/src/server/sentimentanalysis.ts index b3a5869..702f798 100644 --- a/bert-text-classifier/src/server/sentimentanalysis.ts +++ b/bert-text-classifier/src/server/sentimentanalysis.ts @@ -1,9 +1,10 @@ import WordPieceTokenizer from "../tokenization"; -import * as tf from '@tensorflow/tfjs'; +import * as tf from '@tensorflow/tfjs-node'; import {IORouter} from '@tensorflow/tfjs-core/dist/io/router_registry'; const vocabUrl = 'https://s3.us-south.cloud-object-storage.appdomain.cloud/max-assets-prod/max-text-sentiment-classifier/tfjs/0.1.0/vocab.json' const modelJsonUrl = 'https://s3.us-south.cloud-object-storage.appdomain.cloud/max-assets-prod/max-text-sentiment-classifier/tfjs/0.1.0/model.json' +tf.io.registerLoadRouter(tf.io.http as IORouter); export default class SentimentAnalysis { private _model: tf.GraphModel; @@ -23,17 +24,16 @@ export default class SentimentAnalysis { } async loadModel(){ - const tfn = require('@tensorflow/tfjs-node'); const fs = require('fs'); const path = require('path'); const modelJson = path.join(`${__dirname}`,'..', '..', '/model/model.json'); const modelDir = path.join(`${__dirname}`, '..', '..','/model'); if(!fs.existsSync(modelJson)){ console.log('Downloading Model...'); - tf.io.registerLoadRouter(tfn.io.http as IORouter); await tf.io.copyModel(modelJsonUrl, 'file://' + modelDir); } - this._model = await tfn.loadGraphModel('file://' + modelJson); + const fileSystem = require('@tensorflow/tfjs-node/dist/io/file_system'); + this._model = await tf.loadGraphModel(fileSystem.fileSystem(modelJson)); } async loadTokenizer(){