diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d5c1656 --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +.idea \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index e69de29..85a4efa 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,18 @@ +# BertViz + +Tool for visualizing BERT's attention layers + + +## Authors + +* **Jesse Vig** - *Initial work* - [jessevig](https://github.com/jessevig) + +## License + +This project is licensed under the Apache 2.0 License - see the [LICENSE.md](LICENSE.md) file for details + +## Acknowledgments + +This project incorporates code from the following repos: +* https://github.com/tensorflow/tensor2tensor +* https://github.com/huggingface/pytorch-pretrained-BERT diff --git a/bertviz/__init__.py b/bertviz/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bertviz/attention.js b/bertviz/attention.js new file mode 100644 index 0000000..676ff00 --- /dev/null +++ b/bertviz/attention.js @@ -0,0 +1,363 @@ +/** + * @fileoverview Transformer Visualization D3 javascript code. + */ + +requirejs(['jquery', 'd3'], +function($, d3) { + +var attention = window.attention; + +const TEXT_SIZE = 15; +const BOXWIDTH = TEXT_SIZE * 8; +const BOXHEIGHT = TEXT_SIZE * 1.5; +const WIDTH = 2000; +const HEIGHT = attention.all.bot_text.length * BOXHEIGHT * 2 + 100; +const MATRIX_WIDTH = 150; +const head_colours = d3.scale.category10(); +const CHECKBOX_SIZE = 20; + +function lighten(colour) { + var c = d3.hsl(colour); + var increment = (1 - c.l) * 0.6; + c.l += increment; + c.s -= increment; + return c; +} + +function transpose(mat) { + return mat[0].map(function(col, i) { + return mat.map(function(row) { + return row[i]; + }); + }); +} + +function zip(a, b) { + return a.map(function (e, i) { + return [e, b[i]]; + }); +} + + +function renderVis(id, top_text, bot_text, attention_heads, config) { + $(id).empty(); + var svg = d3.select(id) + .append('svg') + .attr("width", WIDTH) + .attr("height", HEIGHT); + + var att_data = []; + for (var i=0; i < attention_heads.length; i++) { + var att_trans = transpose(attention_heads[i]); + att_data.push(zip(attention_heads[i], att_trans)); + } + + renderText(svg, top_text, true, att_data, 0); + renderText(svg, bot_text, false, att_data, MATRIX_WIDTH + BOXWIDTH); + + renderAttentionHighlights(svg, att_data); + + svg.append("g").classed("attention_heads", true); + + renderAttention(svg, attention_heads); + + draw_checkboxes(config, 0, svg, attention_heads); +} + + +function renderText(svg, text, is_top, att_data, left_pos) { + var id = is_top ? "top" : "bottom"; + var textContainer = svg.append("svg:g") + .attr("id", id); + + textContainer.append("g").classed("attention_boxes", true) + .selectAll("g") + .data(att_data) + .enter() + .append("g") + .selectAll("rect") + .data(function(d) {return d;}) + .enter() + .append("rect") + .attr("x", function(d, i, j) { + return left_pos + box_offset(j); + }) + .attr("y", function(d, i) { + return (+1) * BOXHEIGHT; + }) + .attr("width", BOXWIDTH/active_heads()) + .attr("height", function() { return BOXHEIGHT; }) + .attr("fill", function(d, i, j) { + return head_colours(j); + }) + .style("opacity", 0.0); + + + var tokenContainer = textContainer.append("g").selectAll("g") + .data(text) + .enter() + .append("g"); + + tokenContainer.append("rect") + .classed("background", true) + .style("opacity", 0.0) + .attr("fill", "lightgray") + .attr("x", left_pos) + .attr("y", function(d, i) { + return (i+1) * BOXHEIGHT; + }) + .attr("width", BOXWIDTH) + .attr("height", BOXHEIGHT); + + var theText = tokenContainer.append("text") + .text(function(d) { return d; }) + .attr("font-size", TEXT_SIZE + "px") + .style("cursor", "default") + .style("-webkit-user-select", "none") + .attr("x", left_pos) + .attr("y", function(d, i) { + return (i+1) * BOXHEIGHT; + }); + + if (is_top) { + theText.style("text-anchor", "end") + .attr("dx", BOXWIDTH - TEXT_SIZE) + .attr("dy", TEXT_SIZE); + } else { + theText.style("text-anchor", "start") + .attr("dx", + TEXT_SIZE) + .attr("dy", TEXT_SIZE); + } + + tokenContainer.on("mouseover", function(d, index) { + textContainer.selectAll(".background") + .style("opacity", function(d, i) { + return i == index ? 1.0 : 0.0; + }); + + svg.selectAll(".attention_heads").style("display", "none"); + + svg.selectAll(".line_heads") // To get the nesting to work. + .selectAll(".att_lines") + .attr("stroke-opacity", function(d) { + return 1.0; + }) + .attr("y1", function(d, i) { + if (is_top) { + return (index+1) * BOXHEIGHT + (BOXHEIGHT/2); + } else { + return (i+1) * BOXHEIGHT + (BOXHEIGHT/2); + } + }) + .attr("x1", BOXWIDTH) + .attr("y2", function(d, i) { + if (is_top) { + return (i+1) * BOXHEIGHT + (BOXHEIGHT/2); + } else { + return (index+1) * BOXHEIGHT + (BOXHEIGHT/2); + } + }) + .attr("x2", BOXWIDTH + MATRIX_WIDTH) + .attr("stroke-width", 2) + .attr("stroke", function(d, i, j) { + return head_colours(j); + }) + .attr("stroke-opacity", function(d, i, j) { + if (is_top) {d = d[0];} else {d = d[1];} + if (config.head_vis[j]) { + if (d) { + return d[index]; + } else { + return 0.0; + } + } else { + return 0.0; + } + }); + + + function updateAttentionBoxes() { + var id = is_top ? "bottom" : "top"; + var the_left_pos = is_top ? MATRIX_WIDTH + BOXWIDTH : 0; + svg.select("#" + id) + .selectAll(".attention_boxes") + .selectAll("g") + .selectAll("rect") + .attr("x", function(d, i, j) { return the_left_pos + box_offset(j); }) + .attr("y", function(d, i) { return (i+1) * BOXHEIGHT; }) + .attr("width", BOXWIDTH/active_heads()) + .attr("height", function() { return BOXHEIGHT; }) + .style("opacity", function(d, i, j) { + if (is_top) {d = d[0];} else {d = d[1];} + if (config.head_vis[j]) + if (d) { + return d[index]; + } else { + return 0.0; + } + else + return 0.0; + + }); + } + + updateAttentionBoxes(); + }); + + textContainer.on("mouseleave", function() { + d3.select(this).selectAll(".background") + .style("opacity", 0.0); + + svg.selectAll(".att_lines").attr("stroke-opacity", 0.0); + svg.selectAll(".attention_heads").style("display", "inline"); + svg.selectAll(".attention_boxes") + .selectAll("g") + .selectAll("rect") + .style("opacity", 0.0); + }); +} + +function renderAttentionHighlights(svg, attention) { + var line_container = svg.append("g"); + line_container.selectAll("g") + .data(attention) + .enter() + .append("g") + .classed("line_heads", true) + .selectAll("line") + .data(function(d){return d;}) + .enter() + .append("line").classed("att_lines", true); +} + +function renderAttention(svg, attention_heads) { + var line_container = svg.selectAll(".attention_heads"); + line_container.html(null); + for(var h=0; h").val(i).text(i)); +} + +$("#layer").on('change', function(e) { + config.layer = +e.currentTarget.value; + render(); +}); + +$("#att_type").on('change', function(e) { + config.att_type = e.currentTarget.value; + render(); +}); + +$("button").on('click', visualize); + +visualize(); + +}); \ No newline at end of file diff --git a/bertviz/attention.py b/bertviz/attention.py new file mode 100644 index 0000000..6b0ebcf --- /dev/null +++ b/bertviz/attention.py @@ -0,0 +1,137 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Changes made by Jesse Vig on 12/12/18 +# - Adapted to BERT model + + +"""Module for postprocessing and displaying transformer attentions. + +This module is designed to be called from an ipython notebook. +""" + +import json +import os + +import IPython.display as display + +import numpy as np + +vis_html = """ + + Layer: + Attention: + +
+""" + +__location__ = os.path.realpath( + os.path.join(os.getcwd(), os.path.dirname(__file__))) +vis_js = open(os.path.join(__location__, 'attention.js')).read() + + +def show(tokens_a, tokens_b, attn): + """Displays attention visualization""" + attentions = _get_attentions(tokens_a, tokens_b, attn) + att_json = json.dumps(attentions) + _show_attention(att_json) + + +def _show_attention(att_json): + display.display(display.HTML(vis_html)) + display.display(display.Javascript('window.attention = %s' % att_json)) + display.display(display.Javascript(vis_js)) + + +def _get_attentions(tokens_a, tokens_b, attn): + """Compute representation of the attention ready for the d3 visualization. + + Args: + tokens_a: list of strings, words to be displayed on the left of the vis + tokens_b: list of strings, words to be displayed on the right of the vis + attn: numpy array, attention + [num_layers, batch_size, num_heads, seq_len, seq_len] + + + Returns: + Dictionary of attention representations with the structure: + { + 'all': Representations for showing all attentions at the same time. + 'a': Sentence A self-attention + 'b': Sentence B self-attention + 'ab': Sentence A -> Sentence B attention + 'ba': Sentence B -> Sentence A attention + } + and each sub-dictionary has structure: + { + 'att': list of inter attentions matrices, one for each attention head + 'top_text': list of strings, words to be displayed on the left of the vis + 'bot_text5': list of strings, words to be displayed on the right of the vis + } + """ + def format_mat(mat): + return mat.transpose(0, 2, 1).tolist() + + + all_attns = [] + a_attns = [] + b_attns = [] + ab_attns = [] + ba_attns = [] + slice_a = slice(0, len(tokens_a)) + slice_b = slice(len(tokens_a), len(tokens_a) + len(tokens_b)) + num_layers = len(attn) + for layer in range(num_layers): + layer_attn = attn[layer][0] + all_attns.append(format_mat(layer_attn)) + a_attns.append(format_mat(layer_attn[:, slice_a, slice_a])) + b_attns.append(format_mat(layer_attn[:, slice_b, slice_b])) + ab_attns.append(format_mat(layer_attn[:, slice_a, slice_b])) + ba_attns.append(format_mat(layer_attn[:, slice_b, slice_a])) + + attentions = { + 'all': { + 'att': all_attns, + 'top_text': tokens_a + tokens_b, + 'bot_text': tokens_a + tokens_b + }, + 'a': { + 'att': a_attns, + 'top_text': tokens_a, + 'bot_text': tokens_a + }, + 'b': { + 'att': b_attns, + 'top_text': tokens_b, + 'bot_text': tokens_b + }, + 'ab': { + 'att': ab_attns, + 'top_text': tokens_a, + 'bot_text': tokens_b + }, + 'ba': { + 'att': ba_attns, + 'top_text': tokens_b, + 'bot_text': tokens_a + } + } + return attentions diff --git a/bertviz/bertviz.ipynb b/bertviz/bertviz.ipynb new file mode 100644 index 0000000..6de58c1 --- /dev/null +++ b/bertviz/bertviz.ipynb @@ -0,0 +1,76 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from bertviz import attention, visualization\n", + "import numpy as np\n", + "from pytorch_pretrained_bert import BertModel, BertTokenizer, BertConfig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "%%javascript\n", + "require.config({\n", + " paths: {\n", + " d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min'\n", + " }\n", + "});\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# config = BertConfig.from_json_file('tests/fixtures/config.json')\n", + "# model = BertModel(config)\n", + "# tokenizer = BertTokenizer('tests/fixtures/vocab.txt')\n", + "# attention_visualizer = visualization.AttentionVisualizer(model, tokenizer)\n", + "# sentence1 = 'The quickest brown fox'\n", + "# sentence2 = \"the quick brown fox jumped over the laziest lazy elmo\"\n", + "# tokens_a, tokens_b, attn = attention_visualizer.get_viz_data(sentence1, sentence2)\n", + "# attention.show(tokens_a, tokens_b, attn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bert_version = 'bert-base-uncased'\n", + "model = BertModel.from_pretrained(bert_version)\n", + "tokenizer = BertTokenizer.from_pretrained(bert_version)\n", + "# sentence_a = 'The quickest brown fox jumped over the lazy dog'\n", + "# sentence_b = \"the quick brown fox jumped over the laziest lazy elmo\"\n", + "sentence_a = \"I went to the store.\"\n", + "sentence_b = \"At the store, I bought fresh strawberries.\"\n", + "attention_visualizer = visualization.AttentionVisualizer(model, tokenizer)\n", + "tokens_a, tokens_b, attn = attention_visualizer.get_viz_data(sentence_a, sentence_b)\n", + "attention.show(tokens_a, tokens_b, attn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/bertviz/debug.py b/bertviz/debug.py new file mode 100644 index 0000000..c2e8970 --- /dev/null +++ b/bertviz/debug.py @@ -0,0 +1,15 @@ +from bertviz.visualization import AttentionVisualizer +from bertviz.attention import _get_attentions, show +from bertviz.pytorch_pretrained_bert import BertTokenizer, BertModel, BertConfig +import numpy as np +import unittest + + +config = BertConfig.from_json_file('tests/fixtures/config.json') +model = BertModel(config) +tokenizer = BertTokenizer('tests/fixtures/vocab.txt') +attention_visualizer = AttentionVisualizer(model, tokenizer) +sentence1 = 'The quickest brown fox jumped over the lazy dog' +sentence2 = "the quick brown fox jumped over the laziest lazy elmo" +tokens_a, tokens_b, attn = attention_visualizer.get_viz_data(sentence1, sentence2) +show(tokens_a, tokens_b, attn) \ No newline at end of file diff --git a/bertviz/pytorch_pretrained_bert/__init__.py b/bertviz/pytorch_pretrained_bert/__init__.py new file mode 100755 index 0000000..fc9b15a --- /dev/null +++ b/bertviz/pytorch_pretrained_bert/__init__.py @@ -0,0 +1,7 @@ +from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer +from .modeling import (BertConfig, BertModel, BertForPreTraining, + BertForMaskedLM, BertForNextSentencePrediction, + BertForSequenceClassification, BertForTokenClassification, + BertForQuestionAnswering) +from .optimization import BertAdam +from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE diff --git a/bertviz/pytorch_pretrained_bert/__main__.py b/bertviz/pytorch_pretrained_bert/__main__.py new file mode 100755 index 0000000..73f1909 --- /dev/null +++ b/bertviz/pytorch_pretrained_bert/__main__.py @@ -0,0 +1,19 @@ +# coding: utf8 +if __name__ == '__main__': + import sys + try: + from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch + except ModuleNotFoundError: + print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + + if len(sys.argv) != 5: + # pylint: disable=line-too-long + print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") + else: + PYTORCH_DUMP_OUTPUT = sys.argv.pop() + TF_CONFIG = sys.argv.pop() + TF_CHECKPOINT = sys.argv.pop() + convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) diff --git a/bertviz/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py b/bertviz/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py new file mode 100755 index 0000000..20fdd8c --- /dev/null +++ b/bertviz/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py @@ -0,0 +1,112 @@ +# coding=utf-8 +# Copyright 2018 The HugginFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert BERT checkpoint.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import re +import argparse +import tensorflow as tf +import torch +import numpy as np + +from .modeling import BertConfig, BertForPreTraining + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): + config_path = os.path.abspath(bert_config_file) + tf_path = os.path.abspath(tf_checkpoint_path) + print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + print("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + # Initialise PyTorch model + config = BertConfig.from_json_file(bert_config_file) + print("Building PyTorch model from configuration: {}".format(str(config))) + model = BertForPreTraining(config) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if name[-1] in ["adam_v", "adam_m"]: + print("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'kernel': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + + # Save pytorch-model + print("Save PyTorch model to {}".format(pytorch_dump_path)) + torch.save(model.state_dict(), pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--tf_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path the TensorFlow checkpoint path.") + parser.add_argument("--bert_config_file", + default = None, + type = str, + required = True, + help = "The config json file corresponding to the pre-trained BERT model. \n" + "This specifies the model architecture.") + parser.add_argument("--pytorch_dump_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, + args.bert_config_file, + args.pytorch_dump_path) diff --git a/bertviz/pytorch_pretrained_bert/file_utils.py b/bertviz/pytorch_pretrained_bert/file_utils.py new file mode 100755 index 0000000..f734b7e --- /dev/null +++ b/bertviz/pytorch_pretrained_bert/file_utils.py @@ -0,0 +1,233 @@ +""" +Utilities for working with the local dataset cache. +This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp +Copyright by the AllenNLP authors. +""" + +import os +import logging +import shutil +import tempfile +import json +from urllib.parse import urlparse +from pathlib import Path +from typing import Optional, Tuple, Union, IO, Callable, Set +from hashlib import sha256 +from functools import wraps + +from tqdm import tqdm + +import boto3 +from botocore.exceptions import ClientError +import requests + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + +PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', + Path.home() / '.pytorch_pretrained_bert')) + + +def url_to_filename(url: str, etag: str = None) -> str: + """ + Convert `url` into a hashed filename in a repeatable way. + If `etag` is specified, append its hash to the url's, delimited + by a period. + """ + url_bytes = url.encode('utf-8') + url_hash = sha256(url_bytes) + filename = url_hash.hexdigest() + + if etag: + etag_bytes = etag.encode('utf-8') + etag_hash = sha256(etag_bytes) + filename += '.' + etag_hash.hexdigest() + + return filename + + +def filename_to_url(filename: str, cache_dir: str = None) -> Tuple[str, str]: + """ + Return the url and etag (which may be ``None``) stored for `filename`. + Raise ``FileNotFoundError`` if `filename` or its stored metadata do not exist. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + + cache_path = os.path.join(cache_dir, filename) + if not os.path.exists(cache_path): + raise FileNotFoundError("file {} not found".format(cache_path)) + + meta_path = cache_path + '.json' + if not os.path.exists(meta_path): + raise FileNotFoundError("file {} not found".format(meta_path)) + + with open(meta_path) as meta_file: + metadata = json.load(meta_file) + url = metadata['url'] + etag = metadata['etag'] + + return url, etag + + +def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str: + """ + Given something that might be a URL (or might be a local path), + determine which. If it's a URL, download the file and cache it, and + return the path to the cached file. If it's already a local path, + make sure the file exists and then return the path. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if isinstance(url_or_filename, Path): + url_or_filename = str(url_or_filename) + + parsed = urlparse(url_or_filename) + + if parsed.scheme in ('http', 'https', 's3'): + # URL, so get it from the cache (downloading if necessary) + return get_from_cache(url_or_filename, cache_dir) + elif os.path.exists(url_or_filename): + # File, and it exists. + return url_or_filename + elif parsed.scheme == '': + # File, but it doesn't exist. + raise FileNotFoundError("file {} not found".format(url_or_filename)) + else: + # Something unknown + raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) + + +def split_s3_path(url: str) -> Tuple[str, str]: + """Split a full s3 path into the bucket name and path.""" + parsed = urlparse(url) + if not parsed.netloc or not parsed.path: + raise ValueError("bad s3 path {}".format(url)) + bucket_name = parsed.netloc + s3_path = parsed.path + # Remove '/' at beginning of path. + if s3_path.startswith("/"): + s3_path = s3_path[1:] + return bucket_name, s3_path + + +def s3_request(func: Callable): + """ + Wrapper function for s3 requests in order to create more helpful error + messages. + """ + + @wraps(func) + def wrapper(url: str, *args, **kwargs): + try: + return func(url, *args, **kwargs) + except ClientError as exc: + if int(exc.response["Error"]["Code"]) == 404: + raise FileNotFoundError("file {} not found".format(url)) + else: + raise + + return wrapper + + +@s3_request +def s3_etag(url: str) -> Optional[str]: + """Check ETag on S3 object.""" + s3_resource = boto3.resource("s3") + bucket_name, s3_path = split_s3_path(url) + s3_object = s3_resource.Object(bucket_name, s3_path) + return s3_object.e_tag + + +@s3_request +def s3_get(url: str, temp_file: IO) -> None: + """Pull a file directly from S3.""" + s3_resource = boto3.resource("s3") + bucket_name, s3_path = split_s3_path(url) + s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) + + +def http_get(url: str, temp_file: IO) -> None: + req = requests.get(url, stream=True) + content_length = req.headers.get('Content-Length') + total = int(content_length) if content_length is not None else None + progress = tqdm(unit="B", total=total) + for chunk in req.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + + +def get_from_cache(url: str, cache_dir: str = None) -> str: + """ + Given a URL, look for the corresponding dataset in the local cache. + If it's not there, download it. Then return the path to the cached file. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + + os.makedirs(cache_dir, exist_ok=True) + + # Get eTag to add to filename, if it exists. + if url.startswith("s3://"): + etag = s3_etag(url) + else: + response = requests.head(url, allow_redirects=True) + if response.status_code != 200: + raise IOError("HEAD request failed for url {} with status code {}" + .format(url, response.status_code)) + etag = response.headers.get("ETag") + + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + if not os.path.exists(cache_path): + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. + with tempfile.NamedTemporaryFile() as temp_file: + logger.info("%s not found in cache, downloading to %s", url, temp_file.name) + + # GET file object + if url.startswith("s3://"): + s3_get(url, temp_file) + else: + http_get(url, temp_file) + + # we are copying the file before closing it, so flush to avoid truncation + temp_file.flush() + # shutil.copyfileobj() starts at the current position, so go to the start + temp_file.seek(0) + + logger.info("copying %s to cache at %s", temp_file.name, cache_path) + with open(cache_path, 'wb') as cache_file: + shutil.copyfileobj(temp_file, cache_file) + + logger.info("creating metadata file for %s", cache_path) + meta = {'url': url, 'etag': etag} + meta_path = cache_path + '.json' + with open(meta_path, 'w') as meta_file: + json.dump(meta, meta_file) + + logger.info("removing temp file %s", temp_file.name) + + return cache_path + + +def read_set_from_file(filename: str) -> Set[str]: + ''' + Extract a de-duped collection (set) of text from a file. + Expected file format is one item per line. + ''' + collection = set() + with open(filename, 'r') as file_: + for line in file_: + collection.add(line.rstrip()) + return collection + + +def get_file_extension(path: str, dot=True, lower: bool = True): + ext = os.path.splitext(path)[1] + ext = ext if dot else ext[1:] + return ext.lower() if lower else ext diff --git a/bertviz/pytorch_pretrained_bert/modeling.py b/bertviz/pytorch_pretrained_bert/modeling.py new file mode 100755 index 0000000..7a820ea --- /dev/null +++ b/bertviz/pytorch_pretrained_bert/modeling.py @@ -0,0 +1,1054 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Changes made by Jesse Vig on 12/12/18: +# - Return attention weights from BertModel +# + +"""PyTorch BERT model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import copy +import json +import math +import logging +import tarfile +import tempfile +import shutil + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from .file_utils import cached_path + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + +PRETRAINED_MODEL_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", +} +CONFIG_NAME = 'bert_config.json' +WEIGHTS_NAME = 'pytorch_model.bin' + + +def gelu(x): + """Implementation of the gelu activation function. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} + + +class BertConfig(object): + """Configuration class to store the configuration of a `BertModel`. + """ + + def __init__(self, + vocab_size_or_config_json_file, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02): + """Constructs BertConfig. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + if isinstance(vocab_size_or_config_json_file, str): + with open(vocab_size_or_config_json_file, "r") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + else: + raise ValueError("First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)") + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class BertLayerNorm(nn.Module): + def __init__(self, config, variance_epsilon=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square root). + """ + super(BertLayerNorm, self).__init__() + self.gamma = nn.Parameter(torch.ones(config.hidden_size)) + self.beta = nn.Parameter(torch.zeros(config.hidden_size)) + self.variance_epsilon = variance_epsilon + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.gamma * x + self.beta + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + seq_length = input_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer, attention_probs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask): + self_output, attention_probs = self.self(input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output, attention_probs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask): + attention_output, attention_probs = self.attention(hidden_states, attention_mask) + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output, attention_probs + + +class BertEncoder(nn.Module): + def __init__(self, config): + super(BertEncoder, self).__init__() + layer = BertLayer(config) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): + all_encoder_layers = [] + attention_probs_list = [] + for layer_module in self.layer: + hidden_states, attention_probs = layer_module(hidden_states, attention_mask) + attention_probs_list.append(attention_probs) + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + if not output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + attention_tensor = torch.stack(attention_probs_list) + return all_encoder_layers, attention_tensor + + +class BertPooler(nn.Module): + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.transform_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + self.LayerNorm = BertLayerNorm(config) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(bert_model_embedding_weights.size(1), + bert_model_embedding_weights.size(0), + bias=False) + self.decoder.weight = bert_model_embedding_weights + self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0))) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + self.bias + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertOnlyMLMHead, self).__init__() + self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + def __init__(self, config): + super(BertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class PreTrainedBertModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super(PreTrainedBertModel, self).__init__() + if not isinstance(config, BertConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " + "To create a model from a Google pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + )) + self.config = config + + def init_bert_weights(self, module): + """ Initialize the weights. + """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.beta.data.normal_(mean=0.0, std=self.config.initializer_range) + module.gamma.data.normal_(mean=0.0, std=self.config.initializer_range) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + @classmethod + def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-base-multilingual` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ + if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP: + archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name] + else: + archive_file = pretrained_model_name + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + except FileNotFoundError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name, + ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + archive_file)) + return None + if resolved_archive_file == archive_file: + logger.info("loading archive file {}".format(archive_file)) + else: + logger.info("loading archive file {} from cache at {}".format( + archive_file, resolved_archive_file)) + tempdir = None + if os.path.isdir(resolved_archive_file): + serialization_dir = resolved_archive_file + else: + # Extract archive to temp dir + tempdir = tempfile.mkdtemp() + logger.info("extracting archive file {} to temp dir {}".format( + resolved_archive_file, tempdir)) + with tarfile.open(resolved_archive_file, 'r:gz') as archive: + archive.extractall(tempdir) + serialization_dir = tempdir + # Load config + config_file = os.path.join(serialization_dir, CONFIG_NAME) + config = BertConfig.from_json_file(config_file) + logger.info("Model config {}".format(config)) + # Instantiate model. + model = cls(config, *inputs, **kwargs) + weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) + state_dict = torch.load(weights_path) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + load(model, prefix='' if hasattr(model, 'bert') else 'bert.') + if len(missing_keys) > 0: + logger.info("Weights of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info("Weights from pretrained model not used in {}: {}".format( + model.__class__.__name__, unexpected_keys)) + if tempdir: + # Clean up temp dir + shutil.rmtree(tempdir) + return model + + +class BertModel(PreTrainedBertModel): + """BERT model ("Bidirectional Embedding Representations from a Transformer"). + + Params: + config: a BertConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. + + Outputs: Tuple of (encoded_layers, pooled_output) + `encoded_layers`: controled by `output_all_encoded_layers` argument: + - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end + of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each + encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], + - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding + to the last attention block of shape [batch_size, sequence_length, hidden_size], + `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a + classifier pretrained on top of the hidden state associated to the first character of the + input (`CLF`) to train on the Next-Sentence task (see BERT's paper). + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = modeling.BertModel(config=config) + all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertModel, self).__init__(config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers, attention_tensor = self.encoder(embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers) + sequence_output = encoded_layers[-1] + pooled_output = self.pooler(sequence_output) + if not output_all_encoded_layers: + encoded_layers = encoded_layers[-1] + return encoded_layers, pooled_output, attention_tensor + + +class BertForPreTraining(PreTrainedBertModel): + """BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads: + - the masked language modeling head, and + - the next sentence classification head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + + Outputs: + if `masked_lm_labels` and `next_sentence_label` are not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `masked_lm_labels` or `next_sentence_label` is `None`: + Outputs a tuple comprising + - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and + - the next sentence classification logits of shape [batch_size, 2]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForPreTraining(config) + masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForPreTraining, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, + next_sentence_label=None): + sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False) + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + if masked_lm_labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + return total_loss + else: + return prediction_scores, seq_relationship_score + + +class BertForMaskedLM(PreTrainedBertModel): + """BERT model with the masked language modeling head. + This module comprises the BERT model followed by the masked language modeling head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + + Outputs: + if `masked_lm_labels` is `None`: + Outputs the masked language modeling loss. + if `masked_lm_labels` is `None`: + Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForMaskedLM(config) + masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForMaskedLM, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None): + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False) + prediction_scores = self.cls(sequence_output) + + if masked_lm_labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + return masked_lm_loss + else: + return prediction_scores + + +class BertForNextSentencePrediction(PreTrainedBertModel): + """BERT model with next sentence prediction head. + This module comprises the BERT model followed by the next sentence classification head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + + Outputs: + if `next_sentence_label` is not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `next_sentence_label` is `None`: + Outputs the next sentence classification logits of shape [batch_size, 2]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForNextSentencePrediction(config) + seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForNextSentencePrediction, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None): + _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False) + seq_relationship_score = self.cls(pooled_output) + + if next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + return next_sentence_loss + else: + return seq_relationship_score + + +class BertForSequenceClassification(PreTrainedBertModel): + """BERT model for classification. + This module is composed of the BERT model with a linear layer on top of + the pooled output. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_labels]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForSequenceClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_labels=2): + super(BertForSequenceClassification, self).__init__(config) + self.num_labels = num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): + _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + else: + return logits + + +class BertForTokenClassification(PreTrainedBertModel): + """BERT model for token-level classification. + This module is composed of the BERT model with a linear layer on top of + the full hidden state of the last layer. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_labels]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, sequence_length, num_labels]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForTokenClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_labels=2): + super(BertForTokenClassification, self).__init__(config) + self.num_labels = num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + else: + return logits + + +class BertForQuestionAnswering(PreTrainedBertModel): + """BERT model for Question Answering (span extraction). + This module is composed of the BERT model with a linear layer on top of + the sequence output that computes start_logits and end_logits + + Params: + `config`: either + - a BertConfig class instance with the configuration to build a new model, or + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-base-multilingual` + . `bert-base-chinese` + The pre-trained model will be downloaded and cached if needed. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. + `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. + + Outputs: + if `start_positions` and `end_positions` are not `None`: + Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions. + if `start_positions` or `end_positions` is `None`: + Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end + position tokens of shape [batch_size, sequence_length]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForQuestionAnswering(config) + start_logits, end_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForQuestionAnswering, self).__init__(config) + self.bert = BertModel(config) + # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version + # self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None): + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + return total_loss + else: + return start_logits, end_logits diff --git a/bertviz/pytorch_pretrained_bert/optimization.py b/bertviz/pytorch_pretrained_bert/optimization.py new file mode 100755 index 0000000..4314c84 --- /dev/null +++ b/bertviz/pytorch_pretrained_bert/optimization.py @@ -0,0 +1,162 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch optimization for BERT model.""" + +import math +import torch +from torch.optim import Optimizer +from torch.optim.optimizer import required +from torch.nn.utils import clip_grad_norm_ + +def warmup_cosine(x, warmup=0.002): + if x < warmup: + return x/warmup + return 0.5 * (1.0 + torch.cos(math.pi * x)) + +def warmup_constant(x, warmup=0.002): + if x < warmup: + return x/warmup + return 1.0 + +def warmup_linear(x, warmup=0.002): + if x < warmup: + return x/warmup + return 1.0 - x + +SCHEDULES = { + 'warmup_cosine':warmup_cosine, + 'warmup_constant':warmup_constant, + 'warmup_linear':warmup_linear, +} + + +class BertAdam(Optimizer): + """Implements BERT version of Adam algorithm with weight decay fix. + Params: + lr: learning rate + warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 + t_total: total number of training steps for the learning + rate schedule, -1 means constant learning rate. Default: -1 + schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' + b1: Adams b1. Default: 0.9 + b2: Adams b2. Default: 0.999 + e: Adams epsilon. Default: 1e-6 + weight_decay_rate: Weight decay. Default: 0.01 + max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 + """ + def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', + b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01, + max_grad_norm=1.0): + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if schedule not in SCHEDULES: + raise ValueError("Invalid schedule parameter: {}".format(schedule)) + if not 0.0 <= warmup < 1.0 and not warmup == -1: + raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + if not 0.0 <= b1 < 1.0: + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) + if not 0.0 <= b2 < 1.0: + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) + if not e >= 0.0: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, + b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate, + max_grad_norm=max_grad_norm) + super(BertAdam, self).__init__(params, defaults) + + def get_lr(self): + lr = [] + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + if len(state) == 0: + return [0] + if group['t_total'] != -1: + schedule_fct = SCHEDULES[group['schedule']] + lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) + else: + lr_scheduled = group['lr'] + lr.append(lr_scheduled) + return lr + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['next_m'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['next_v'] = torch.zeros_like(p.data) + + next_m, next_v = state['next_m'], state['next_v'] + beta1, beta2 = group['b1'], group['b2'] + + # Add grad clipping + if group['max_grad_norm'] > 0: + clip_grad_norm_(p, group['max_grad_norm']) + + # Decay the first and second moment running average coefficient + # In-place operations to update the averages at the same time + next_m.mul_(beta1).add_(1 - beta1, grad) + next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) + update = next_m / (next_v.sqrt() + group['e']) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want to decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. + if group['weight_decay_rate'] > 0.0: + update += group['weight_decay_rate'] * p.data + + if group['t_total'] != -1: + schedule_fct = SCHEDULES[group['schedule']] + lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) + else: + lr_scheduled = group['lr'] + + update_with_lr = lr_scheduled * update + p.data.add_(-update_with_lr) + + state['step'] += 1 + + # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 + # No bias correction + # bias_correction1 = 1 - beta1 ** state['step'] + # bias_correction2 = 1 - beta2 ** state['step'] + + return loss diff --git a/bertviz/pytorch_pretrained_bert/tokenization.py b/bertviz/pytorch_pretrained_bert/tokenization.py new file mode 100755 index 0000000..c7ef20d --- /dev/null +++ b/bertviz/pytorch_pretrained_bert/tokenization.py @@ -0,0 +1,346 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import unicodedata +import os +import logging + +from .file_utils import cached_path + +logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.INFO) +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", +} +VOCAB_NAME = 'vocab.txt' + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, "r", encoding="utf-8") as reader: + while True: + token = reader.readline() + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a peice of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertTokenizer(object): + """Runs end-to-end tokenization: punctuation splitting + wordpiece""" + def __init__(self, vocab_file, do_lower_case=True): + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()]) + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """Converts a sequence of tokens into ids using the vocab.""" + ids = [] + for token in tokens: + ids.append(self.vocab[token]) + return ids + + def convert_ids_to_tokens(self, ids): + """Converts a sequence of ids in wordpiece tokens using the vocab.""" + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + @classmethod + def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name] + else: + vocab_file = pretrained_model_name + if os.path.isdir(vocab_file): + vocab_file = os.path.join(vocab_file, VOCAB_NAME) + # redirect to the cache, if necessary + try: + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + except FileNotFoundError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + vocab_file)) + return None + if resolved_vocab_file == vocab_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + else: + logger.info("loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file)) + # Instantiate tokenizer. + tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) + return tokenizer + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/bertviz/tests/fixtures/config.json b/bertviz/tests/fixtures/config.json new file mode 100644 index 0000000..ee8abf4 --- /dev/null +++ b/bertviz/tests/fixtures/config.json @@ -0,0 +1,12 @@ +{ + "num_hidden_layers": 12, + "vocab_size": 18, + "hidden_size": 12, + "max_position_embeddings": 64, + "type_vocab_size": 2, + "hidden_dropout_prob": 0.1, + "num_attention_heads": 3, + "attention_probs_dropout_prob": 0.1, + "intermediate_size": 6 + +} \ No newline at end of file diff --git a/bertviz/tests/fixtures/vocab.txt b/bertviz/tests/fixtures/vocab.txt new file mode 100644 index 0000000..8c18a83 --- /dev/null +++ b/bertviz/tests/fixtures/vocab.txt @@ -0,0 +1,18 @@ +[PAD] +[UNK] +the +quick +##est +brown +fox +##iest +jumped +over +##zie +##st +dog +. +lazy +la +[SEP] +[CLS] \ No newline at end of file diff --git a/bertviz/tests/test_attention.py b/bertviz/tests/test_attention.py new file mode 100644 index 0000000..c61da4b --- /dev/null +++ b/bertviz/tests/test_attention.py @@ -0,0 +1,41 @@ +from bertviz.visualization import AttentionVisualizer +from bertviz.attention import _get_attentions +from bertviz.pytorch_pretrained_bert import BertTokenizer, BertModel, BertConfig +import numpy as np +import unittest + + +class TestAttention(unittest.TestCase): + + def setUp(self): + self.config = BertConfig.from_json_file('fixtures/config.json') + model = BertModel(self.config) + tokenizer = BertTokenizer('fixtures/vocab.txt') + self.attention_visualizer = AttentionVisualizer(model, tokenizer) + + def test_get_attentions(self): + sentence1 = 'The quickest brown fox jumped over the lazy dog' + sentence2 = "the quick brown fox jumped over the laziest lazy elmo" + tokens_a, tokens_b, attn = self.attention_visualizer.get_viz_data(sentence1, sentence2) + attentions = _get_attentions(tokens_a, tokens_b, attn) + attn_squeezed = np.squeeze(attn) + expected_all_attention = attn_squeezed.tolist() + self.assertEqual(attentions['all']['att'], expected_all_attention) + attn_a = np.array(attentions['a']['att']) + attn_b = np.array(attentions['b']['att']) + attn_ab = np.array(attentions['ab']['att']) + attn_ba = np.array(attentions['ba']['att']) + expected_top_half = attn_squeezed[:, :, :len(tokens_a), :] + top_half = np.concatenate((attn_a, attn_ab), axis=-1) + self.assertEqual(top_half.shape, expected_top_half.shape) + self.assertTrue(np.array_equal(top_half, expected_top_half)) + expected_bottom_half = attn_squeezed[:, :, len(tokens_a):, :] + bottom_half = np.concatenate((attn_b, attn_ba), axis=-1) + self.assertEqual(bottom_half.shape, expected_bottom_half.shape) + all = np.concatenate((top_half, bottom_half), axis=-2) + self.assertEqual(all.shape, attn_squeezed.shape) + self.assertTrue(np.allclose(all, attn_squeezed, atol=1e-06)) + +if __name__ == "__main__": + unittest.main() + diff --git a/bertviz/tests/test_visualization.py b/bertviz/tests/test_visualization.py new file mode 100644 index 0000000..bb5cedb --- /dev/null +++ b/bertviz/tests/test_visualization.py @@ -0,0 +1,40 @@ +from bertviz.visualization import AttentionVisualizer +from bertviz.pytorch_pretrained_bert import BertTokenizer, BertModel, BertConfig +import unittest + + +class TestVisualization(unittest.TestCase): + + def setUp(self): + self.config = BertConfig.from_json_file('fixtures/config.json') + model = BertModel(self.config) + tokenizer = BertTokenizer('fixtures/vocab.txt') + self.attention_visualizer = AttentionVisualizer(model, tokenizer) + + def test_get_inputs(self): + sentence1 = 'The quickest brown fox jumped over the lazy dog' + tokens_ids1 = [2, 3, 4, 5, 6, 8, 9, 2, 14, 12] + sentence2 = "the quick brown fox jumped over the laziest lazy elmo" + token_ids2 = [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1] + tokens_tensor, token_type_tensor, tokens_a, tokens_b = self.attention_visualizer._get_inputs(sentence1, sentence2) + cls_id = 17 + sep_id = 16 + self.assertEqual(tokens_tensor.tolist()[0], + [cls_id] + tokens_ids1 + [sep_id] + token_ids2 + [sep_id]) + self.assertEqual(token_type_tensor.tolist()[0], + ([0] * 12) + ([1] * 13)) + + def test_get_viz_data(self): + sentence1 = 'The quickest brown fox jumped over the lazy dog' + tokens1 = ['the', 'quick', '##est', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog'] + sentence2 = "the quick brown fox jumped over the laziest lazy elmo" + tokens2 = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'la', '##zie', '##st', 'lazy', '[UNK]'] + tokens_a, tokens_b, attn = self.attention_visualizer.get_viz_data(sentence1, sentence2) + self.assertEqual(tokens_a, ['[CLS]'] + tokens1 + ['[SEP]']) + self.assertEqual(tokens_b, tokens2 + ['[SEP]']) + expected_attn_shape = (self.config.num_hidden_layers, 1, self.config.num_attention_heads, len(tokens_a) + len(tokens_b), len(tokens_a) + len(tokens_b)) + self.assertEqual(attn.shape, expected_attn_shape) + +if __name__ == "__main__": + unittest.main() + diff --git a/bertviz/visualization.py b/bertviz/visualization.py new file mode 100644 index 0000000..77f4b7f --- /dev/null +++ b/bertviz/visualization.py @@ -0,0 +1,51 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Changes made by Jesse Vig on 12/12/18: +# - Adapted to BERT model +# + + +import torch + +class AttentionVisualizer: + + def __init__(self, model, tokenizer): + self.model = model + self.tokenizer = tokenizer + self.model.eval() + + def get_viz_data(self, sentence_a, sentence_b): + tokens_tensor, token_type_tensor, tokens_a, tokens_b = self._get_inputs(sentence_a, sentence_b) + attn = self._get_attention(tokens_tensor, token_type_tensor) + return tokens_a, tokens_b, attn + + def _get_inputs(self, sentence_a, sentence_b): + tokens_a = self.tokenizer.tokenize(sentence_a) + tokens_b = self.tokenizer.tokenize(sentence_b) + tokens_a_delim = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b_delim = tokens_b + ['[SEP]'] + token_ids = self.tokenizer.convert_tokens_to_ids(tokens_a_delim + tokens_b_delim) + tokens_tensor = torch.tensor([token_ids]) + token_type_tensor = torch.LongTensor([[0] * len(tokens_a_delim) + [1] * len(tokens_b_delim)]) + return tokens_tensor, token_type_tensor, tokens_a_delim, tokens_b_delim + + def _get_attention(self, tokens_tensor, token_type_tensor): + _, _, attention_tensor = self.model(tokens_tensor, token_type_ids=token_type_tensor) + return attention_tensor.data.numpy() + + + +