diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d5c1656
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,29 @@
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+.idea
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README.md b/README.md
index e69de29..85a4efa 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,18 @@
+# BertViz
+
+Tool for visualizing attention in the BERT model, broken down by layer and attention head.
+
+
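+## Usage
+
+A minimal usage sketch, adapted from the included `bertviz.ipynb` notebook (it assumes a Jupyter environment so the D3/requirejs visualization can render):
+
+```python
+from bertviz import attention, visualization
+from bertviz.pytorch_pretrained_bert import BertModel, BertTokenizer
+
+bert_version = 'bert-base-uncased'
+model = BertModel.from_pretrained(bert_version)
+tokenizer = BertTokenizer.from_pretrained(bert_version)
+
+sentence_a = "I went to the store."
+sentence_b = "At the store, I bought fresh strawberries."
+
+# Extract per-layer, per-head attention for the sentence pair and render the interactive view
+attention_visualizer = visualization.AttentionVisualizer(model, tokenizer)
+tokens_a, tokens_b, attn = attention_visualizer.get_viz_data(sentence_a, sentence_b)
+attention.show(tokens_a, tokens_b, attn)
+```
+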
+## Authors
+
+* **Jesse Vig** - *Initial work* - [jessevig](https://github.com/jessevig)
+
+## License
+
+This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.
+
+## Acknowledgments
+
+This project incorporates code from the following repos:
+* https://github.com/tensorflow/tensor2tensor
+* https://github.com/huggingface/pytorch-pretrained-BERT
diff --git a/bertviz/__init__.py b/bertviz/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/bertviz/attention.js b/bertviz/attention.js
new file mode 100644
index 0000000..676ff00
--- /dev/null
+++ b/bertviz/attention.js
@@ -0,0 +1,363 @@
+/**
+ * @fileoverview Transformer Visualization D3 javascript code.
+ */
+
+requirejs(['jquery', 'd3'],
+function($, d3) {
+
+var attention = window.attention;
+
+const TEXT_SIZE = 15;
+const BOXWIDTH = TEXT_SIZE * 8;
+const BOXHEIGHT = TEXT_SIZE * 1.5;
+const WIDTH = 2000;
+const HEIGHT = attention.all.bot_text.length * BOXHEIGHT * 2 + 100;
+const MATRIX_WIDTH = 150;
+const head_colours = d3.scale.category10();
+const CHECKBOX_SIZE = 20;
+
+function lighten(colour) {
+ var c = d3.hsl(colour);
+ var increment = (1 - c.l) * 0.6;
+ c.l += increment;
+ c.s -= increment;
+ return c;
+}
+
+function transpose(mat) {
+ return mat[0].map(function(col, i) {
+ return mat.map(function(row) {
+ return row[i];
+ });
+ });
+}
+
+function zip(a, b) {
+ return a.map(function (e, i) {
+ return [e, b[i]];
+ });
+}
+
+
+function renderVis(id, top_text, bot_text, attention_heads, config) {
+ $(id).empty();
+ var svg = d3.select(id)
+ .append('svg')
+ .attr("width", WIDTH)
+ .attr("height", HEIGHT);
+
+ var att_data = [];
+ for (var i=0; i < attention_heads.length; i++) {
+ var att_trans = transpose(attention_heads[i]);
+ att_data.push(zip(attention_heads[i], att_trans));
+ }
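+ // For each head, att_data pairs row j of the attention matrix with row j of its transpose (i.e. column j),
+ // so both directions can be indexed per token in the mouseover handlers below (the d[0]/d[1] selection).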
+
+ renderText(svg, top_text, true, att_data, 0);
+ renderText(svg, bot_text, false, att_data, MATRIX_WIDTH + BOXWIDTH);
+
+ renderAttentionHighlights(svg, att_data);
+
+ svg.append("g").classed("attention_heads", true);
+
+ renderAttention(svg, attention_heads);
+
+ draw_checkboxes(config, 0, svg, attention_heads);
+}
+
+
+function renderText(svg, text, is_top, att_data, left_pos) {
+ var id = is_top ? "top" : "bottom";
+ var textContainer = svg.append("svg:g")
+ .attr("id", id);
+
+ textContainer.append("g").classed("attention_boxes", true)
+ .selectAll("g")
+ .data(att_data)
+ .enter()
+ .append("g")
+ .selectAll("rect")
+ .data(function(d) {return d;})
+ .enter()
+ .append("rect")
+ .attr("x", function(d, i, j) {
+ return left_pos + box_offset(j);
+ })
+ .attr("y", function(d, i) {
+ return (i+1) * BOXHEIGHT;
+ })
+ .attr("width", BOXWIDTH/active_heads())
+ .attr("height", function() { return BOXHEIGHT; })
+ .attr("fill", function(d, i, j) {
+ return head_colours(j);
+ })
+ .style("opacity", 0.0);
+
+
+ var tokenContainer = textContainer.append("g").selectAll("g")
+ .data(text)
+ .enter()
+ .append("g");
+
+ tokenContainer.append("rect")
+ .classed("background", true)
+ .style("opacity", 0.0)
+ .attr("fill", "lightgray")
+ .attr("x", left_pos)
+ .attr("y", function(d, i) {
+ return (i+1) * BOXHEIGHT;
+ })
+ .attr("width", BOXWIDTH)
+ .attr("height", BOXHEIGHT);
+
+ var theText = tokenContainer.append("text")
+ .text(function(d) { return d; })
+ .attr("font-size", TEXT_SIZE + "px")
+ .style("cursor", "default")
+ .style("-webkit-user-select", "none")
+ .attr("x", left_pos)
+ .attr("y", function(d, i) {
+ return (i+1) * BOXHEIGHT;
+ });
+
+ if (is_top) {
+ theText.style("text-anchor", "end")
+ .attr("dx", BOXWIDTH - TEXT_SIZE)
+ .attr("dy", TEXT_SIZE);
+ } else {
+ theText.style("text-anchor", "start")
+ .attr("dx", + TEXT_SIZE)
+ .attr("dy", TEXT_SIZE);
+ }
+
+ tokenContainer.on("mouseover", function(d, index) {
+ textContainer.selectAll(".background")
+ .style("opacity", function(d, i) {
+ return i == index ? 1.0 : 0.0;
+ });
+
+ svg.selectAll(".attention_heads").style("display", "none");
+
+ svg.selectAll(".line_heads") // To get the nesting to work.
+ .selectAll(".att_lines")
+ .attr("stroke-opacity", function(d) {
+ return 1.0;
+ })
+ .attr("y1", function(d, i) {
+ if (is_top) {
+ return (index+1) * BOXHEIGHT + (BOXHEIGHT/2);
+ } else {
+ return (i+1) * BOXHEIGHT + (BOXHEIGHT/2);
+ }
+ })
+ .attr("x1", BOXWIDTH)
+ .attr("y2", function(d, i) {
+ if (is_top) {
+ return (i+1) * BOXHEIGHT + (BOXHEIGHT/2);
+ } else {
+ return (index+1) * BOXHEIGHT + (BOXHEIGHT/2);
+ }
+ })
+ .attr("x2", BOXWIDTH + MATRIX_WIDTH)
+ .attr("stroke-width", 2)
+ .attr("stroke", function(d, i, j) {
+ return head_colours(j);
+ })
+ .attr("stroke-opacity", function(d, i, j) {
+ if (is_top) {d = d[0];} else {d = d[1];}
+ if (config.head_vis[j]) {
+ if (d) {
+ return d[index];
+ } else {
+ return 0.0;
+ }
+ } else {
+ return 0.0;
+ }
+ });
+
+
+ function updateAttentionBoxes() {
+ var id = is_top ? "bottom" : "top";
+ var the_left_pos = is_top ? MATRIX_WIDTH + BOXWIDTH : 0;
+ svg.select("#" + id)
+ .selectAll(".attention_boxes")
+ .selectAll("g")
+ .selectAll("rect")
+ .attr("x", function(d, i, j) { return the_left_pos + box_offset(j); })
+ .attr("y", function(d, i) { return (i+1) * BOXHEIGHT; })
+ .attr("width", BOXWIDTH/active_heads())
+ .attr("height", function() { return BOXHEIGHT; })
+ .style("opacity", function(d, i, j) {
+ if (is_top) {d = d[0];} else {d = d[1];}
+ if (config.head_vis[j])
+ if (d) {
+ return d[index];
+ } else {
+ return 0.0;
+ }
+ else
+ return 0.0;
+
+ });
+ }
+
+ updateAttentionBoxes();
+ });
+
+ textContainer.on("mouseleave", function() {
+ d3.select(this).selectAll(".background")
+ .style("opacity", 0.0);
+
+ svg.selectAll(".att_lines").attr("stroke-opacity", 0.0);
+ svg.selectAll(".attention_heads").style("display", "inline");
+ svg.selectAll(".attention_boxes")
+ .selectAll("g")
+ .selectAll("rect")
+ .style("opacity", 0.0);
+ });
+}
+
+function renderAttentionHighlights(svg, attention) {
+ var line_container = svg.append("g");
+ line_container.selectAll("g")
+ .data(attention)
+ .enter()
+ .append("g")
+ .classed("line_heads", true)
+ .selectAll("line")
+ .data(function(d){return d;})
+ .enter()
+ .append("line").classed("att_lines", true);
+}
+
+function renderAttention(svg, attention_heads) {
+ var line_container = svg.selectAll(".attention_heads");
+ line_container.html(null);
+ for(var h=0; h").val(i).text(i));
+}
+
+$("#layer").on('change', function(e) {
+ config.layer = +e.currentTarget.value;
+ render();
+});
+
+$("#att_type").on('change', function(e) {
+ config.att_type = e.currentTarget.value;
+ render();
+});
+
+$("button").on('click', visualize);
+
+visualize();
+
+});
\ No newline at end of file
diff --git a/bertviz/attention.py b/bertviz/attention.py
new file mode 100644
index 0000000..6b0ebcf
--- /dev/null
+++ b/bertviz/attention.py
@@ -0,0 +1,137 @@
+# coding=utf-8
+# Copyright 2018 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Changes made by Jesse Vig on 12/12/18
+# - Adapted to BERT model
+
+
+"""Module for postprocessing and displaying transformer attentions.
+
+This module is designed to be called from an ipython notebook.
+"""
+
+import json
+import os
+
+import IPython.display as display
+
+import numpy as np
+
+vis_html = """
+
+ Layer:
+ Attention:
+
+
+"""
+
+__location__ = os.path.realpath(
+ os.path.join(os.getcwd(), os.path.dirname(__file__)))
+vis_js = open(os.path.join(__location__, 'attention.js')).read()
+
+
+def show(tokens_a, tokens_b, attn):
+ """Displays attention visualization"""
+ attentions = _get_attentions(tokens_a, tokens_b, attn)
+ att_json = json.dumps(attentions)
+ _show_attention(att_json)
+
+
+def _show_attention(att_json):
+ display.display(display.HTML(vis_html))
+ display.display(display.Javascript('window.attention = %s' % att_json))
+ display.display(display.Javascript(vis_js))
+
+
+def _get_attentions(tokens_a, tokens_b, attn):
+ """Compute representation of the attention ready for the d3 visualization.
+
+ Args:
+ tokens_a: list of strings, words to be displayed on the left of the vis
+ tokens_b: list of strings, words to be displayed on the right of the vis
+ attn: numpy array, attention
+ [num_layers, batch_size, num_heads, seq_len, seq_len]
+
+
+ Returns:
+ Dictionary of attention representations with the structure:
+ {
+ 'all': Representations for showing all attentions at the same time.
+ 'a': Sentence A self-attention
+ 'b': Sentence B self-attention
+ 'ab': Sentence A -> Sentence B attention
+ 'ba': Sentence B -> Sentence A attention
+ }
+ and each sub-dictionary has structure:
+ {
+ 'att': list of attention matrices, one per attention head
+ 'top_text': list of strings, words to be displayed on the left of the vis
+ 'bot_text': list of strings, words to be displayed on the right of the vis
+ }
+ """
+ def format_mat(mat):
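+ # Swap the last two axes of the [num_heads, seq_len, seq_len] array and convert to nested lists for JSON serialization.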
+ return mat.transpose(0, 2, 1).tolist()
+
+
+ all_attns = []
+ a_attns = []
+ b_attns = []
+ ab_attns = []
+ ba_attns = []
+ slice_a = slice(0, len(tokens_a))
+ slice_b = slice(len(tokens_a), len(tokens_a) + len(tokens_b))
+ num_layers = len(attn)
+ for layer in range(num_layers):
+ layer_attn = attn[layer][0]
+ all_attns.append(format_mat(layer_attn))
+ a_attns.append(format_mat(layer_attn[:, slice_a, slice_a]))
+ b_attns.append(format_mat(layer_attn[:, slice_b, slice_b]))
+ ab_attns.append(format_mat(layer_attn[:, slice_a, slice_b]))
+ ba_attns.append(format_mat(layer_attn[:, slice_b, slice_a]))
+
+ attentions = {
+ 'all': {
+ 'att': all_attns,
+ 'top_text': tokens_a + tokens_b,
+ 'bot_text': tokens_a + tokens_b
+ },
+ 'a': {
+ 'att': a_attns,
+ 'top_text': tokens_a,
+ 'bot_text': tokens_a
+ },
+ 'b': {
+ 'att': b_attns,
+ 'top_text': tokens_b,
+ 'bot_text': tokens_b
+ },
+ 'ab': {
+ 'att': ab_attns,
+ 'top_text': tokens_a,
+ 'bot_text': tokens_b
+ },
+ 'ba': {
+ 'att': ba_attns,
+ 'top_text': tokens_b,
+ 'bot_text': tokens_a
+ }
+ }
+ return attentions
diff --git a/bertviz/bertviz.ipynb b/bertviz/bertviz.ipynb
new file mode 100644
index 0000000..6de58c1
--- /dev/null
+++ b/bertviz/bertviz.ipynb
@@ -0,0 +1,76 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "from bertviz import attention, visualization\n",
+ "import numpy as np\n",
+ "from pytorch_pretrained_bert import BertModel, BertTokenizer, BertConfig"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "%%javascript\n",
+ "require.config({\n",
+ " paths: {\n",
+ " d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min'\n",
+ " }\n",
+ "});\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# config = BertConfig.from_json_file('tests/fixtures/config.json')\n",
+ "# model = BertModel(config)\n",
+ "# tokenizer = BertTokenizer('tests/fixtures/vocab.txt')\n",
+ "# attention_visualizer = visualization.AttentionVisualizer(model, tokenizer)\n",
+ "# sentence1 = 'The quickest brown fox'\n",
+ "# sentence2 = \"the quick brown fox jumped over the laziest lazy elmo\"\n",
+ "# tokens_a, tokens_b, attn = attention_visualizer.get_viz_data(sentence1, sentence2)\n",
+ "# attention.show(tokens_a, tokens_b, attn)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bert_version = 'bert-base-uncased'\n",
+ "model = BertModel.from_pretrained(bert_version)\n",
+ "tokenizer = BertTokenizer.from_pretrained(bert_version)\n",
+ "# sentence_a = 'The quickest brown fox jumped over the lazy dog'\n",
+ "# sentence_b = \"the quick brown fox jumped over the laziest lazy elmo\"\n",
+ "sentence_a = \"I went to the store.\"\n",
+ "sentence_b = \"At the store, I bought fresh strawberries.\"\n",
+ "attention_visualizer = visualization.AttentionVisualizer(model, tokenizer)\n",
+ "tokens_a, tokens_b, attn = attention_visualizer.get_viz_data(sentence_a, sentence_b)\n",
+ "attention.show(tokens_a, tokens_b, attn)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/bertviz/debug.py b/bertviz/debug.py
new file mode 100644
index 0000000..c2e8970
--- /dev/null
+++ b/bertviz/debug.py
@@ -0,0 +1,15 @@
+from bertviz.visualization import AttentionVisualizer
+from bertviz.attention import show
+from bertviz.pytorch_pretrained_bert import BertTokenizer, BertModel, BertConfig
+
+
+config = BertConfig.from_json_file('tests/fixtures/config.json')
+model = BertModel(config)
+tokenizer = BertTokenizer('tests/fixtures/vocab.txt')
+attention_visualizer = AttentionVisualizer(model, tokenizer)
+sentence1 = 'The quickest brown fox jumped over the lazy dog'
+sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
+tokens_a, tokens_b, attn = attention_visualizer.get_viz_data(sentence1, sentence2)
+show(tokens_a, tokens_b, attn)
\ No newline at end of file
diff --git a/bertviz/pytorch_pretrained_bert/__init__.py b/bertviz/pytorch_pretrained_bert/__init__.py
new file mode 100755
index 0000000..fc9b15a
--- /dev/null
+++ b/bertviz/pytorch_pretrained_bert/__init__.py
@@ -0,0 +1,7 @@
+from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
+from .modeling import (BertConfig, BertModel, BertForPreTraining,
+ BertForMaskedLM, BertForNextSentencePrediction,
+ BertForSequenceClassification, BertForTokenClassification,
+ BertForQuestionAnswering)
+from .optimization import BertAdam
+from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
diff --git a/bertviz/pytorch_pretrained_bert/__main__.py b/bertviz/pytorch_pretrained_bert/__main__.py
new file mode 100755
index 0000000..73f1909
--- /dev/null
+++ b/bertviz/pytorch_pretrained_bert/__main__.py
@@ -0,0 +1,19 @@
+# coding: utf8
+if __name__ == '__main__':
+ import sys
+ try:
+ from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
+ except ModuleNotFoundError:
+ print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+ "In that case, it requires TensorFlow to be installed. Please see "
+ "https://www.tensorflow.org/install/ for installation instructions.")
+ raise
+
+ if len(sys.argv) != 5:
+ # pylint: disable=line-too-long
+ print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+ else:
+ PYTORCH_DUMP_OUTPUT = sys.argv.pop()
+ TF_CONFIG = sys.argv.pop()
+ TF_CHECKPOINT = sys.argv.pop()
+ convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
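+
+# Example invocation (placeholder paths):
+# python -m bertviz.pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch \
+# bert_model.ckpt bert_config.json pytorch_model.bin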
diff --git a/bertviz/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py b/bertviz/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
new file mode 100755
index 0000000..20fdd8c
--- /dev/null
+++ b/bertviz/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
@@ -0,0 +1,112 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert BERT checkpoint."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import re
+import argparse
+import tensorflow as tf
+import torch
+import numpy as np
+
+from .modeling import BertConfig, BertForPreTraining
+
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
+ config_path = os.path.abspath(bert_config_file)
+ tf_path = os.path.abspath(tf_checkpoint_path)
+ print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path))
+ # Load weights from TF model
+ init_vars = tf.train.list_variables(tf_path)
+ names = []
+ arrays = []
+ for name, shape in init_vars:
+ print("Loading TF weight {} with shape {}".format(name, shape))
+ array = tf.train.load_variable(tf_path, name)
+ names.append(name)
+ arrays.append(array)
+
+ # Initialise PyTorch model
+ config = BertConfig.from_json_file(bert_config_file)
+ print("Building PyTorch model from configuration: {}".format(str(config)))
+ model = BertForPreTraining(config)
+
+ for name, array in zip(names, arrays):
+ name = name.split('/')
+ # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
+ # which are not required for using the pretrained model
+ if name[-1] in ["adam_v", "adam_m"]:
+ print("Skipping {}".format("/".join(name)))
+ continue
+ pointer = model
+ for m_name in name:
+ if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+ l = re.split(r'_(\d+)', m_name)
+ else:
+ l = [m_name]
+ if l[0] == 'kernel':
+ pointer = getattr(pointer, 'weight')
+ elif l[0] == 'output_bias':
+ pointer = getattr(pointer, 'bias')
+ elif l[0] == 'output_weights':
+ pointer = getattr(pointer, 'weight')
+ else:
+ pointer = getattr(pointer, l[0])
+ if len(l) >= 2:
+ num = int(l[1])
+ pointer = pointer[num]
+ if m_name[-11:] == '_embeddings':
+ pointer = getattr(pointer, 'weight')
+ elif m_name == 'kernel':
+ array = np.transpose(array)
+ try:
+ assert pointer.shape == array.shape
+ except AssertionError as e:
+ e.args += (pointer.shape, array.shape)
+ raise
+ print("Initialize PyTorch weight {}".format(name))
+ pointer.data = torch.from_numpy(array)
+
+ # Save pytorch-model
+ print("Save PyTorch model to {}".format(pytorch_dump_path))
+ torch.save(model.state_dict(), pytorch_dump_path)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ ## Required parameters
+ parser.add_argument("--tf_checkpoint_path",
+ default = None,
+ type = str,
+ required = True,
+ help = "Path the TensorFlow checkpoint path.")
+ parser.add_argument("--bert_config_file",
+ default = None,
+ type = str,
+ required = True,
+ help = "The config json file corresponding to the pre-trained BERT model. \n"
+ "This specifies the model architecture.")
+ parser.add_argument("--pytorch_dump_path",
+ default = None,
+ type = str,
+ required = True,
+ help = "Path to the output PyTorch model.")
+ args = parser.parse_args()
+ convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
+ args.bert_config_file,
+ args.pytorch_dump_path)
diff --git a/bertviz/pytorch_pretrained_bert/file_utils.py b/bertviz/pytorch_pretrained_bert/file_utils.py
new file mode 100755
index 0000000..f734b7e
--- /dev/null
+++ b/bertviz/pytorch_pretrained_bert/file_utils.py
@@ -0,0 +1,233 @@
+"""
+Utilities for working with the local dataset cache.
+This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
+Copyright by the AllenNLP authors.
+"""
+
+import os
+import logging
+import shutil
+import tempfile
+import json
+from urllib.parse import urlparse
+from pathlib import Path
+from typing import Optional, Tuple, Union, IO, Callable, Set
+from hashlib import sha256
+from functools import wraps
+
+from tqdm import tqdm
+
+import boto3
+from botocore.exceptions import ClientError
+import requests
+
+logger = logging.getLogger(__name__) # pylint: disable=invalid-name
+
+PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
+ Path.home() / '.pytorch_pretrained_bert'))
+
+
+def url_to_filename(url: str, etag: str = None) -> str:
+ """
+ Convert `url` into a hashed filename in a repeatable way.
+ If `etag` is specified, append its hash to the url's, delimited
+ by a period.
+ """
+ url_bytes = url.encode('utf-8')
+ url_hash = sha256(url_bytes)
+ filename = url_hash.hexdigest()
+
+ if etag:
+ etag_bytes = etag.encode('utf-8')
+ etag_hash = sha256(etag_bytes)
+ filename += '.' + etag_hash.hexdigest()
+
+ return filename
+
+
+def filename_to_url(filename: str, cache_dir: str = None) -> Tuple[str, str]:
+ """
+ Return the url and etag (which may be ``None``) stored for `filename`.
+ Raise ``FileNotFoundError`` if `filename` or its stored metadata do not exist.
+ """
+ if cache_dir is None:
+ cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+
+ cache_path = os.path.join(cache_dir, filename)
+ if not os.path.exists(cache_path):
+ raise FileNotFoundError("file {} not found".format(cache_path))
+
+ meta_path = cache_path + '.json'
+ if not os.path.exists(meta_path):
+ raise FileNotFoundError("file {} not found".format(meta_path))
+
+ with open(meta_path) as meta_file:
+ metadata = json.load(meta_file)
+ url = metadata['url']
+ etag = metadata['etag']
+
+ return url, etag
+
+
+def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str:
+ """
+ Given something that might be a URL (or might be a local path),
+ determine which. If it's a URL, download the file and cache it, and
+ return the path to the cached file. If it's already a local path,
+ make sure the file exists and then return the path.
+ """
+ if cache_dir is None:
+ cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+ if isinstance(url_or_filename, Path):
+ url_or_filename = str(url_or_filename)
+
+ parsed = urlparse(url_or_filename)
+
+ if parsed.scheme in ('http', 'https', 's3'):
+ # URL, so get it from the cache (downloading if necessary)
+ return get_from_cache(url_or_filename, cache_dir)
+ elif os.path.exists(url_or_filename):
+ # File, and it exists.
+ return url_or_filename
+ elif parsed.scheme == '':
+ # File, but it doesn't exist.
+ raise FileNotFoundError("file {} not found".format(url_or_filename))
+ else:
+ # Something unknown
+ raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
+
+
+def split_s3_path(url: str) -> Tuple[str, str]:
+ """Split a full s3 path into the bucket name and path."""
+ parsed = urlparse(url)
+ if not parsed.netloc or not parsed.path:
+ raise ValueError("bad s3 path {}".format(url))
+ bucket_name = parsed.netloc
+ s3_path = parsed.path
+ # Remove '/' at beginning of path.
+ if s3_path.startswith("/"):
+ s3_path = s3_path[1:]
+ return bucket_name, s3_path
+
+
+def s3_request(func: Callable):
+ """
+ Wrapper function for s3 requests in order to create more helpful error
+ messages.
+ """
+
+ @wraps(func)
+ def wrapper(url: str, *args, **kwargs):
+ try:
+ return func(url, *args, **kwargs)
+ except ClientError as exc:
+ if int(exc.response["Error"]["Code"]) == 404:
+ raise FileNotFoundError("file {} not found".format(url))
+ else:
+ raise
+
+ return wrapper
+
+
+@s3_request
+def s3_etag(url: str) -> Optional[str]:
+ """Check ETag on S3 object."""
+ s3_resource = boto3.resource("s3")
+ bucket_name, s3_path = split_s3_path(url)
+ s3_object = s3_resource.Object(bucket_name, s3_path)
+ return s3_object.e_tag
+
+
+@s3_request
+def s3_get(url: str, temp_file: IO) -> None:
+ """Pull a file directly from S3."""
+ s3_resource = boto3.resource("s3")
+ bucket_name, s3_path = split_s3_path(url)
+ s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
+
+
+def http_get(url: str, temp_file: IO) -> None:
+ req = requests.get(url, stream=True)
+ content_length = req.headers.get('Content-Length')
+ total = int(content_length) if content_length is not None else None
+ progress = tqdm(unit="B", total=total)
+ for chunk in req.iter_content(chunk_size=1024):
+ if chunk: # filter out keep-alive new chunks
+ progress.update(len(chunk))
+ temp_file.write(chunk)
+ progress.close()
+
+
+def get_from_cache(url: str, cache_dir: str = None) -> str:
+ """
+ Given a URL, look for the corresponding dataset in the local cache.
+ If it's not there, download it. Then return the path to the cached file.
+ """
+ if cache_dir is None:
+ cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+
+ os.makedirs(cache_dir, exist_ok=True)
+
+ # Get eTag to add to filename, if it exists.
+ if url.startswith("s3://"):
+ etag = s3_etag(url)
+ else:
+ response = requests.head(url, allow_redirects=True)
+ if response.status_code != 200:
+ raise IOError("HEAD request failed for url {} with status code {}"
+ .format(url, response.status_code))
+ etag = response.headers.get("ETag")
+
+ filename = url_to_filename(url, etag)
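+ # The cache key is sha256(url), with sha256(etag) appended when available, so a changed ETag triggers a fresh download.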
+
+ # get cache path to put the file
+ cache_path = os.path.join(cache_dir, filename)
+
+ if not os.path.exists(cache_path):
+ # Download to temporary file, then copy to cache dir once finished.
+ # Otherwise you get corrupt cache entries if the download gets interrupted.
+ with tempfile.NamedTemporaryFile() as temp_file:
+ logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
+
+ # GET file object
+ if url.startswith("s3://"):
+ s3_get(url, temp_file)
+ else:
+ http_get(url, temp_file)
+
+ # we are copying the file before closing it, so flush to avoid truncation
+ temp_file.flush()
+ # shutil.copyfileobj() starts at the current position, so go to the start
+ temp_file.seek(0)
+
+ logger.info("copying %s to cache at %s", temp_file.name, cache_path)
+ with open(cache_path, 'wb') as cache_file:
+ shutil.copyfileobj(temp_file, cache_file)
+
+ logger.info("creating metadata file for %s", cache_path)
+ meta = {'url': url, 'etag': etag}
+ meta_path = cache_path + '.json'
+ with open(meta_path, 'w') as meta_file:
+ json.dump(meta, meta_file)
+
+ logger.info("removing temp file %s", temp_file.name)
+
+ return cache_path
+
+
+def read_set_from_file(filename: str) -> Set[str]:
+ '''
+ Extract a de-duped collection (set) of text from a file.
+ Expected file format is one item per line.
+ '''
+ collection = set()
+ with open(filename, 'r') as file_:
+ for line in file_:
+ collection.add(line.rstrip())
+ return collection
+
+
+def get_file_extension(path: str, dot=True, lower: bool = True):
+ ext = os.path.splitext(path)[1]
+ ext = ext if dot else ext[1:]
+ return ext.lower() if lower else ext
diff --git a/bertviz/pytorch_pretrained_bert/modeling.py b/bertviz/pytorch_pretrained_bert/modeling.py
new file mode 100755
index 0000000..7a820ea
--- /dev/null
+++ b/bertviz/pytorch_pretrained_bert/modeling.py
@@ -0,0 +1,1054 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Changes made by Jesse Vig on 12/12/18:
+# - Return attention weights from BertModel
+#
+
+"""PyTorch BERT model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import copy
+import json
+import math
+import logging
+import tarfile
+import tempfile
+import shutil
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from .file_utils import cached_path
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+ datefmt='%m/%d/%Y %H:%M:%S',
+ level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+PRETRAINED_MODEL_ARCHIVE_MAP = {
+ 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
+ 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
+ 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
+ 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
+ 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
+ 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
+ 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
+}
+CONFIG_NAME = 'bert_config.json'
+WEIGHTS_NAME = 'pytorch_model.bin'
+
+
+def gelu(x):
+ """Implementation of the gelu activation function.
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
+ 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+ """
+ return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
+
+def swish(x):
+ return x * torch.sigmoid(x)
+
+
+ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
+
+
+class BertConfig(object):
+ """Configuration class to store the configuration of a `BertModel`.
+ """
+
+ def __init__(self,
+ vocab_size_or_config_json_file,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ max_position_embeddings=512,
+ type_vocab_size=2,
+ initializer_range=0.02):
+ """Constructs BertConfig.
+
+ Args:
+ vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
+ hidden_size: Size of the encoder layers and the pooler layer.
+ num_hidden_layers: Number of hidden layers in the Transformer encoder.
+ num_attention_heads: Number of attention heads for each attention layer in
+ the Transformer encoder.
+ intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+ layer in the Transformer encoder.
+ hidden_act: The non-linear activation function (function or string) in the
+ encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+ hidden_dropout_prob: The dropout probability for all fully connected
+ layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob: The dropout ratio for the attention
+ probabilities.
+ max_position_embeddings: The maximum sequence length that this model might
+ ever be used with. Typically set this to something large just in case
+ (e.g., 512 or 1024 or 2048).
+ type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+ `BertModel`.
+ initializer_range: The stddev of the truncated_normal_initializer for
+ initializing all weight matrices.
+ """
+ if isinstance(vocab_size_or_config_json_file, str):
+ with open(vocab_size_or_config_json_file, "r") as reader:
+ json_config = json.loads(reader.read())
+ for key, value in json_config.items():
+ self.__dict__[key] = value
+ elif isinstance(vocab_size_or_config_json_file, int):
+ self.vocab_size = vocab_size_or_config_json_file
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.hidden_act = hidden_act
+ self.intermediate_size = intermediate_size
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.max_position_embeddings = max_position_embeddings
+ self.type_vocab_size = type_vocab_size
+ self.initializer_range = initializer_range
+ else:
+ raise ValueError("First argument must be either a vocabulary size (int)"
+ "or the path to a pretrained model config file (str)")
+
+ @classmethod
+ def from_dict(cls, json_object):
+ """Constructs a `BertConfig` from a Python dictionary of parameters."""
+ config = BertConfig(vocab_size_or_config_json_file=-1)
+ for key, value in json_object.items():
+ config.__dict__[key] = value
+ return config
+
+ @classmethod
+ def from_json_file(cls, json_file):
+ """Constructs a `BertConfig` from a json file of parameters."""
+ with open(json_file, "r") as reader:
+ text = reader.read()
+ return cls.from_dict(json.loads(text))
+
+ def __repr__(self):
+ return str(self.to_json_string())
+
+ def to_dict(self):
+ """Serializes this instance to a Python dictionary."""
+ output = copy.deepcopy(self.__dict__)
+ return output
+
+ def to_json_string(self):
+ """Serializes this instance to a JSON string."""
+ return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+
+class BertLayerNorm(nn.Module):
+ def __init__(self, config, variance_epsilon=1e-12):
+ """Construct a layernorm module in the TF style (epsilon inside the square root).
+ """
+ super(BertLayerNorm, self).__init__()
+ self.gamma = nn.Parameter(torch.ones(config.hidden_size))
+ self.beta = nn.Parameter(torch.zeros(config.hidden_size))
+ self.variance_epsilon = variance_epsilon
+
+ def forward(self, x):
+ u = x.mean(-1, keepdim=True)
+ s = (x - u).pow(2).mean(-1, keepdim=True)
+ x = (x - u) / torch.sqrt(s + self.variance_epsilon)
+ return self.gamma * x + self.beta
+
+
+class BertEmbeddings(nn.Module):
+ """Construct the embeddings from word, position and token_type embeddings.
+ """
+
+ def __init__(self, config):
+ super(BertEmbeddings, self).__init__()
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+ # any TensorFlow checkpoint file
+ self.LayerNorm = BertLayerNorm(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, input_ids, token_type_ids=None):
+ seq_length = input_ids.size(1)
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
+ position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+ if token_type_ids is None:
+ token_type_ids = torch.zeros_like(input_ids)
+
+ words_embeddings = self.word_embeddings(input_ids)
+ position_embeddings = self.position_embeddings(position_ids)
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+ embeddings = words_embeddings + position_embeddings + token_type_embeddings
+ embeddings = self.LayerNorm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+class BertSelfAttention(nn.Module):
+ def __init__(self, config):
+ super(BertSelfAttention, self).__init__()
+ if config.hidden_size % config.num_attention_heads != 0:
+ raise ValueError(
+ "The hidden size (%d) is not a multiple of the number of attention "
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads))
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+ def transpose_for_scores(self, x):
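+ # Reshape [batch_size, seq_len, all_head_size] into [batch_size, num_attention_heads, seq_len, attention_head_size].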
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(*new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(self, hidden_states, attention_mask):
+ mixed_query_layer = self.query(hidden_states)
+ mixed_key_layer = self.key(hidden_states)
+ mixed_value_layer = self.value(hidden_states)
+
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+ key_layer = self.transpose_for_scores(mixed_key_layer)
+ value_layer = self.transpose_for_scores(mixed_value_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+ # Apply the attention mask (precomputed for all layers in the BertModel forward() function)
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ context_layer = torch.matmul(attention_probs, value_layer)
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(*new_context_layer_shape)
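+ # bertviz change: return the per-head attention probabilities alongside the context layer so they can be visualized.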
+ return context_layer, attention_probs
+
+
+class BertSelfOutput(nn.Module):
+ def __init__(self, config):
+ super(BertSelfOutput, self).__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.LayerNorm = BertLayerNorm(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class BertAttention(nn.Module):
+ def __init__(self, config):
+ super(BertAttention, self).__init__()
+ self.self = BertSelfAttention(config)
+ self.output = BertSelfOutput(config)
+
+ def forward(self, input_tensor, attention_mask):
+ self_output, attention_probs = self.self(input_tensor, attention_mask)
+ attention_output = self.output(self_output, input_tensor)
+ return attention_output, attention_probs
+
+
+class BertIntermediate(nn.Module):
+ def __init__(self, config):
+ super(BertIntermediate, self).__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ self.intermediate_act_fn = ACT2FN[config.hidden_act] \
+ if isinstance(config.hidden_act, str) else config.hidden_act
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class BertOutput(nn.Module):
+ def __init__(self, config):
+ super(BertOutput, self).__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.LayerNorm = BertLayerNorm(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class BertLayer(nn.Module):
+ def __init__(self, config):
+ super(BertLayer, self).__init__()
+ self.attention = BertAttention(config)
+ self.intermediate = BertIntermediate(config)
+ self.output = BertOutput(config)
+
+ def forward(self, hidden_states, attention_mask):
+ attention_output, attention_probs = self.attention(hidden_states, attention_mask)
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+ return layer_output, attention_probs
+
+
+class BertEncoder(nn.Module):
+ def __init__(self, config):
+ super(BertEncoder, self).__init__()
+ layer = BertLayer(config)
+ self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
+
+ def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
+ all_encoder_layers = []
+ attention_probs_list = []
+ for layer_module in self.layer:
+ hidden_states, attention_probs = layer_module(hidden_states, attention_mask)
+ attention_probs_list.append(attention_probs)
+ if output_all_encoded_layers:
+ all_encoder_layers.append(hidden_states)
+ if not output_all_encoded_layers:
+ all_encoder_layers.append(hidden_states)
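+ # Stack per-layer attention into a single tensor of shape [num_layers, batch_size, num_heads, seq_len, seq_len] (consumed by bertviz.attention).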
+ attention_tensor = torch.stack(attention_probs_list)
+ return all_encoder_layers, attention_tensor
+
+
+class BertPooler(nn.Module):
+ def __init__(self, config):
+ super(BertPooler, self).__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states):
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+class BertPredictionHeadTransform(nn.Module):
+ def __init__(self, config):
+ super(BertPredictionHeadTransform, self).__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.transform_act_fn = ACT2FN[config.hidden_act] \
+ if isinstance(config.hidden_act, str) else config.hidden_act
+ self.LayerNorm = BertLayerNorm(config)
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states)
+ return hidden_states
+
+
+class BertLMPredictionHead(nn.Module):
+ def __init__(self, config, bert_model_embedding_weights):
+ super(BertLMPredictionHead, self).__init__()
+ self.transform = BertPredictionHeadTransform(config)
+
+ # The output weights are the same as the input embeddings, but there is
+ # an output-only bias for each token.
+ self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
+ bert_model_embedding_weights.size(0),
+ bias=False)
+ self.decoder.weight = bert_model_embedding_weights
+ self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
+
+ def forward(self, hidden_states):
+ hidden_states = self.transform(hidden_states)
+ hidden_states = self.decoder(hidden_states) + self.bias
+ return hidden_states
+
+
+class BertOnlyMLMHead(nn.Module):
+ def __init__(self, config, bert_model_embedding_weights):
+ super(BertOnlyMLMHead, self).__init__()
+ self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+
+ def forward(self, sequence_output):
+ prediction_scores = self.predictions(sequence_output)
+ return prediction_scores
+
+
+class BertOnlyNSPHead(nn.Module):
+ def __init__(self, config):
+ super(BertOnlyNSPHead, self).__init__()
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+ def forward(self, pooled_output):
+ seq_relationship_score = self.seq_relationship(pooled_output)
+ return seq_relationship_score
+
+
+class BertPreTrainingHeads(nn.Module):
+ def __init__(self, config, bert_model_embedding_weights):
+ super(BertPreTrainingHeads, self).__init__()
+ self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+ def forward(self, sequence_output, pooled_output):
+ prediction_scores = self.predictions(sequence_output)
+ seq_relationship_score = self.seq_relationship(pooled_output)
+ return prediction_scores, seq_relationship_score
+
+
+class PreTrainedBertModel(nn.Module):
+ """ An abstract class to handle weights initialization and
+ a simple interface for downloading and loading pretrained models.
+ """
+
+ def __init__(self, config, *inputs, **kwargs):
+ super(PreTrainedBertModel, self).__init__()
+ if not isinstance(config, BertConfig):
+ raise ValueError(
+ "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
+ "To create a model from a Google pretrained model use "
+ "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
+ self.__class__.__name__, self.__class__.__name__
+ ))
+ self.config = config
+
+ def init_bert_weights(self, module):
+ """ Initialize the weights.
+ """
+ if isinstance(module, (nn.Linear, nn.Embedding)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ elif isinstance(module, BertLayerNorm):
+ module.beta.data.normal_(mean=0.0, std=self.config.initializer_range)
+ module.gamma.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
+ """
+ Instantiate a PreTrainedBertModel from a pre-trained model file.
+ Download and cache the pre-trained model file if needed.
+
+ Params:
+ pretrained_model_name: either:
+ - a str with the name of a pre-trained model to load selected in the list of:
+ . `bert-base-uncased`
+ . `bert-large-uncased`
+ . `bert-base-cased`
+ . `bert-base-multilingual-uncased`
+ . `bert-base-multilingual-cased`
+ . `bert-base-chinese`
+ - a path or url to a pretrained model archive containing:
+ . `bert_config.json` a configuration file for the model
+ . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
+ *inputs, **kwargs: additional input for the specific Bert class
+ (ex: num_labels for BertForSequenceClassification)
+ """
+ if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
+ archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
+ else:
+ archive_file = pretrained_model_name
+ # redirect to the cache, if necessary
+ try:
+ resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+ except FileNotFoundError:
+ logger.error(
+ "Model name '{}' was not found in model name list ({}). "
+ "We assumed '{}' was a path or url but couldn't find any file "
+ "associated to this path or url.".format(
+ pretrained_model_name,
+ ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
+ archive_file))
+ return None
+ if resolved_archive_file == archive_file:
+ logger.info("loading archive file {}".format(archive_file))
+ else:
+ logger.info("loading archive file {} from cache at {}".format(
+ archive_file, resolved_archive_file))
+ tempdir = None
+ if os.path.isdir(resolved_archive_file):
+ serialization_dir = resolved_archive_file
+ else:
+ # Extract archive to temp dir
+ tempdir = tempfile.mkdtemp()
+ logger.info("extracting archive file {} to temp dir {}".format(
+ resolved_archive_file, tempdir))
+ with tarfile.open(resolved_archive_file, 'r:gz') as archive:
+ archive.extractall(tempdir)
+ serialization_dir = tempdir
+ # Load config
+ config_file = os.path.join(serialization_dir, CONFIG_NAME)
+ config = BertConfig.from_json_file(config_file)
+ logger.info("Model config {}".format(config))
+ # Instantiate model.
+ model = cls(config, *inputs, **kwargs)
+ weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
+ state_dict = torch.load(weights_path)
+
+ missing_keys = []
+ unexpected_keys = []
+ error_msgs = []
+ # copy state_dict so _load_from_state_dict can modify it
+ metadata = getattr(state_dict, '_metadata', None)
+ state_dict = state_dict.copy()
+ if metadata is not None:
+ state_dict._metadata = metadata
+
+ def load(module, prefix=''):
+ local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+ module._load_from_state_dict(
+ state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+ for name, child in module._modules.items():
+ if child is not None:
+ load(child, prefix + name + '.')
+
+ load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
+ if len(missing_keys) > 0:
+ logger.info("Weights of {} not initialized from pretrained model: {}".format(
+ model.__class__.__name__, missing_keys))
+ if len(unexpected_keys) > 0:
+ logger.info("Weights from pretrained model not used in {}: {}".format(
+ model.__class__.__name__, unexpected_keys))
+ if tempdir:
+ # Clean up temp dir
+ shutil.rmtree(tempdir)
+ return model
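+
+# A minimal, illustrative sketch of loading pretrained weights via `from_pretrained`
+# (the model name is one of the keys documented above; extra keyword arguments such
+# as `num_labels` are forwarded to the subclass constructor):
+#
+#   model = BertModel.from_pretrained('bert-base-uncased')
+#   classifier = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)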
+
+
+class BertModel(PreTrainedBertModel):
+ """BERT model ("Bidirectional Embedding Representations from a Transformer").
+
+ Params:
+ config: a BertConfig class instance with the configuration to build a new model
+
+ Inputs:
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+        with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
+ `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+ types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+ a `sentence B` token (see BERT paper for more details).
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+ selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+ input sequence length in the current batch. It's the mask that we typically use for attention when
+ a batch has varying length sentences.
+ `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
+
+    Outputs: Tuple of (encoded_layers, pooled_output, attention_tensor)
+        `encoded_layers`: controlled by `output_all_encoded_layers` argument:
+            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
+                of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
+                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
+            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
+                to the last attention block of shape [batch_size, sequence_length, hidden_size],
+        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
+            classifier trained on top of the hidden state associated with the first token of the
+            input (`[CLS]`) for the Next-Sentence task (see BERT's paper).
+        `attention_tensor`: a torch.FloatTensor with the attention probabilities returned by the encoder,
+            of shape [num_hidden_layers, batch_size, num_attention_heads, sequence_length, sequence_length].
+
+ Example usage:
+ ```python
+ # Already been converted into WordPiece token ids
+ input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+ input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+ token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+ config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+ num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+ model = modeling.BertModel(config=config)
+    all_encoder_layers, pooled_output, attention_tensor = model(input_ids, token_type_ids, input_mask)
+ ```
+ """
+
+ def __init__(self, config):
+ super(BertModel, self).__init__(config)
+ self.embeddings = BertEmbeddings(config)
+ self.encoder = BertEncoder(config)
+ self.pooler = BertPooler(config)
+ self.apply(self.init_bert_weights)
+
+ def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True):
+ if attention_mask is None:
+ attention_mask = torch.ones_like(input_ids)
+ if token_type_ids is None:
+ token_type_ids = torch.zeros_like(input_ids)
+
+ # We create a 3D attention mask from a 2D tensor mask.
+ # Sizes are [batch_size, 1, 1, to_seq_length]
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+ # this attention mask is more simple than the triangular masking of causal attention
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+ extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and -10000.0 for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
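+        # For example, a row of `attention_mask` equal to [1, 1, 0] becomes
+        # [0.0, 0.0, -10000.0] here, broadcast over heads and query positions.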
+
+ embedding_output = self.embeddings(input_ids, token_type_ids)
+ encoded_layers, attention_tensor = self.encoder(embedding_output,
+ extended_attention_mask,
+ output_all_encoded_layers=output_all_encoded_layers)
+ sequence_output = encoded_layers[-1]
+ pooled_output = self.pooler(sequence_output)
+ if not output_all_encoded_layers:
+ encoded_layers = encoded_layers[-1]
+ return encoded_layers, pooled_output, attention_tensor
+
+
+class BertForPreTraining(PreTrainedBertModel):
+ """BERT model with pre-training heads.
+ This module comprises the BERT model followed by the two pre-training heads:
+ - the masked language modeling head, and
+ - the next sentence classification head.
+
+ Params:
+ config: a BertConfig class instance with the configuration to build a new model.
+
+ Inputs:
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+        with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
+ `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+ types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+ a `sentence B` token (see BERT paper for more details).
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+ selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+ input sequence length in the current batch. It's the mask that we typically use for attention when
+ a batch has varying length sentences.
+ `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
+            with indices selected in [-1, 0, ..., vocab_size - 1]. All labels set to -1 are ignored (masked), the loss
+            is only computed for the labels set in [0, ..., vocab_size - 1]
+ `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
+ with indices selected in [0, 1].
+ 0 => next sentence is the continuation, 1 => next sentence is a random sentence.
+
+ Outputs:
+ if `masked_lm_labels` and `next_sentence_label` are not `None`:
+ Outputs the total_loss which is the sum of the masked language modeling loss and the next
+ sentence classification loss.
+ if `masked_lm_labels` or `next_sentence_label` is `None`:
+ Outputs a tuple comprising
+ - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
+ - the next sentence classification logits of shape [batch_size, 2].
+
+ Example usage:
+ ```python
+ # Already been converted into WordPiece token ids
+ input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+ input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+ token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+ config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+ num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+ model = BertForPreTraining(config)
+ masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
+ ```
+ """
+
+ def __init__(self, config):
+ super(BertForPreTraining, self).__init__(config)
+ self.bert = BertModel(config)
+ self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
+ self.apply(self.init_bert_weights)
+
+ def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
+ next_sentence_label=None):
+        sequence_output, pooled_output, _ = self.bert(input_ids, token_type_ids, attention_mask,
+                                                      output_all_encoded_layers=False)
+ prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
+
+ if masked_lm_labels is not None and next_sentence_label is not None:
+ loss_fct = CrossEntropyLoss(ignore_index=-1)
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
+ next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
+ total_loss = masked_lm_loss + next_sentence_loss
+ return total_loss
+ else:
+ return prediction_scores, seq_relationship_score
+
+
+class BertForMaskedLM(PreTrainedBertModel):
+ """BERT model with the masked language modeling head.
+ This module comprises the BERT model followed by the masked language modeling head.
+
+ Params:
+ config: a BertConfig class instance with the configuration to build a new model.
+
+ Inputs:
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+        with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
+ `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+ types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+ a `sentence B` token (see BERT paper for more details).
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+ selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+ input sequence length in the current batch. It's the mask that we typically use for attention when
+ a batch has varying length sentences.
+ `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
+        with indices selected in [-1, 0, ..., vocab_size - 1]. All labels set to -1 are ignored (masked), the loss
+        is only computed for the labels set in [0, ..., vocab_size - 1]
+
+ Outputs:
+        if `masked_lm_labels` is not `None`:
+ Outputs the masked language modeling loss.
+ if `masked_lm_labels` is `None`:
+ Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
+
+ Example usage:
+ ```python
+ # Already been converted into WordPiece token ids
+ input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+ input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+ token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+ config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+ num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+ model = BertForMaskedLM(config)
+ masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
+ ```
+ """
+
+ def __init__(self, config):
+ super(BertForMaskedLM, self).__init__(config)
+ self.bert = BertModel(config)
+ self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
+ self.apply(self.init_bert_weights)
+
+ def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None):
+        sequence_output, _, _ = self.bert(input_ids, token_type_ids, attention_mask,
+                                          output_all_encoded_layers=False)
+ prediction_scores = self.cls(sequence_output)
+
+ if masked_lm_labels is not None:
+ loss_fct = CrossEntropyLoss(ignore_index=-1)
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
+ return masked_lm_loss
+ else:
+ return prediction_scores
+
+
+class BertForNextSentencePrediction(PreTrainedBertModel):
+ """BERT model with next sentence prediction head.
+ This module comprises the BERT model followed by the next sentence classification head.
+
+ Params:
+ config: a BertConfig class instance with the configuration to build a new model.
+
+ Inputs:
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+        with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
+ `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+ types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+ a `sentence B` token (see BERT paper for more details).
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+ selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+ input sequence length in the current batch. It's the mask that we typically use for attention when
+ a batch has varying length sentences.
+ `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
+ with indices selected in [0, 1].
+ 0 => next sentence is the continuation, 1 => next sentence is a random sentence.
+
+ Outputs:
+ if `next_sentence_label` is not `None`:
+            Outputs the next sentence classification loss.
+ if `next_sentence_label` is `None`:
+ Outputs the next sentence classification logits of shape [batch_size, 2].
+
+ Example usage:
+ ```python
+ # Already been converted into WordPiece token ids
+ input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+ input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+ token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+ config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+ num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+ model = BertForNextSentencePrediction(config)
+ seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
+ ```
+ """
+
+ def __init__(self, config):
+ super(BertForNextSentencePrediction, self).__init__(config)
+ self.bert = BertModel(config)
+ self.cls = BertOnlyNSPHead(config)
+ self.apply(self.init_bert_weights)
+
+ def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None):
+        _, pooled_output, _ = self.bert(input_ids, token_type_ids, attention_mask,
+                                        output_all_encoded_layers=False)
+ seq_relationship_score = self.cls(pooled_output)
+
+ if next_sentence_label is not None:
+ loss_fct = CrossEntropyLoss(ignore_index=-1)
+ next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
+ return next_sentence_loss
+ else:
+ return seq_relationship_score
+
+
+class BertForSequenceClassification(PreTrainedBertModel):
+ """BERT model for classification.
+ This module is composed of the BERT model with a linear layer on top of
+ the pooled output.
+
+ Params:
+ `config`: a BertConfig class instance with the configuration to build a new model.
+ `num_labels`: the number of classes for the classifier. Default = 2.
+
+ Inputs:
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+        with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
+ `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+ types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+ a `sentence B` token (see BERT paper for more details).
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+ selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+ input sequence length in the current batch. It's the mask that we typically use for attention when
+ a batch has varying length sentences.
+ `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
+            with indices selected in [0, ..., num_labels - 1].
+
+ Outputs:
+ if `labels` is not `None`:
+ Outputs the CrossEntropy classification loss of the output with the labels.
+ if `labels` is `None`:
+ Outputs the classification logits of shape [batch_size, num_labels].
+
+ Example usage:
+ ```python
+ # Already been converted into WordPiece token ids
+ input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+ input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+ token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+ config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+ num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+ num_labels = 2
+
+ model = BertForSequenceClassification(config, num_labels)
+ logits = model(input_ids, token_type_ids, input_mask)
+ ```
+ """
+
+ def __init__(self, config, num_labels=2):
+ super(BertForSequenceClassification, self).__init__(config)
+ self.num_labels = num_labels
+ self.bert = BertModel(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.classifier = nn.Linear(config.hidden_size, num_labels)
+ self.apply(self.init_bert_weights)
+
+ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
+        _, pooled_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ return loss
+ else:
+ return logits
+
+
+class BertForTokenClassification(PreTrainedBertModel):
+ """BERT model for token-level classification.
+ This module is composed of the BERT model with a linear layer on top of
+ the full hidden state of the last layer.
+
+ Params:
+ `config`: a BertConfig class instance with the configuration to build a new model.
+ `num_labels`: the number of classes for the classifier. Default = 2.
+
+ Inputs:
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+        with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
+ `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+ types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+ a `sentence B` token (see BERT paper for more details).
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+ selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+ input sequence length in the current batch. It's the mask that we typically use for attention when
+ a batch has varying length sentences.
+        `labels`: labels for the token classification output: torch.LongTensor of shape [batch_size, sequence_length]
+            with indices selected in [0, ..., num_labels - 1].
+
+ Outputs:
+ if `labels` is not `None`:
+ Outputs the CrossEntropy classification loss of the output with the labels.
+ if `labels` is `None`:
+ Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
+
+ Example usage:
+ ```python
+ # Already been converted into WordPiece token ids
+ input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+ input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+ token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+ config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+ num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+ num_labels = 2
+
+ model = BertForTokenClassification(config, num_labels)
+ logits = model(input_ids, token_type_ids, input_mask)
+ ```
+ """
+
+ def __init__(self, config, num_labels=2):
+ super(BertForTokenClassification, self).__init__(config)
+ self.num_labels = num_labels
+ self.bert = BertModel(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.classifier = nn.Linear(config.hidden_size, num_labels)
+ self.apply(self.init_bert_weights)
+
+ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
+        sequence_output, _, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
+ sequence_output = self.dropout(sequence_output)
+ logits = self.classifier(sequence_output)
+
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ return loss
+ else:
+ return logits
+
+
+class BertForQuestionAnswering(PreTrainedBertModel):
+ """BERT model for Question Answering (span extraction).
+ This module is composed of the BERT model with a linear layer on top of
+ the sequence output that computes start_logits and end_logits
+
+ Params:
+ `config`: either
+ - a BertConfig class instance with the configuration to build a new model, or
+ - a str with the name of a pre-trained model to load selected in the list of:
+ . `bert-base-uncased`
+ . `bert-large-uncased`
+ . `bert-base-cased`
+ . `bert-base-multilingual`
+ . `bert-base-chinese`
+ The pre-trained model will be downloaded and cached if needed.
+
+ Inputs:
+ `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+        with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
+ `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+ `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+ types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+ a `sentence B` token (see BERT paper for more details).
+ `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+ selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+ input sequence length in the current batch. It's the mask that we typically use for attention when
+ a batch has varying length sentences.
+ `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
+            Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
+ into account for computing the loss.
+ `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
+            Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
+ into account for computing the loss.
+
+ Outputs:
+ if `start_positions` and `end_positions` are not `None`:
+            Outputs the total_loss which is the averaged CrossEntropy loss for the start and end token positions.
+ if `start_positions` or `end_positions` is `None`:
+ Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
+ position tokens of shape [batch_size, sequence_length].
+
+ Example usage:
+ ```python
+ # Already been converted into WordPiece token ids
+ input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+ input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+ token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+ config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+ num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+ model = BertForQuestionAnswering(config)
+ start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
+ ```
+ """
+
+ def __init__(self, config):
+ super(BertForQuestionAnswering, self).__init__(config)
+ self.bert = BertModel(config)
+ # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
+ # self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.qa_outputs = nn.Linear(config.hidden_size, 2)
+ self.apply(self.init_bert_weights)
+
+ def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None):
+        sequence_output, _, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
+ logits = self.qa_outputs(sequence_output)
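+        # qa_outputs maps hidden_size -> 2 scores per token; the split below yields
+        # start and end logits, each of shape [batch_size, sequence_length].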
+ start_logits, end_logits = logits.split(1, dim=-1)
+ start_logits = start_logits.squeeze(-1)
+ end_logits = end_logits.squeeze(-1)
+
+ if start_positions is not None and end_positions is not None:
+ # If we are on multi-GPU, split add a dimension
+ if len(start_positions.size()) > 1:
+ start_positions = start_positions.squeeze(-1)
+ if len(end_positions.size()) > 1:
+ end_positions = end_positions.squeeze(-1)
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
+ ignored_index = start_logits.size(1)
+ start_positions.clamp_(0, ignored_index)
+ end_positions.clamp_(0, ignored_index)
+
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+ start_loss = loss_fct(start_logits, start_positions)
+ end_loss = loss_fct(end_logits, end_positions)
+ total_loss = (start_loss + end_loss) / 2
+ return total_loss
+ else:
+ return start_logits, end_logits
diff --git a/bertviz/pytorch_pretrained_bert/optimization.py b/bertviz/pytorch_pretrained_bert/optimization.py
new file mode 100755
index 0000000..4314c84
--- /dev/null
+++ b/bertviz/pytorch_pretrained_bert/optimization.py
@@ -0,0 +1,162 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch optimization for BERT model."""
+
+import math
+import torch
+from torch.optim import Optimizer
+from torch.optim.optimizer import required
+from torch.nn.utils import clip_grad_norm_
+
+def warmup_cosine(x, warmup=0.002):
+ if x < warmup:
+ return x/warmup
+    return 0.5 * (1.0 + math.cos(math.pi * x))
+
+def warmup_constant(x, warmup=0.002):
+ if x < warmup:
+ return x/warmup
+ return 1.0
+
+def warmup_linear(x, warmup=0.002):
+ if x < warmup:
+ return x/warmup
+ return 1.0 - x
+
+SCHEDULES = {
+ 'warmup_cosine':warmup_cosine,
+ 'warmup_constant':warmup_constant,
+ 'warmup_linear':warmup_linear,
+}
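+
+# Illustrative values for the linear schedule above (x is the training progress
+# global_step / t_total; the multiplier ramps up over the first `warmup` fraction
+# of training and then decays linearly):
+#
+#   warmup_linear(0.05, warmup=0.1)  # -> 0.5  (halfway through warmup)
+#   warmup_linear(0.10, warmup=0.1)  # -> 0.9  (warmup finished: 1.0 - 0.10)
+#   warmup_linear(0.50, warmup=0.1)  # -> 0.5  (halfway through training)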
+
+
+class BertAdam(Optimizer):
+ """Implements BERT version of Adam algorithm with weight decay fix.
+ Params:
+ lr: learning rate
+ warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
+ t_total: total number of training steps for the learning
+ rate schedule, -1 means constant learning rate. Default: -1
+ schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
+        b1: Adam's b1 (first-moment decay rate). Default: 0.9
+        b2: Adam's b2 (second-moment decay rate). Default: 0.999
+        e: Adam's epsilon. Default: 1e-6
+ weight_decay_rate: Weight decay. Default: 0.01
+ max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
+ """
+ def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
+ b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01,
+ max_grad_norm=1.0):
+ if lr is not required and lr < 0.0:
+ raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
+ if schedule not in SCHEDULES:
+ raise ValueError("Invalid schedule parameter: {}".format(schedule))
+ if not 0.0 <= warmup < 1.0 and not warmup == -1:
+ raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
+ if not 0.0 <= b1 < 1.0:
+ raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
+ if not 0.0 <= b2 < 1.0:
+ raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
+ if not e >= 0.0:
+ raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
+ defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
+ b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate,
+ max_grad_norm=max_grad_norm)
+ super(BertAdam, self).__init__(params, defaults)
+
+ def get_lr(self):
+ lr = []
+ for group in self.param_groups:
+ for p in group['params']:
+ state = self.state[p]
+ if len(state) == 0:
+ return [0]
+ if group['t_total'] != -1:
+ schedule_fct = SCHEDULES[group['schedule']]
+ lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
+ else:
+ lr_scheduled = group['lr']
+ lr.append(lr_scheduled)
+ return lr
+
+ def step(self, closure=None):
+ """Performs a single optimization step.
+
+ Arguments:
+ closure (callable, optional): A closure that reevaluates the model
+ and returns the loss.
+ """
+ loss = None
+ if closure is not None:
+ loss = closure()
+
+ for group in self.param_groups:
+ for p in group['params']:
+ if p.grad is None:
+ continue
+ grad = p.grad.data
+ if grad.is_sparse:
+ raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
+
+ state = self.state[p]
+
+ # State initialization
+ if len(state) == 0:
+ state['step'] = 0
+ # Exponential moving average of gradient values
+ state['next_m'] = torch.zeros_like(p.data)
+ # Exponential moving average of squared gradient values
+ state['next_v'] = torch.zeros_like(p.data)
+
+ next_m, next_v = state['next_m'], state['next_v']
+ beta1, beta2 = group['b1'], group['b2']
+
+ # Add grad clipping
+ if group['max_grad_norm'] > 0:
+ clip_grad_norm_(p, group['max_grad_norm'])
+
+ # Decay the first and second moment running average coefficient
+ # In-place operations to update the averages at the same time
+ next_m.mul_(beta1).add_(1 - beta1, grad)
+ next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
+ update = next_m / (next_v.sqrt() + group['e'])
+
+ # Just adding the square of the weights to the loss function is *not*
+ # the correct way of using L2 regularization/weight decay with Adam,
+ # since that will interact with the m and v parameters in strange ways.
+ #
+ # Instead we want to decay the weights in a manner that doesn't interact
+ # with the m/v parameters. This is equivalent to adding the square
+ # of the weights to the loss with plain (non-momentum) SGD.
+ if group['weight_decay_rate'] > 0.0:
+ update += group['weight_decay_rate'] * p.data
+
+ if group['t_total'] != -1:
+ schedule_fct = SCHEDULES[group['schedule']]
+ lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
+ else:
+ lr_scheduled = group['lr']
+
+ update_with_lr = lr_scheduled * update
+ p.data.add_(-update_with_lr)
+
+ state['step'] += 1
+
+ # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
+ # No bias correction
+ # bias_correction1 = 1 - beta1 ** state['step']
+ # bias_correction2 = 1 - beta2 ** state['step']
+
+ return loss
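+
+
+# A minimal, illustrative usage sketch (hyperparameter values and the parameter
+# grouping below are placeholders, not recommendations; `model` and
+# `num_train_steps` are assumed to be defined by the calling script):
+#
+#   no_decay = ['bias', 'gamma', 'beta']
+#   grouped_parameters = [
+#       {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+#        'weight_decay_rate': 0.01},
+#       {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+#        'weight_decay_rate': 0.0},
+#   ]
+#   optimizer = BertAdam(grouped_parameters, lr=5e-5, warmup=0.1, t_total=num_train_steps)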
diff --git a/bertviz/pytorch_pretrained_bert/tokenization.py b/bertviz/pytorch_pretrained_bert/tokenization.py
new file mode 100755
index 0000000..c7ef20d
--- /dev/null
+++ b/bertviz/pytorch_pretrained_bert/tokenization.py
@@ -0,0 +1,346 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import unicodedata
+import os
+import logging
+
+from .file_utils import cached_path
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+ datefmt = '%m/%d/%Y %H:%M:%S',
+ level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+PRETRAINED_VOCAB_ARCHIVE_MAP = {
+ 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
+ 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+ 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
+ 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
+ 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
+ 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
+ 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
+}
+VOCAB_NAME = 'vocab.txt'
+
+
+def load_vocab(vocab_file):
+ """Loads a vocabulary file into a dictionary."""
+ vocab = collections.OrderedDict()
+ index = 0
+ with open(vocab_file, "r", encoding="utf-8") as reader:
+ while True:
+ token = reader.readline()
+ if not token:
+ break
+ token = token.strip()
+ vocab[token] = index
+ index += 1
+ return vocab
+
+
+def whitespace_tokenize(text):
+ """Runs basic whitespace cleaning and splitting on a peice of text."""
+ text = text.strip()
+ if not text:
+ return []
+ tokens = text.split()
+ return tokens
+
+
+class BertTokenizer(object):
+ """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
+ def __init__(self, vocab_file, do_lower_case=True):
+ if not os.path.isfile(vocab_file):
+ raise ValueError(
+ "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
+ "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+ self.vocab = load_vocab(vocab_file)
+ self.ids_to_tokens = collections.OrderedDict(
+ [(ids, tok) for tok, ids in self.vocab.items()])
+ self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+ def tokenize(self, text):
+ split_tokens = []
+ for token in self.basic_tokenizer.tokenize(text):
+ for sub_token in self.wordpiece_tokenizer.tokenize(token):
+ split_tokens.append(sub_token)
+ return split_tokens
+
+ def convert_tokens_to_ids(self, tokens):
+ """Converts a sequence of tokens into ids using the vocab."""
+ ids = []
+ for token in tokens:
+ ids.append(self.vocab[token])
+ return ids
+
+ def convert_ids_to_tokens(self, ids):
+ """Converts a sequence of ids in wordpiece tokens using the vocab."""
+ tokens = []
+ for i in ids:
+ tokens.append(self.ids_to_tokens[i])
+ return tokens
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
+ """
+        Instantiate a BertTokenizer from a pre-trained vocabulary file.
+        Download and cache the vocabulary file if needed.
+ """
+ if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
+ vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
+ else:
+ vocab_file = pretrained_model_name
+ if os.path.isdir(vocab_file):
+ vocab_file = os.path.join(vocab_file, VOCAB_NAME)
+ # redirect to the cache, if necessary
+ try:
+ resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
+ except FileNotFoundError:
+ logger.error(
+ "Model name '{}' was not found in model name list ({}). "
+ "We assumed '{}' was a path or url but couldn't find any file "
+ "associated to this path or url.".format(
+ pretrained_model_name,
+ ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
+ vocab_file))
+ return None
+ if resolved_vocab_file == vocab_file:
+ logger.info("loading vocabulary file {}".format(vocab_file))
+ else:
+ logger.info("loading vocabulary file {} from cache at {}".format(
+ vocab_file, resolved_vocab_file))
+ # Instantiate tokenizer.
+ tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
+ return tokenizer
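+
+# A small, illustrative round trip (the vocabulary name is one key of
+# PRETRAINED_VOCAB_ARCHIVE_MAP; a path to a local vocab.txt also works):
+#
+#   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+#   tokens = tokenizer.tokenize("The quickest brown fox")
+#   ids = tokenizer.convert_tokens_to_ids(tokens)
+#   tokens_again = tokenizer.convert_ids_to_tokens(ids)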
+
+
+class BasicTokenizer(object):
+ """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+ def __init__(self, do_lower_case=True):
+ """Constructs a BasicTokenizer.
+
+ Args:
+ do_lower_case: Whether to lower case the input.
+ """
+ self.do_lower_case = do_lower_case
+
+ def tokenize(self, text):
+ """Tokenizes a piece of text."""
+ text = self._clean_text(text)
+ # This was added on November 1st, 2018 for the multilingual and Chinese
+ # models. This is also applied to the English models now, but it doesn't
+ # matter since the English models were not trained on any Chinese data
+ # and generally don't have any Chinese data in them (there are Chinese
+ # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia).
+ text = self._tokenize_chinese_chars(text)
+ orig_tokens = whitespace_tokenize(text)
+ split_tokens = []
+ for token in orig_tokens:
+ if self.do_lower_case:
+ token = token.lower()
+ token = self._run_strip_accents(token)
+ split_tokens.extend(self._run_split_on_punc(token))
+
+ output_tokens = whitespace_tokenize(" ".join(split_tokens))
+ return output_tokens
+
+ def _run_strip_accents(self, text):
+ """Strips accents from a piece of text."""
+ text = unicodedata.normalize("NFD", text)
+ output = []
+ for char in text:
+ cat = unicodedata.category(char)
+ if cat == "Mn":
+ continue
+ output.append(char)
+ return "".join(output)
+
+ def _run_split_on_punc(self, text):
+ """Splits punctuation on a piece of text."""
+ chars = list(text)
+ i = 0
+ start_new_word = True
+ output = []
+ while i < len(chars):
+ char = chars[i]
+ if _is_punctuation(char):
+ output.append([char])
+ start_new_word = True
+ else:
+ if start_new_word:
+ output.append([])
+ start_new_word = False
+ output[-1].append(char)
+ i += 1
+
+ return ["".join(x) for x in output]
+
+ def _tokenize_chinese_chars(self, text):
+ """Adds whitespace around any CJK character."""
+ output = []
+ for char in text:
+ cp = ord(char)
+ if self._is_chinese_char(cp):
+ output.append(" ")
+ output.append(char)
+ output.append(" ")
+ else:
+ output.append(char)
+ return "".join(output)
+
+ def _is_chinese_char(self, cp):
+ """Checks whether CP is the codepoint of a CJK character."""
+ # This defines a "chinese character" as anything in the CJK Unicode block:
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+ #
+ # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+ # despite its name. The modern Korean Hangul alphabet is a different block,
+ # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+ # space-separated words, so they are not treated specially and handled
+        # like all of the other languages.
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
+ (cp >= 0x3400 and cp <= 0x4DBF) or #
+ (cp >= 0x20000 and cp <= 0x2A6DF) or #
+ (cp >= 0x2A700 and cp <= 0x2B73F) or #
+ (cp >= 0x2B740 and cp <= 0x2B81F) or #
+ (cp >= 0x2B820 and cp <= 0x2CEAF) or
+ (cp >= 0xF900 and cp <= 0xFAFF) or #
+ (cp >= 0x2F800 and cp <= 0x2FA1F)): #
+ return True
+
+ return False
+
+ def _clean_text(self, text):
+ """Performs invalid character removal and whitespace cleanup on text."""
+ output = []
+ for char in text:
+ cp = ord(char)
+ if cp == 0 or cp == 0xfffd or _is_control(char):
+ continue
+ if _is_whitespace(char):
+ output.append(" ")
+ else:
+ output.append(char)
+ return "".join(output)
+
+
+class WordpieceTokenizer(object):
+ """Runs WordPiece tokenization."""
+
+ def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+ self.vocab = vocab
+ self.unk_token = unk_token
+ self.max_input_chars_per_word = max_input_chars_per_word
+
+ def tokenize(self, text):
+ """Tokenizes a piece of text into its word pieces.
+
+ This uses a greedy longest-match-first algorithm to perform tokenization
+ using the given vocabulary.
+
+ For example:
+ input = "unaffable"
+ output = ["un", "##aff", "##able"]
+
+ Args:
+ text: A single token or whitespace separated tokens. This should have
+            already been passed through `BasicTokenizer`.
+
+ Returns:
+ A list of wordpiece tokens.
+ """
+
+ output_tokens = []
+ for token in whitespace_tokenize(text):
+ chars = list(token)
+ if len(chars) > self.max_input_chars_per_word:
+ output_tokens.append(self.unk_token)
+ continue
+
+ is_bad = False
+ start = 0
+ sub_tokens = []
+ while start < len(chars):
+ end = len(chars)
+ cur_substr = None
+ while start < end:
+ substr = "".join(chars[start:end])
+ if start > 0:
+ substr = "##" + substr
+ if substr in self.vocab:
+ cur_substr = substr
+ break
+ end -= 1
+ if cur_substr is None:
+ is_bad = True
+ break
+ sub_tokens.append(cur_substr)
+ start = end
+
+ if is_bad:
+ output_tokens.append(self.unk_token)
+ else:
+ output_tokens.extend(sub_tokens)
+ return output_tokens
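+
+# Illustrative behaviour with the toy vocabulary in bertviz/tests/fixtures/vocab.txt
+# (greedy longest-match-first, falling back to the unknown token):
+#
+#   wp = WordpieceTokenizer(load_vocab('bertviz/tests/fixtures/vocab.txt'))
+#   wp.tokenize("laziest")  # -> ['la', '##zie', '##st']
+#   wp.tokenize("elmo")     # -> ['[UNK]']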
+
+
+def _is_whitespace(char):
+ """Checks whether `chars` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+ # as whitespace since they are generally considered as such.
+ if char == " " or char == "\t" or char == "\n" or char == "\r":
+ return True
+ cat = unicodedata.category(char)
+ if cat == "Zs":
+ return True
+ return False
+
+
+def _is_control(char):
+ """Checks whether `chars` is a control character."""
+ # These are technically control characters but we count them as whitespace
+ # characters.
+ if char == "\t" or char == "\n" or char == "\r":
+ return False
+ cat = unicodedata.category(char)
+ if cat.startswith("C"):
+ return True
+ return False
+
+
+def _is_punctuation(char):
+ """Checks whether `chars` is a punctuation character."""
+ cp = ord(char)
+ # We treat all non-letter/number ASCII as punctuation.
+ # Characters such as "^", "$", and "`" are not in the Unicode
+ # Punctuation class but we treat them as punctuation anyways, for
+ # consistency.
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+ (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+ return True
+ cat = unicodedata.category(char)
+ if cat.startswith("P"):
+ return True
+ return False
diff --git a/bertviz/tests/fixtures/config.json b/bertviz/tests/fixtures/config.json
new file mode 100644
index 0000000..ee8abf4
--- /dev/null
+++ b/bertviz/tests/fixtures/config.json
@@ -0,0 +1,12 @@
+{
+ "num_hidden_layers": 12,
+ "vocab_size": 18,
+ "hidden_size": 12,
+ "max_position_embeddings": 64,
+ "type_vocab_size": 2,
+ "hidden_dropout_prob": 0.1,
+ "num_attention_heads": 3,
+ "attention_probs_dropout_prob": 0.1,
+ "intermediate_size": 6
+
+}
\ No newline at end of file
diff --git a/bertviz/tests/fixtures/vocab.txt b/bertviz/tests/fixtures/vocab.txt
new file mode 100644
index 0000000..8c18a83
--- /dev/null
+++ b/bertviz/tests/fixtures/vocab.txt
@@ -0,0 +1,18 @@
+[PAD]
+[UNK]
+the
+quick
+##est
+brown
+fox
+##iest
+jumped
+over
+##zie
+##st
+dog
+.
+lazy
+la
+[SEP]
+[CLS]
\ No newline at end of file
diff --git a/bertviz/tests/test_attention.py b/bertviz/tests/test_attention.py
new file mode 100644
index 0000000..c61da4b
--- /dev/null
+++ b/bertviz/tests/test_attention.py
@@ -0,0 +1,41 @@
+from bertviz.visualization import AttentionVisualizer
+from bertviz.attention import _get_attentions
+from bertviz.pytorch_pretrained_bert import BertTokenizer, BertModel, BertConfig
+import numpy as np
+import unittest
+
+
+class TestAttention(unittest.TestCase):
+
+ def setUp(self):
+ self.config = BertConfig.from_json_file('fixtures/config.json')
+ model = BertModel(self.config)
+ tokenizer = BertTokenizer('fixtures/vocab.txt')
+ self.attention_visualizer = AttentionVisualizer(model, tokenizer)
+
+ def test_get_attentions(self):
+ sentence1 = 'The quickest brown fox jumped over the lazy dog'
+ sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
+ tokens_a, tokens_b, attn = self.attention_visualizer.get_viz_data(sentence1, sentence2)
+ attentions = _get_attentions(tokens_a, tokens_b, attn)
+ attn_squeezed = np.squeeze(attn)
+ expected_all_attention = attn_squeezed.tolist()
+ self.assertEqual(attentions['all']['att'], expected_all_attention)
+ attn_a = np.array(attentions['a']['att'])
+ attn_b = np.array(attentions['b']['att'])
+ attn_ab = np.array(attentions['ab']['att'])
+ attn_ba = np.array(attentions['ba']['att'])
+ expected_top_half = attn_squeezed[:, :, :len(tokens_a), :]
+ top_half = np.concatenate((attn_a, attn_ab), axis=-1)
+ self.assertEqual(top_half.shape, expected_top_half.shape)
+ self.assertTrue(np.array_equal(top_half, expected_top_half))
+ expected_bottom_half = attn_squeezed[:, :, len(tokens_a):, :]
+ bottom_half = np.concatenate((attn_b, attn_ba), axis=-1)
+ self.assertEqual(bottom_half.shape, expected_bottom_half.shape)
+ all = np.concatenate((top_half, bottom_half), axis=-2)
+ self.assertEqual(all.shape, attn_squeezed.shape)
+ self.assertTrue(np.allclose(all, attn_squeezed, atol=1e-06))
+
+if __name__ == "__main__":
+ unittest.main()
+
diff --git a/bertviz/tests/test_visualization.py b/bertviz/tests/test_visualization.py
new file mode 100644
index 0000000..bb5cedb
--- /dev/null
+++ b/bertviz/tests/test_visualization.py
@@ -0,0 +1,40 @@
+from bertviz.visualization import AttentionVisualizer
+from bertviz.pytorch_pretrained_bert import BertTokenizer, BertModel, BertConfig
+import unittest
+
+
+class TestVisualization(unittest.TestCase):
+
+ def setUp(self):
+ self.config = BertConfig.from_json_file('fixtures/config.json')
+ model = BertModel(self.config)
+ tokenizer = BertTokenizer('fixtures/vocab.txt')
+ self.attention_visualizer = AttentionVisualizer(model, tokenizer)
+
+ def test_get_inputs(self):
+ sentence1 = 'The quickest brown fox jumped over the lazy dog'
+ tokens_ids1 = [2, 3, 4, 5, 6, 8, 9, 2, 14, 12]
+ sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
+ token_ids2 = [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1]
+ tokens_tensor, token_type_tensor, tokens_a, tokens_b = self.attention_visualizer._get_inputs(sentence1, sentence2)
+ cls_id = 17
+ sep_id = 16
+ self.assertEqual(tokens_tensor.tolist()[0],
+ [cls_id] + tokens_ids1 + [sep_id] + token_ids2 + [sep_id])
+ self.assertEqual(token_type_tensor.tolist()[0],
+ ([0] * 12) + ([1] * 13))
+
+ def test_get_viz_data(self):
+ sentence1 = 'The quickest brown fox jumped over the lazy dog'
+ tokens1 = ['the', 'quick', '##est', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']
+ sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
+ tokens2 = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'la', '##zie', '##st', 'lazy', '[UNK]']
+ tokens_a, tokens_b, attn = self.attention_visualizer.get_viz_data(sentence1, sentence2)
+ self.assertEqual(tokens_a, ['[CLS]'] + tokens1 + ['[SEP]'])
+ self.assertEqual(tokens_b, tokens2 + ['[SEP]'])
+ expected_attn_shape = (self.config.num_hidden_layers, 1, self.config.num_attention_heads, len(tokens_a) + len(tokens_b), len(tokens_a) + len(tokens_b))
+ self.assertEqual(attn.shape, expected_attn_shape)
+
+if __name__ == "__main__":
+ unittest.main()
+
diff --git a/bertviz/visualization.py b/bertviz/visualization.py
new file mode 100644
index 0000000..77f4b7f
--- /dev/null
+++ b/bertviz/visualization.py
@@ -0,0 +1,51 @@
+# coding=utf-8
+# Copyright 2018 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Changes made by Jesse Vig on 12/12/18:
+# - Adapted to BERT model
+#
+
+
+import torch
+
+class AttentionVisualizer:
+
+ def __init__(self, model, tokenizer):
+ self.model = model
+ self.tokenizer = tokenizer
+ self.model.eval()
+
+ def get_viz_data(self, sentence_a, sentence_b):
+ tokens_tensor, token_type_tensor, tokens_a, tokens_b = self._get_inputs(sentence_a, sentence_b)
+ attn = self._get_attention(tokens_tensor, token_type_tensor)
+ return tokens_a, tokens_b, attn
+
+ def _get_inputs(self, sentence_a, sentence_b):
+ tokens_a = self.tokenizer.tokenize(sentence_a)
+ tokens_b = self.tokenizer.tokenize(sentence_b)
+ tokens_a_delim = ['[CLS]'] + tokens_a + ['[SEP]']
+ tokens_b_delim = tokens_b + ['[SEP]']
+ token_ids = self.tokenizer.convert_tokens_to_ids(tokens_a_delim + tokens_b_delim)
+ tokens_tensor = torch.tensor([token_ids])
+ token_type_tensor = torch.LongTensor([[0] * len(tokens_a_delim) + [1] * len(tokens_b_delim)])
+ return tokens_tensor, token_type_tensor, tokens_a_delim, tokens_b_delim
+
+ def _get_attention(self, tokens_tensor, token_type_tensor):
+ _, _, attention_tensor = self.model(tokens_tensor, token_type_ids=token_type_tensor)
+ return attention_tensor.data.numpy()
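+
+# A minimal end-to-end sketch, assuming the test fixtures shipped in this repo
+# (a pretrained model and vocabulary could be substituted):
+#
+#   from bertviz.pytorch_pretrained_bert import BertConfig, BertModel, BertTokenizer
+#   config = BertConfig.from_json_file('bertviz/tests/fixtures/config.json')
+#   model = BertModel(config)
+#   tokenizer = BertTokenizer('bertviz/tests/fixtures/vocab.txt')
+#   viz = AttentionVisualizer(model, tokenizer)
+#   tokens_a, tokens_b, attn = viz.get_viz_data("the quick brown fox", "the lazy dog")
+#   # attn shape: [num_hidden_layers, 1, num_attention_heads, len(tokens_a) + len(tokens_b), ...]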
+
+
+
+