-
Notifications
You must be signed in to change notification settings - Fork 536
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
30 changed files
with
650 additions
and
108 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
FROM ubuntu | ||
MAINTAINER Adam Goldberg <[email protected]> | ||
MAINTAINER [email protected] | ||
RUN sudo apt-get update | ||
RUN sudo apt-get -y install emacs | ||
RUN sudo apt-get -y install default-jre default-jdk | ||
|
@@ -8,16 +8,41 @@ RUN sudo apt-get -y install gnuplot | |
RUN sudo apt-get -y install postgresql postgresql-contrib | ||
RUN sudo apt-get -y install git | ||
RUN sudo apt-get -y install build-essential | ||
RUN sudo apt-get -y install libnuma-dev | ||
RUN sudo apt-get -y install bc | ||
RUN cd ~/ && git clone https://github.com/HazyResearch/deepdive.git | ||
# Pass -y so apt-get never waits for a confirmation prompt during the non-interactive image build. | ||
RUN sudo apt-get -y install zip unzip | ||
RUN cd ~/deepdive && make | ||
|
||
# Configure environment variables | ||
RUN echo 'export PGUSER=postgres' >> ~/.bashrc | ||
RUN echo 'export PGPORT=$POSTGRES_PORT_5432_TCP_PORT' >> ~/.bashrc | ||
RUN echo 'export PGHOST=$POSTGRES_PORT_5432_TCP_ADDR' >> ~/.bashrc | ||
RUN echo 'export PGPASSWORD=password' >> ~/.bashrc | ||
RUN echo 'export PGUSER=postgres' >> ~/.bashrc | ||
RUN echo 'export PGPORT=$DB_PORT_5432_TCP_PORT' >> ~/.bashrc | ||
RUN echo 'export PGHOST=$DB_PORT_5432_TCP_ADDR' >> ~/.bashrc | ||
RUN echo 'export PGPASSWORD=' >> ~/.bashrc | ||
RUN echo 'export PGUSER=gpadmin' >> ~/.bashrc | ||
RUN echo 'export DEEPDIVE_HOME=~/deepdive' >> ~/.bashrc | ||
RUN echo 'export LD_LIBRARY_PATH=$DEEPDIVE_HOME/lib/dw_linux/lib:$DEEPDIVE_HOME/lib/dw_linux/lib64' >> ~/.bashrc | ||
RUN echo 'export PATH=~/deepdive/sbt:$PATH' >> ~/.bashrc | ||
RUN echo 'export LD_LIBRARY_PATH=$DEEPDIVE_HOME/lib/dw_linux/lib:$DEEPDIVE_HOME/sbt:$DEEPDIVE_HOME/lib/dw_linux/lib64' >> ~/.bashrc | ||
RUN echo 'export PATH=~/deepdive/sbt:$PATH' >> ~/.bashrc | ||
|
||
# Initialize script to wait for greenplum | ||
# Each RUN below appends one line to ~/.bashrc, assembling a loop that runs "SELECT 1;" via psql | ||
# against $DB_PORT_5432_TCP_ADDR:$DB_PORT_5432_TCP_PORT and breaks out once psql exits with 0. | ||
# psql output is discarded with "> /dev/null 2>&1"; the redirect must be written without spaces, | ||
# otherwise "2" is handed to psql as an extra argument and stderr is not silenced. | ||
RUN echo 'while true; do' >> ~/.bashrc | ||
RUN echo '  psql -q -h $DB_PORT_5432_TCP_ADDR -p $DB_PORT_5432_TCP_PORT -U gpadmin deepdive -c "SELECT 1;" > /dev/null 2>&1' >> ~/.bashrc | ||
RUN echo '  RETVAL=$?' >> ~/.bashrc | ||
RUN echo '  [ $RETVAL -eq 0 ] && break' >> ~/.bashrc | ||
RUN echo '  echo -ne "Waiting for DB\r"' >> ~/.bashrc | ||
RUN echo '  sleep 1' >> ~/.bashrc | ||
RUN echo '  echo -ne "Waiting for DB.\r"' >> ~/.bashrc | ||
RUN echo '  sleep 1' >> ~/.bashrc | ||
RUN echo '  echo -ne "Waiting for DB..\r"' >> ~/.bashrc | ||
RUN echo '  sleep 1' >> ~/.bashrc | ||
RUN echo '  echo -ne "Waiting for DB...\r"' >> ~/.bashrc | ||
RUN echo '  sleep 1' >> ~/.bashrc | ||
RUN echo '  echo -ne "Waiting for DB....\r"' >> ~/.bashrc | ||
RUN echo '  sleep 1' >> ~/.bashrc | ||
RUN echo 'done' >> ~/.bashrc | ||
# Printed once the loop has exited, i.e. after the first successful query. | ||
RUN echo 'echo -ne "\nGreenplum is up and running! You may now use deepdive.\n"' >> ~/.bashrc | ||
|
||
# Patch the test script in place: the sed substitution replaces the ChunkingApp "sbt test-only" | ||
# command in /root/deepdive/test/test_psql.sh with an echo of "Skipping ChunkingApp", keeping the | ||
# original command after a "#" on the same line. | ||
RUN sed -i s/'sbt "test-only org.deepdive.test.integration.ChunkingApp -- -oF"'/'echo "Skipping ChunkingApp" \#sbt "test-only org.deepdive.test.integration.ChunkingApp -- -oF"'/g /root/deepdive/test/test_psql.sh | ||
|
||
# Create the application directory before it is declared as a volume. | ||
# NOTE(review): assumes ~ resolves to /root during the build so this matches the VOLUME path - confirm. | ||
RUN mkdir -p ~/deepdive/app | ||
|
||
# Mount point for application code. | ||
VOLUME ["/root/deepdive/app"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,38 @@ | ||
# DeepDive | ||
# DeepDive v0.05 | ||
|
||
Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0.txt | ||
|
||
Tested with Travis CI. | ||
Tested with Travis CI. | ||
[![Build Status](https://travis-ci.org/HazyResearch/deepdive.svg?branch=master)](https://travis-ci.org/HazyResearch/deepdive) | ||
|
||
### [Visit The DeepDive Website](http://deepdive.stanford.edu) | ||
|
||
Docker instructions: | ||
<pre> | ||
# Pull the deepdive image from Docker Hub | ||
# You may specify 'latest' or 'dev' as the tag | ||
docker pull adamwgoldberg/deepdive-github:develop | ||
|
||
# Pull my greenplum image from Docker Hub. Contact me if you need access to the private repository on Docker Hub. | ||
docker run -d --privileged --name db -h gphost adamwgoldberg/greenplum | ||
|
||
# Run Deepdive | ||
# All deepdive application code should be created in /root/deepdive/app | ||
# Make sure the deepdive-github tag matches the above one. | ||
docker run -t -d --link db:db --name deepdive adamwgoldberg/deepdive-github:develop bash | ||
|
||
# Attach shell to Deepdive | ||
# You may need to wait several minutes for Greenplum to initialize. | ||
# The bash shell will say "Waiting for DB..." until it finishes. | ||
docker exec -ti deepdive bash | ||
|
||
# Inside of that shell run: | ||
cd ~/deepdive | ||
make test | ||
</pre> | ||
|
||
Docker tips: | ||
* AWS EC2 m.xlarge on Virginia region using ami-84e897ec is a great place to start | ||
* Ensure you have at least 20GB of storage | ||
* Any machine with Docker installed should work fine | ||
* Due to licensing, Greenplum is not freely available outside of our lab. You may wish to use a Dockerized postgres instead. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
--- | ||
layout: default | ||
--- | ||
|
||
# Changelog for release 0.0.5-alpha (02/08/2015) | ||
|
||
- Added support to build Docker images for DeepDive. See the README.md for more. | ||
- Added SQL "FeatureStatsView" view. Populated with feature | ||
statistics; useful for debugging. | ||
- Added a few fixes to the Greenplum docs | ||
- Added parallel Greenplum loading for extractor data | ||
- A few misc bugfixes | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
target/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
# NLP Extractor | ||
|
||
This directory provides an NLP extractor that is a wrapper for | ||
Stanford NLP. Its input is textual data in TSV format, and its output | ||
is one or more TSV files that contain processed sentence data that | ||
can be directly loaded into PostgreSQL / Greenplum tables. | ||
|
||
In (1) we show how to use this NLP extractor as an "extractor" in | ||
DeepDive, and in (2) we provide a script to run it as a stand-alone | ||
application in parallel, without DeepDive --- which is the recommended | ||
way if you have lots of data, since it can achieve more | ||
parallelism. | ||
|
||
## Compile | ||
|
||
Compile the NLP extractor using the following command in this directory: | ||
|
||
sbt stage | ||
|
||
## (1) run.sh and Integration with DeepDive | ||
|
||
### Input | ||
|
||
The input file is a TSV file of the following form. Each line is a | ||
document. | ||
|
||
The TSV file should have two columns. The first column is | ||
`document_id` (text format), a unique identifier for a document. The | ||
second column is `text`, all sentences in the document in plain text: | ||
|
||
doc1\tFull text in doc1 | ||
doc2\tFull text in doc2 | ||
... | ||
|
||
Note that the input TSV file should not have headers. | ||
|
||
### Output | ||
|
||
Output is another TSV file that contains multiple columns: | ||
|
||
1. `document_id`: The document_id from input. | ||
2. `sentence`: The raw sentence text, same as in the input | ||
3. `words`: (PSQL-formatted) array of words | ||
4. `lemma`: array of lemmatized words | ||
5. `pos_tags`: array of Part-of-speech tags | ||
6. `ner_tags`: array of Named Entity tags | ||
7. `dependencies`: array of collapsed dependencies | ||
8. `sentence_offset`: The index / offset of this sentence in document | ||
9. `sentence_id`: A unique identifier to the sentence | ||
|
||
You can create a table like this to be able to import the output TSV | ||
file to the database. (Note that this is the `output_relation` in | ||
DeepDive) | ||
|
||
CREATE TABLE sentences( | ||
document_id bigint, | ||
sentence text, | ||
words text[], | ||
lemma text[], | ||
pos_tags text[], | ||
dependencies text[], | ||
ner_tags text[], | ||
sentence_offset bigint, | ||
sentence_id text | ||
); | ||
|
||
## (2) run_parallel.sh: Stand-alone Parallel NLP Extractor | ||
|
||
When `run.sh` is used with DeepDive, sometimes ideal parallelism | ||
cannot be achieved because of memory problems. In this case, we | ||
recommend using `run_parallel.sh`. It performs the following steps: | ||
|
||
1. Splits your input file `INPUT_FILE` into chunks in `INPUT_FILE.split/` | ||
2. Uses the system parallelism tool `xargs` to run `run.sh` in parallel. | ||
The outputs are saved to `INPUT_FILE.split/*.out`. | ||
|
||
Run it with the following command: | ||
|
||
./run_parallel.sh INPUT_FILE PARALLELISM [INPUT_BATCH_SIZE=100] [SENTENCE_WORDS_LIMIT=120] | ||
|
||
- `INPUT_FILE`: your input TSV file | ||
- `PARALLELISM`: a number indicating desired parallelism. e.g.: 8 | ||
- `INPUT_BATCH_SIZE`: how many lines are in each file after split. | ||
Default 100. | ||
- `SENTENCE_WORDS_LIMIT`: Do not run dependency parsing if number of | ||
words in sentence is larger than this number. This helps in speeding | ||
up the parsing. | ||
|
||
When finished, you should manually import the files in | ||
`INPUT_FILE.split/*.out` into your database. You can use a COPY query | ||
like this: | ||
|
||
cat INPUT_FILE.split/*.out | psql YOUR_DB_NAME -c "COPY sentences FROM STDIN" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
import com.typesafe.sbt.SbtStartScript | ||
|
||
name := "deepdive-nlp-parser" | ||
|
||
version := "0.1" | ||
|
||
scalaVersion := "2.10.3" | ||
|
||
// Resolve the Typesafe releases repository over HTTPS so artifacts are not fetched across an unencrypted connection. | ||
resolvers += "Typesafe Repository" at "https://repo.typesafe.com/typesafe/releases/" | ||
|
||
// Library dependencies. `%%` lets sbt append the Scala binary version to the artifact name; | ||
// `%` uses the artifact name exactly as written. | ||
libraryDependencies ++= List( | ||
"ch.qos.logback" % "logback-classic" % "1.0.7", | ||
"com.typesafe.play" %% "play-json" % "2.2.1", | ||
"com.github.scopt" %% "scopt" % "3.2.0", | ||
"edu.stanford.nlp" % "stanford-corenlp" % "3.3.1", | ||
// Same CoreNLP artifact and version, requested with the "models" classifier. | ||
"edu.stanford.nlp" % "stanford-corenlp" % "3.3.1" classifier "models", | ||
// Scoped to the "test" configuration; the Scala version suffix is spelled out by hand here. | ||
"org.scalatest" % "scalatest_2.10" % "2.0.RC2" % "test" | ||
) | ||
|
||
parallelExecution in Test := false | ||
|
||
seq(SbtStartScript.startScriptForClassesSettings: _*) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
// Registers the sbt-start-script plugin (version 0.10.0) for this build. | ||
// NOTE(review): presumably the source of the SbtStartScript settings used by the build definition - confirm. | ||
addSbtPlugin("com.typesafe.sbt" % "sbt-start-script" % "0.10.0") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#! /usr/bin/env bash | ||
|
||
# export SBT_OPTS="-Xmx1g" | ||
|
||
# Run the start script under target/ in the directory containing this file, forwarding every argument. | ||
# "$0" and "$@" are quoted so paths and arguments containing whitespace are passed through intact. | ||
"$(dirname "$0")/target/start" "$@" |
Oops, something went wrong.