googlegenomics · Jessime · Sep 13, 2018 · Sep 21, 2018 · bashir2 · Sep 21, 2018
diff --git a/service/README.md b/service/README.md
@@ -0,0 +1,39 @@
+# Annotation Server
+
+## TODO: add descriptions of architecture, development, deployment, etc.
+
+## WARNING
+
+This server is a prototype and is not yet production-ready.
+In initial testing, however, it has scaled well with Variant Transforms.
+The client-side code is also still in the prototype phase and can be found here:
+
+[Variant Transforms PR #361](https://github.com/googlegenomics/gcp-variant-transforms/pull/361)
+
+## Getting Started
+
+The server can be deployed to Google App Engine (GAE) or run locally on localhost. To deploy to GAE, run:
+
+```
+$ cd ~/variant-annotation/service/server/
+$ gcloud app deploy --project gcp-variant-transforms-test
+```
+
+You can verify that you’re in the correct directory for launching to GAE by ensuring you are in the same folder as the app.yaml file. Launching is a slow process and can easily take 10-20 minutes.
+
+To speed development cycles, you can also run locally. Before launching, add the appropriate environment variables. For example, the following lines can be added to ~/.bashrc:
+
+```
+# Environment variables for the VEP server
+export ANNOTATION_SERVER_ENV="local"
+export VEP_DIR="${HOME}/.vep/"
+export VEP_SCRIPT="${HOME}/Code/ensembl-vep/vep"
+export ASSEMBLY="GRCh38"
+```
+
+Once these variables are defined, just run:
+
+```
+$ cd ~/variant-annotation/service/server/
+$ python main.py
+```
diff --git a/service/client_tests.py b/service/client_tests.py
@@ -0,0 +1,124 @@
+# Copyright 2018 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test requests for the annotation server.
+
+Run by:
+    python client_tests.py [p|d] [1-8|2_1]
+
+Example:
+    # This will test if the development server is running properly.
+    python client_tests.py d 1
+"""
+
+#TODO(jessime) All of the vcf file paths are stale. Need to update data files.
+
+import sys
+import requests
+from pprint import pprint
+from tqdm import tqdm, trange
+
+class Tests(object):
+
+    def __init__(self, env):
+        if env == 'd':
+            self.url = 'http://127.0.0.1:8080/'
+        elif env == 'p':
+            self.url = 'https://gcp-variant-transforms-test.appspot.com/'
+        else:
+            raise ValueError('env must be `d` (dev.) or `p` (prod.).')
+
+    def example1(self):
+        """No payload"""
+        return requests.get(self.url)
+
+    def example2(self):
+        """A single simple variant"""
+        vcf = 'GRCh38_single.vcf'
+        with open(vcf) as vcf:
+            vcf = vcf.readlines()
+        variant = vcf[-1]
+        response = requests.post(self.url, data={'variants': variant})
+        return response
+
+    def example2_1(self):
+        """A single simple variant in the request header"""
+        vcf = 'GRCh38_single.vcf'
+        with open(vcf) as vcf:
+            vcf = vcf.readlines()
+        variant = vcf[-1]
+        response = requests.post(self.url, params={'variants': variant})
+        return response
+
+    def example3(self):
+        """A single already annotated variant"""
+        vcf = 'gnomad_vep3.vcf'
+        with open(vcf) as vcf:
+            vcf = vcf.readlines()
+        variant = vcf[-1]
+        response = requests.post(self.url, data={'variants': variant})
+        return response
+
+    def example4(self):
+        """Multiple already annotated variants"""
+        vcf = 'gnomad_vep3.vcf'
+        with open(vcf) as vcf:
+            vcf = vcf.readlines()
+        variants = ''.join(vcf[-3:])
+        #print variants
+        response = requests.post(self.url, data={'variants': variants})
+        return response
+
+    def example5(self):
+        """Timing many variants in a for loop"""
+        vcf = 'valid-4.1-large.vcf'
+        with open(vcf) as vcf:
+            vcf = vcf.readlines()
+        variants = [l for l in vcf if l[0] != '#']
+        for v in tqdm(variants):
+            response = requests.post(self.url, params={'variants': v})
+        return response.update({'notes': 'Showing last response only.'})
+
+    def example6(self):
+        """Run whatever is currently going on in the `test` route."""
+        return requests.get(self.url + 'test')
+
+    def example7(self):
+        """10s of Mbs of data in the request body"""
+        vcf = '/usr/local/google/home/jessime/data/gnomad_genomes_chrX_head30M.vcf'
+        vcf = '/usr/local/google/home/jessime/data/gnomad_genomes_GRCh37_chrX_head2500.vcf'
+        with open(vcf) as vcf:
+            vcf = vcf.read()
+        print '~ request size (MB): ', len(vcf.encode('utf-8'))/float(1024**2)
+        return requests.post(self.url, data={'variants': vcf})
+
+    def example8(self):
+        """Get headers"""
+        return requests.get(self.url + 'headers')
+
+    def run(self, n):
+        response = getattr(self, 'example' + n)()
+        try:
+            data = response.json()
+            if isinstance(data, dict):
+                for key, value in data.iteritems():
+                    print '{}:\n{}\n'.format(key, value)
+            else:
+                print data
+        except ValueError:
+            raise ValueError(response.text)
+
+if __name__ == '__main__':
+    tests = Tests(sys.argv[1])
+    tests.run(sys.argv[2])
diff --git a/service/example_client.py b/service/example_client.py
@@ -0,0 +1,107 @@
+# Copyright 2018 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A small end-to-end client that uses apache_beam, but not Variant Transforms.
+
+This prototype client accepts a path to a VCF file, the path to the newly
+annotated VCF file, and either `p` or `d` to specify the `production` or
+`development` environment.
+
+Run by:
+    python example_client.py path/to/infile.vcf path/to/outfile.vcf [p|d]
+"""
+
+import sys
+import uuid
+import requests
+import apache_beam as beam
+
+class KeyGenFn(beam.DoFn):
+
+    def __init__(self, chunk=1000):
+        self.chunk = chunk
+        self.key = uuid.uuid4()
+        self.counter = 0
+
+    def process(self, element):
+        self.counter += 1
+        if self.counter == self.chunk:
+            self.counter = 0
+            self.key = uuid.uuid4()
+        yield self.key, element
+
+
+class Requester(beam.DoFn):
+
+    def __init__(self, url='https://gcp-variant-transforms-test.appspot.com/'):
+        self.url = url
+
+    def process(self, vcf_chunk):
+        start = vcf_chunk.count('\n')
+        response = requests.post(self.url, data={'variants': vcf_chunk})
+        try:
+            data = response.json()
+            if data['stderr']:
+                raise ValueError(data['stderr'])
+            result = data['stdout']
+            end = result.count('\n')
+            #print 'This chunk started with {} lines and ended with {}.'.format(start, end)
+        except ValueError:
+            #TODO (jessime) This should be more robust. We should look at the
+            #text and decide what todo based on the text. For example:
+            #if we get the 'wait 30s error, we should do that and try again.
+            #If it the request size was too large, we can subdivide the string
+            #and send multiple smaller requests. If it's a VEP error, then we
+            #can actually abort. Or something like this.
+            raise ValueError(response.text)
+        yield result
+
+
+def remove_header_lines(vcf):
+    return [line for line in vcf.splitlines() if line[0] != '#']
+
+
+def join_lines(kv):
+    return '\n'.join(kv[1])
+
+
+def run(infile, outfile, env):
+    if env == 'd':
+        options = {}
+    elif env == 'p':
+        options = {
+            'runner': 'DataflowRunner',
+            'num_workers': 50,
+            'max_num_workers': 100,
+            'project': 'gcp-variant-transforms-test',
+            'staging_location': 'gs://jessime_test_bucket/staging',
+            'temp_location': 'gs://jessime_test_bucket/temp',
+            'job_name': 'vep-as-a-service8',
+            'setup_file': './setup.py',
+            'save_main_session': True}
+    else:
+        raise ValueError('env must be `d` (dev.) or `p` (prod.).')
+
+    options = beam.options.pipeline_options.PipelineOptions(**options)
+    with beam.Pipeline(options=options) as p:
+        results = (p | beam.io.ReadFromText(infile)
+                     | beam.ParDo(KeyGenFn())
+                     | beam.GroupByKey()
+                     | beam.Map(join_lines)
+                     | beam.ParDo(Requester())
+                     | beam.FlatMap(remove_header_lines))
+        results | beam.io.WriteToText(outfile)
+
+if __name__ == '__main__':
+    run(sys.argv[1], sys.argv[2], sys.argv[3])
diff --git a/service/server/.dockerignore b/service/server/.dockerignore
@@ -0,0 +1,8 @@
+env
+*.pyc
+__pycache__
+.dockerignore
+Dockerfile
+.git
+.hg
+.svn
diff --git a/service/server/Dockerfile b/service/server/Dockerfile
@@ -0,0 +1,66 @@
+# Copyright 2018 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START dockerfile]
+FROM gcr.io/google_appengine/python
+
+# Install the system packages needed to build and run VEP from the Debian repositories.
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    git \
+    libarchive-zip-perl \
+    libdbd-mysql-perl \
+    libdbi-perl \
+    libfile-copy-recursive-perl \
+    libhts1 \
+    libjson-perl \
+    libmodule-build-perl \
+    tabix \
+    unzip \
+    zlib1g-dev
+
+#Install VEP per the instructions at:
+#http://www.ensembl.org/info/docs/tools/vep/script/vep_download.html#installer
+ARG ENSEMBL_RELEASE=91
+RUN git clone https://github.com/Ensembl/ensembl-vep.git
+WORKDIR ensembl-vep
+RUN git checkout release/${ENSEMBL_RELEASE}
+RUN perl INSTALL.pl \
+    --AUTO a \
+    --NO_UPDATE
+WORKDIR ..
+
+ ARG CACHE=/mnt/vep/vep_cache/
+ RUN mkdir -p ${CACHE}
+ RUN pip install gsutil
+
+# Create the virtualenv with Python 2.7; change the -p argument to use a different interpreter.
+RUN virtualenv /env -p python2.7
+
+# Set environment variables for server
+ENV ANNOTATION_SERVER_ENV production
+ENV VEP_DIR /mnt/vep/vep_cache
+ENV VEP_SCRIPT ensembl-vep/vep
+ENV ASSEMBLY GRCh38
+
+# Set virtualenv environment variables. This is equivalent to running
+# source /env/bin/activate. This ensures the application is executed within
+# the context of the virtualenv and will have access to its dependencies.
+ENV VIRTUAL_ENV /env
+ENV PATH /env/bin:$PATH
+
+ADD requirements.txt /app/
+RUN pip install -r requirements.txt
+ADD . /app
+CMD gunicorn -b :$PORT -t 600 main:app
diff --git a/service/server/app.yaml b/service/server/app.yaml
@@ -0,0 +1,31 @@
+# Copyright 2018 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Configure your Google App Engine environment. See here for documentation:
+# https://cloud.google.com/appengine/docs/flexible/python/configuring-your-app-with-app-yaml
+
+runtime: custom
+env: flex
+resources:
+  memory_gb: 4
+  disk_size_gb: 30
+automatic_scaling:
+  min_num_instances: 500
+  max_num_instances: 500
+  cool_down_period_sec: 180
+  cpu_utilization:
+    target_utilization: 0.95
+readiness_check:
+  app_start_timeout_sec: 2400