googlegenomics · Jessime · Sep 13, 2018 · Sep 21, 2018 · bashir2 · Sep 21, 2018
diff --git a/service/README.md b/service/README.md
@@ -0,0 +1,3 @@
+# Annotation Server
+
+## TODO: add descriptions of architecture, development, deployment, etc.
diff --git a/service/client_tests.py b/service/client_tests.py
@@ -0,0 +1,98 @@
+import sys
+import requests
+from pprint import pprint
+from tqdm import tqdm, trange
+
+class Tests(object):
+
+    def __init__(self, env):
+        if env == 'd':
+            self.url = 'http://127.0.0.1:8080/'
+        elif env == 'p':
+            self.url = 'https://gcp-variant-transforms-test.appspot.com/'
+        else:
+            raise ValueError('env must be `d` (dev.) or `p` (prod.).')
+
+    def example1(self):
+        """No payload"""
+        return requests.get(self.url)
+
+    def example2(self):
+        """A single simple variant"""
+        vcf = 'GRCh38_single.vcf'
+        with open(vcf) as vcf:
+            vcf = vcf.readlines()
+        variant = vcf[-1]
+        response = requests.post(self.url, data={'variants': variant})
+        return response
+
+    def example2_1(self):
+        """A single simple variant in the request header"""
+        vcf = 'GRCh38_single.vcf'
+        with open(vcf) as vcf:
+            vcf = vcf.readlines()
+        variant = vcf[-1]
+        response = requests.post(self.url, params={'variants': variant})
+        return response
+
+    def example3(self):
+        """A single already annotated variant"""
+        vcf = 'gnomad_vep3.vcf'
+        with open(vcf) as vcf:
+            vcf = vcf.readlines()
+        variant = vcf[-1]
+        response = requests.post(self.url, data={'variants': variant})
+        return response
+
+    def example4(self):
+        """Multiple already annotated variants"""
+        vcf = 'gnomad_vep3.vcf'
+        with open(vcf) as vcf:
+            vcf = vcf.readlines()
+        variants = ''.join(vcf[-3:])
+        #print variants
+        response = requests.post(self.url, data={'variants': variants})
+        return response
+
+    def example5(self):
+        """Timing many variants in a for loop"""
+        vcf = 'valid-4.1-large.vcf'
+        with open(vcf) as vcf:
+            vcf = vcf.readlines()
+        variants = [l for l in vcf if l[0] != '#']
+        for v in tqdm(variants):
+            response = requests.post(self.url, params={'variants': v})
+        return response.update({'notes': 'Showing last response only.'})
+
+    def example6(self):
+        """Run whatever is currently going on in the `test` route."""
+        return requests.get(self.url + 'test')
+
+    def example7(self):
+        """10s of Mbs of data in the request body"""
+        vcf = '/usr/local/google/home/jessime/data/gnomad_genomes_chrX_head30M.vcf'
+        vcf = '/usr/local/google/home/jessime/data/gnomad_genomes_GRCh37_chrX_head2500.vcf'
+        with open(vcf) as vcf:
+            vcf = vcf.read()
+        print '~ request size (MB): ', len(vcf.encode('utf-8'))/float(1024**2)
+        return requests.post(self.url, data={'variants': vcf})
+
+    def example8(self):
+        """Get headers"""
+        return requests.get(self.url + 'headers')
+
+    def run(self, n):
+        response = getattr(self, 'example' + n)()
+        try:
+            data = response.json()
+            if isinstance(data, dict):
+                for key, value in data.iteritems():
+                    print '{}:\n{}\n'.format(key, value)
+            else:
+                print data
+        except ValueError:
+            raise ValueError(response.text)
+
+if __name__ == '__main__':
+    tests = Tests(sys.argv[1])
+    tests.run(sys.argv[2])
diff --git a/service/example_client.py b/service/example_client.py
@@ -0,0 +1,83 @@
+import sys
+import uuid
+import requests
+import apache_beam as beam
+
+class KeyGenFn(beam.DoFn):
+
+    def __init__(self, chunk=1000):
+        self.chunk = chunk
+        self.key = uuid.uuid4()
+        self.counter = 0
+
+    def process(self, element):
+        self.counter += 1
+        if self.counter == self.chunk:
+            self.counter = 0
+            self.key = uuid.uuid4()
+        yield self.key, element
+
+
+class Requester(beam.DoFn):
+
+    def __init__(self, url='https://gcp-variant-transforms-test.appspot.com/'):
+        self.url = url
+
+    def process(self, vcf_chunk):
+        start = vcf_chunk.count('\n')
+        response = requests.post(self.url, data={'variants': vcf_chunk})
+        try:
+            data = response.json()
+            if data['stderr']:
+                raise ValueError(data['stderr'])
+            result = data['stdout']
+            end = result.count('\n')
+            #print 'This chunk started with {} lines and ended with {}.'.format(start, end)
+        except ValueError:
+            #TODO (jessime) This should be more robust. We should look at the
+            #text and decide what todo based on the text. For example:
+            #if we get the 'wait 30s error, we should do that and try again.
+            #If it the request size was too large, we can subdivide the string
+            #and send multiple smaller requests. If it's a VEP error, then we
+            #can actually abort. Or something like this.
+            raise ValueError(response.text)
+        yield result
+
+
+def remove_header_lines(vcf):
+    return [line for line in vcf.splitlines() if line[0] != '#']
+
+
+def join_lines(kv):
+    return '\n'.join(kv[1])
+
+
+def run(infile, outfile, env):
+    if env == 'd':
+        options = {}
+    elif env == 'p':
+        options = {
+            'runner': 'DataflowRunner',
+            'num_workers': 50,
+            'max_num_workers': 100,
+            'project': 'gcp-variant-transforms-test',
+            'staging_location': 'gs://jessime_test_bucket/staging',
+            'temp_location': 'gs://jessime_test_bucket/temp',
+            'job_name': 'vep-as-a-service8',
+            'setup_file': './setup.py',
+            'save_main_session': True}
+    else:
+        raise ValueError('env must be `d` (dev.) or `p` (prod.).')
+
+    options = beam.options.pipeline_options.PipelineOptions(**options)
+    with beam.Pipeline(options=options) as p:
+        results = (p | beam.io.ReadFromText(infile)
+                     | beam.ParDo(KeyGenFn())
+                     | beam.GroupByKey()
+                     | beam.Map(join_lines)
+                     | beam.ParDo(Requester())
+                     | beam.FlatMap(remove_header_lines))
+        results | beam.io.WriteToText(outfile)
+
+if __name__ == '__main__':
+    run(sys.argv[1], sys.argv[2], sys.argv[3])
diff --git a/service/server/.dockerignore b/service/server/.dockerignore
@@ -0,0 +1,8 @@
+env
+*.pyc
+__pycache__
+.dockerignore
+Dockerfile
+.git
+.hg
+.svn
diff --git a/service/server/Dockerfile b/service/server/Dockerfile
@@ -0,0 +1,50 @@
+# [START dockerfile]
+# Image for the annotation server: the App Engine Python base image with
+# Ensembl VEP checked out and installed, plus the app's Python requirements.
+FROM gcr.io/google_appengine/python
+
+# Install system packages from the Debian repositories: build tools, git,
+# Perl modules, and the HTS/tabix utilities, ahead of the VEP install below.
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    git \
+    libarchive-zip-perl \
+    libdbd-mysql-perl \
+    libdbi-perl \
+    libfile-copy-recursive-perl \
+    libhts1 \
+    libjson-perl \
+    libmodule-build-perl \
+    tabix \
+    unzip \
+    zlib1g-dev
+
+# Install VEP per the instructions at:
+# http://www.ensembl.org/info/docs/tools/vep/script/vep_download.html#installer
+# ENSEMBL_RELEASE selects the release branch that is checked out.
+ARG ENSEMBL_RELEASE=91
+RUN git clone https://github.com/Ensembl/ensembl-vep.git
+WORKDIR ensembl-vep
+RUN git checkout release/${ENSEMBL_RELEASE}
+RUN perl INSTALL.pl \
+    --AUTO a \
+    --NO_UPDATE
+WORKDIR ..
+
+# Create the cache directory (the same path as VEP_DIR below).
+# NOTE(review): gsutil is presumably used to fetch the cache into it at
+# startup; confirm against the server code.
+ ARG CACHE=/mnt/vep/vep_cache/
+ RUN mkdir -p ${CACHE}
+ RUN pip install gsutil
+
+# Create the virtualenv with Python 2.7; change the -p argument to use a
+# different interpreter.
+RUN virtualenv /env -p python2.7
+
+# Set environment variables for the server.
+ENV ANNOTATION_SERVER_ENV production
+ENV VEP_DIR /mnt/vep/vep_cache
+ENV VEP_SCRIPT ensembl-vep/vep
+ENV ASSEMBLY GRCh38
+
+# Set virtualenv environment variables. This is equivalent to running
+# `source /env/bin/activate`.
+ENV VIRTUAL_ENV /env
+ENV PATH /env/bin:$PATH
+
+# Install the Python requirements before adding the rest of the app.
+ADD requirements.txt /app/
+RUN pip install -r requirements.txt
+ADD . /app
+# Serve main:app with gunicorn on $PORT, with a 600-second worker timeout.
+CMD gunicorn -b :$PORT -t 600 main:app
diff --git a/service/server/app.yaml b/service/server/app.yaml
@@ -0,0 +1,13 @@
+# App Engine flexible environment configuration using the custom runtime.
+runtime: custom
+env: flex
+resources:
+  memory_gb: 4
+  disk_size_gb: 30
+automatic_scaling:
+  # min and max are equal, so the instance count is pinned at 500.
+  # NOTE(review): confirm this fixed size is intended outside of scale tests.
+  min_num_instances: 500
+  max_num_instances: 500
+  cool_down_period_sec: 180
+  cpu_utilization:
+    target_utilization: 0.95
+readiness_check:
+  # Allow up to 2400 seconds (40 minutes) for the app to start.
+  app_start_timeout_sec: 2400
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Annotation Server

		## TODO: add descriptions of architecture, development, deployment, etc.
Copy link Member bashir2 Sep 21, 2018 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. For now, please add a note that this is a prototype that is not productionized yet but has been shown to scale well with Variant Transforms and add a link to your other PR: googlegenomics/gcp-variant-transforms#361 Copy link Author Jessime Sep 21, 2018 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Done. I've also added a few lines about how to launch the server.