-
Notifications
You must be signed in to change notification settings - Fork 7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Initial commit of annotation service. #7
base: master
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Annotation Server | ||
|
||
## TODO: add descriptions of architecture, development, deployment, etc. | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
import sys | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please make sure you add the header doc strings before, especially the Apache licence related lines (and a short description for each module). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
import requests | ||
from pprint import pprint | ||
from tqdm import tqdm, trange | ||
|
||
class Tests(object):
    """Manual smoke tests for the annotation server.

    Each ``exampleN`` method sends one kind of request to ``self.url`` and
    returns the ``requests`` response. ``run`` dispatches to an example by its
    suffix and prints the decoded JSON body.
    """

    def __init__(self, env):
        """Select which server the examples talk to.

        Args:
          env: 'd' for the local development server, 'p' for the deployed app.

        Raises:
          ValueError: If `env` is neither 'd' nor 'p'.
        """
        if env == 'd':
            self.url = 'http://127.0.0.1:8080/'
        elif env == 'p':
            self.url = 'https://gcp-variant-transforms-test.appspot.com/'
        else:
            raise ValueError('env must be `d` (dev.) or `p` (prod.).')

    @staticmethod
    def _read_lines(path):
        """Return all lines of the text file at `path` (newlines kept)."""
        with open(path) as vcf_file:
            return vcf_file.readlines()

    def example1(self):
        """No payload"""
        return requests.get(self.url)

    def example2(self):
        """A single simple variant"""
        variant = self._read_lines('GRCh38_single.vcf')[-1]
        return requests.post(self.url, data={'variants': variant})

    def example2_1(self):
        """A single simple variant in the URL query string"""
        variant = self._read_lines('GRCh38_single.vcf')[-1]
        # `params=` puts the variant in the query string, not the body.
        return requests.post(self.url, params={'variants': variant})

    def example3(self):
        """A single already annotated variant"""
        variant = self._read_lines('gnomad_vep3.vcf')[-1]
        return requests.post(self.url, data={'variants': variant})

    def example4(self):
        """Multiple already annotated variants"""
        variants = ''.join(self._read_lines('gnomad_vep3.vcf')[-3:])
        return requests.post(self.url, data={'variants': variants})

    def example5(self):
        """Timing many variants in a for loop

        Sends one request per non-header line and returns only the response
        to the last request.

        Raises:
          ValueError: If the file contains no variant lines.
        """
        path = 'valid-4.1-large.vcf'
        variants = [line for line in self._read_lines(path)
                    if not line.startswith('#')]
        if not variants:
            raise ValueError('No variant lines found in {}.'.format(path))
        response = None
        for variant in tqdm(variants):
            response = requests.post(self.url, params={'variants': variant})
        # A requests response has no `update` method, so the note is printed
        # instead of being attached to the response.
        print('notes:\nShowing last response only.\n')
        return response

    def example6(self):
        """Run whatever is currently going on in the `test` route."""
        return requests.get(self.url + 'test')

    def example7(self):
        """10s of Mbs of data in the request body"""
        # Larger alternative input:
        # '/usr/local/google/home/jessime/data/gnomad_genomes_chrX_head30M.vcf'
        path = '/usr/local/google/home/jessime/data/gnomad_genomes_GRCh37_chrX_head2500.vcf'
        with open(path) as vcf_file:
            vcf = vcf_file.read()
        size_mb = len(vcf.encode('utf-8')) / float(1024 ** 2)
        print('~ request size (MB):  {}'.format(size_mb))
        return requests.post(self.url, data={'variants': vcf})

    def example8(self):
        """Get headers"""
        return requests.get(self.url + 'headers')

    def run(self, n):
        """Run example `n` and print its JSON response.

        Args:
          n: Suffix of the example method as a string, e.g. '2' or '2_1'.

        Raises:
          AttributeError: If there is no method named 'example' + n.
          ValueError: If the response body is not valid JSON; the message is
            the raw response text.
        """
        response = getattr(self, 'example' + n)()
        try:
            data = response.json()
        except ValueError:
            raise ValueError(response.text)
        if isinstance(data, dict):
            for key, value in data.items():
                print('{}:\n{}\n'.format(key, value))
        else:
            print(data)
|
||
if __name__ == '__main__': | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Please add a usage text and print it when there are not two arguments. |
||
# Usage: <script> <env> <n>
#   env: 'd' or 'p', passed to Tests() to choose the server URL.
#   n:   example suffix passed to Tests.run(), e.g. '2' or '2_1'.
# There is no argument-count check here: fewer than two arguments raises IndexError.
tests = Tests(sys.argv[1]) | ||
tests.run(sys.argv[2]) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
import sys | ||
import uuid | ||
import requests | ||
import apache_beam as beam | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this file belong to variant-annotation repo? I would have expected a pure client code here, i.e., There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was thinking of this file as my small integration test for the server, but I agree that it probably makes more sense to move to gcp-variant-transforms. (I'll leave it in for now, but feel free to move it in the future.) |
||
|
||
class KeyGenFn(beam.DoFn):
    """Pair each element with a random key that changes every `chunk` elements.

    A downstream GroupByKey can then batch consecutive elements into groups
    of at most `chunk` elements.
    """

    def __init__(self, chunk=1000):
        """Args:
          chunk: Number of consecutive elements that share one key.
        """
        super(KeyGenFn, self).__init__()
        self.chunk = chunk
        self.key = uuid.uuid4()
        self.counter = 0

    def start_bundle(self):
        """Start every bundle with a fresh key and an empty chunk.

        The DoFn instance is serialized when the pipeline is built, so a key
        created only in `__init__` would be shared by every worker and their
        first chunks would all be grouped together.
        """
        self.key = uuid.uuid4()
        self.counter = 0

    def process(self, element):
        """Yield `(key, element)`, rotating the key once `chunk` elements used it."""
        # Check before counting so that the first chunk also holds `chunk`
        # elements rather than `chunk - 1`.
        if self.counter == self.chunk:
            self.counter = 0
            self.key = uuid.uuid4()
        self.counter += 1
        yield self.key, element
|
||
|
||
class Requester(beam.DoFn):
    """POST a chunk of VCF lines to the annotation server and yield its output."""

    def __init__(self, url='https://gcp-variant-transforms-test.appspot.com/'):
        """Args:
          url: Endpoint that receives the POST request.
        """
        super(Requester, self).__init__()
        self.url = url

    def process(self, vcf_chunk):
        """Send `vcf_chunk` as the `variants` form field and yield `stdout`.

        Args:
          vcf_chunk: Newline-separated VCF lines.

        Yields:
          The `stdout` field of the server's JSON response.

        Raises:
          ValueError: If the response is not JSON, lacks the `stdout` or
            `stderr` field, or reports a non-empty `stderr`. The message is
            the raw response text.
        """
        response = requests.post(self.url, data={'variants': vcf_chunk})
        try:
            data = response.json()
            if data['stderr']:
                raise ValueError(data['stderr'])
            result = data['stdout']
        except (KeyError, ValueError):
            # TODO(jessime): This should be more robust. We should look at the
            # text and decide what to do based on it. For example: if we get
            # the 'wait 30s' error, we should wait and try again. If the
            # request size was too large, we can subdivide the string and send
            # multiple smaller requests. If it's a VEP error, then we can
            # actually abort. Or something like this.
            raise ValueError(response.text)
        yield result
|
||
|
||
def remove_header_lines(vcf):
    """Return the non-header, non-empty lines of a VCF text.

    Args:
      vcf: VCF content as a single string.

    Returns:
      A list of lines (without line endings) that do not start with '#'.
      Empty lines are dropped; indexing `line[0]` on them would raise
      IndexError.
    """
    return [line for line in vcf.splitlines()
            if line and not line.startswith('#')]
|
||
|
||
def join_lines(kv):
    """Merge the values of a `(key, lines)` pair into one newline-separated string.

    The key (first item) is ignored.
    """
    lines = kv[1]
    joined = '\n'.join(lines)
    return joined
|
||
|
||
def run(infile, outfile, env): | ||
# Run the Beam pipeline: read lines from `infile`, key them with KeyGenFn,
# group by key, join each group into one string, POST it with Requester,
# drop '#' header lines from each result and write the rest to `outfile`.
# `env` must be 'd' (empty pipeline options) or 'p' (Dataflow options below);
# any other value raises ValueError.
if env == 'd': | ||
options = {} | ||
elif env == 'p': | ||
options = { | ||
'runner': 'DataflowRunner', | ||
'num_workers': 50, | ||
'max_num_workers': 100, | ||
'project': 'gcp-variant-transforms-test', | ||
'staging_location': 'gs://jessime_test_bucket/staging', | ||
'temp_location': 'gs://jessime_test_bucket/temp', | ||
'job_name': 'vep-as-a-service8', | ||
'setup_file': './setup.py', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here's a specific reason why it makes more sense to keep this in gcp-variant-transforms. This setup.py file refers to Variant Transforms main setup.py file. |
||
'save_main_session': True} | ||
else: | ||
raise ValueError('env must be `d` (dev.) or `p` (prod.).') | ||
|
||
# The plain dict is expanded into keyword arguments for PipelineOptions.
options = beam.options.pipeline_options.PipelineOptions(**options) | ||
with beam.Pipeline(options=options) as p: | ||
results = (p | beam.io.ReadFromText(infile) | ||
| beam.ParDo(KeyGenFn()) | ||
| beam.GroupByKey() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should avoid this GroupByKey; maybe it is easier to remove this module from this repo for now and discuss this issue in the other PR in |
||
| beam.Map(join_lines) | ||
| beam.ParDo(Requester()) | ||
| beam.FlatMap(remove_header_lines)) | ||
results | beam.io.WriteToText(outfile) | ||
|
||
if __name__ == '__main__':
    # Print a usage message instead of failing with a bare IndexError when
    # arguments are missing. Extra arguments are still ignored.
    if len(sys.argv) < 4:
        sys.exit('Usage: python {} <infile> <outfile> <env: d|p>'.format(
            sys.argv[0]))
    run(sys.argv[1], sys.argv[2], sys.argv[3])
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
env | ||
*.pyc | ||
__pycache__ | ||
.dockerignore | ||
Dockerfile | ||
.git | ||
.hg | ||
.svn |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# [START dockerfile] | ||
FROM gcr.io/google_appengine/python | ||
|
||
# Install system packages (build tools, git, Perl modules, HTSlib/tabix) from the Debian repositories before installing VEP. | ||
RUN apt-get update && apt-get install -y \ | ||
build-essential \ | ||
git \ | ||
libarchive-zip-perl \ | ||
libdbd-mysql-perl \ | ||
libdbi-perl \ | ||
libfile-copy-recursive-perl \ | ||
libhts1 \ | ||
libjson-perl \ | ||
libmodule-build-perl \ | ||
tabix \ | ||
unzip \ | ||
zlib1g-dev | ||
|
||
#Install VEP per the instructions at: | ||
#http://www.ensembl.org/info/docs/tools/vep/script/vep_download.html#installer | ||
ARG ENSEMBL_RELEASE=91 | ||
RUN git clone https://github.com/Ensembl/ensembl-vep.git | ||
WORKDIR ensembl-vep | ||
RUN git checkout release/${ENSEMBL_RELEASE} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When you add header lines to this file (for licence, general desc. etc.) please also add some usage examples and what the value of build arguments are expected to be. |
||
RUN perl INSTALL.pl \ | ||
--AUTO a \ | ||
--NO_UPDATE | ||
WORKDIR .. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Have you considered reusing the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Absolutely. I initially started using both the
I eventually gave up trying to modify the existing files (going "top-down", if you will), and decided to go "bottom-up" by building off of GCP minimalist examples. My intention was to continue building up functionality, using the original files as endpoints and guides. |
||
|
||
ARG CACHE=/mnt/vep/vep_cache/ | ||
RUN mkdir -p ${CACHE} | ||
RUN pip install gsutil | ||
|
||
# Create the virtualenv with Python 2.7; change the -p argument to use a different interpreter. | ||
RUN virtualenv /env -p python2.7 | ||
|
||
# Set environment variables for server | ||
ENV ANNOTATION_SERVER_ENV production | ||
ENV VEP_DIR /mnt/vep/vep_cache | ||
ENV VEP_SCRIPT ensembl-vep/vep | ||
ENV ASSEMBLY GRCh38 | ||
|
||
# Set virtualenv environment variables. This is equivalent to running `source /env/bin/activate`. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. running ...? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added a couple of deleted lines back. |
||
ENV VIRTUAL_ENV /env | ||
ENV PATH /env/bin:$PATH | ||
|
||
ADD requirements.txt /app/ | ||
RUN pip install -r requirements.txt | ||
ADD . /app | ||
CMD gunicorn -b :$PORT -t 600 main:app |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
runtime: custom | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When you add header texts, please also add some links to relevant app-engine documentation describing these parameters. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
env: flex | ||
resources: | ||
memory_gb: 4 | ||
disk_size_gb: 30 | ||
automatic_scaling: | ||
min_num_instances: 500 | ||
max_num_instances: 500 | ||
cool_down_period_sec: 180 | ||
cpu_utilization: | ||
target_utilization: 0.95 | ||
readiness_check: | ||
app_start_timeout_sec: 2400 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For now, please add a note that this is a prototype that is not productionized yet but has been shown to scale well with Variant Transforms and add a link to your other PR: googlegenomics/gcp-variant-transforms#361
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done. I've also added a few lines about how to launch the server.