Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛🔧 Fix/update genomic file std and target concept configuration #314

Merged
merged 5 commits into from
May 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions kf_lib_data_ingest/common/concept_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@


# Mixin that declares file-related attribute names, each initialised to None.
# It appears as a base class of GENOMIC_FILE further down in this listing.
# NOTE(review): this text comes from a rendered diff (hunk -5,11 +5,10) and
# appears to interleave removed and added lines — confirm against the final
# file which of the attributes below actually remain.
class FileMixin(object):
ETAG = None
SIZE = None
DATA_TYPE = None
FILE_NAME = None
FILE_PATH = None
HASH_DICT = None
URL_LIST = None


class PropertyMixin(object):
Expand Down Expand Up @@ -99,6 +98,9 @@ class GENOMIC_FILE(PropertyMixin, FileMixin):
AVAILABILITY = None
HARMONIZED = None
CAVATICA_OUTPUT_FILE = None
REFERENCE_GENOME = None
FILE_FORMAT = None
DATA_TYPE = None

class READ_GROUP(PropertyMixin):
PAIRED_END = None
Expand Down
28 changes: 28 additions & 0 deletions kf_lib_data_ingest/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,15 @@ class COMMON:
MULTIPLE = "Multiple"


class FILE:
    """Namespace for constants that describe files."""

    class HASH:
        """Display labels for the supported hash / checksum kinds.

        NOTE(review): presumably these label the entries of a file's hash
        dictionary — confirm against the code that consumes them.
        """

        MD5 = 'MD5'
        SHA1 = 'SHA-1'
        SHA256 = 'SHA-256'
        SHA512 = 'SHA-512'
        S3_ETAG = 'S3 ETag'


class SPECIMEN:
class COMPOSITION:
XENOGRAFT = 'Xenograft Tissue'
Expand All @@ -41,6 +50,25 @@ class AVAILABILITY:
IMMEDIATE = 'Immediate Download'
COLD_STORAGE = 'In cold storage'

class FORMAT:
    """Lower-case labels for file formats.

    NOTE(review): the hunk header suggests this class sits next to
    AVAILABILITY inside an enclosing class that is not visible here —
    confirm the intended nesting level when applying.
    """

    FASTQ = 'fastq'
    BAM = 'bam'
    CRAM = 'cram'
    BAI = 'bai'
    CRAI = 'crai'
    GVCF = 'gvcf'
    TBI = 'tbi'
    VCF = 'vcf'

class DATA_TYPE:
    """Human-readable labels for the kind of data a file holds.

    NOTE(review): the hunk header suggests this class sits next to
    AVAILABILITY inside an enclosing class that is not visible here —
    confirm the intended nesting level when applying.
    """

    UNALIGNED_READS = 'Unaligned Reads'
    ALIGNED_READS = 'Aligned Reads'
    ALIGNED_READS_INDEX = 'Aligned Reads Index'
    GVCF = 'gVCF'
    GVCF_INDEX = 'gVCF Index'
    VARIANT_CALLS = 'Variant Calls'
    VARIANT_CALLS_INDEX = 'Variant Calls Index'


class SEQUENCING:
class REFERENCE_GENOME:
Expand Down
20 changes: 8 additions & 12 deletions kf_lib_data_ingest/target_apis/kids_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,23 +154,19 @@
},
'genomic_file': {
'standard_concept': CONCEPT.GENOMIC_FILE,
'links': {
'sequencing_experiment_id': CONCEPT.SEQUENCING.UNIQUE_KEY
},
'properties': {
"external_id": CONCEPT.GENOMIC_FILE.UNIQUE_KEY,
"file_name": CONCEPT.GENOMIC_FILE.FILE_NAME,
"file_format": None,
"data_type": None,
"availability": None,
"file_format": CONCEPT.GENOMIC_FILE.FILE_FORMAT,
"data_type": CONCEPT.GENOMIC_FILE.DATA_TYPE,
"availability": CONCEPT.GENOMIC_FILE.AVAILABILITY,
"controlled_access": None,
"is_harmonized": CONCEPT.GENOMIC_FILE.HARMONIZED,
"paired_end": CONCEPT.READ_GROUP.PAIRED_END,
"hashes": CONCEPT.GENOMIC_FILE.UNIQUE_KEY,
"size": CONCEPT.GENOMIC_FILE.UNIQUE_KEY,
"urls": CONCEPT.GENOMIC_FILE.UNIQUE_KEY,
"acl": CONCEPT.GENOMIC_FILE.UNIQUE_KEY,
"reference_genome": CONCEPT.GENOMIC_FILE.UNIQUE_KEY,
"hashes": CONCEPT.GENOMIC_FILE.HASH_DICT,
"size": CONCEPT.GENOMIC_FILE.SIZE,
"urls": CONCEPT.GENOMIC_FILE.URL_LIST,
"acl": None,
"reference_genome": CONCEPT.GENOMIC_FILE.REFERENCE_GENOME,
'visible': CONCEPT.GENOMIC_FILE.VISIBLE
},
'endpoint': '/genomic-files'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
),
keep_map(
in_col='file_path',
out_col=CONCEPT.GENOMIC_FILE.FILE_PATH
out_col=CONCEPT.GENOMIC_FILE.URL_LIST
),
keep_map(
in_col='file_path',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,11 @@
out_col=CONCEPT.GENOMIC_FILE.FILE_NAME
),
row_map(
m=lambda row: row['storage_dir'] + '/' + row['bam_file_name'],
out_col=CONCEPT.GENOMIC_FILE.FILE_PATH
m=lambda row: [row['storage_dir'] + '/' + row['bam_file_name']],
out_col=CONCEPT.GENOMIC_FILE.URL_LIST
),
row_map(
m=lambda row: row['storage_dir'] + '/' + row['cram_file_name'],
out_col=CONCEPT.GENOMIC_FILE.FILE_PATH
m=lambda row: [row['storage_dir'] + '/' + row['cram_file_name']],
out_col=CONCEPT.GENOMIC_FILE.URL_LIST
)
]
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@
),
value_map(
in_col='s3_path',
m=lambda x: 's3://' + x,
out_col=CONCEPT.GENOMIC_FILE.FILE_PATH
m=lambda x: ['s3://' + x],
out_col=CONCEPT.GENOMIC_FILE.URL_LIST
),
value_map(
in_col='DNA/RNA',
Expand Down
242 changes: 121 additions & 121 deletions tests/data/test_study/extract_outputs/simple_tsv_example2_output.tsv
Original file line number Diff line number Diff line change
@@ -1,121 +1,121 @@
index BIOSPECIMEN|ALIQUOT_ID GENOMIC_FILE|FILE_PATH GENOMIC_FILE|FILE_NAME BIOSPECIMEN|ID
0 4 FLDR0/2.bam 2.bam 4
1 5 FLDR0/3.bam 3.bam 5
2 6 FLDR0/4.bam 4.bam 6
3 7 FLDR1/5.bam 5.bam 7
4 8 FLDR1/6.bam 6.bam 8
5 9 FLDR1/7.bam 7.bam 9
6 10 FLDR1/8.bam 8.bam 10
7 11 FLDR1/9.bam 9.bam 11
8 12 FLDR2/10.bam 10.bam 12
9 13 FLDR2/11.bam 11.bam 13
10 14 FLDR2/12.bam 12.bam 14
11 15 FLDR2/13.bam 13.bam 15
12 16 FLDR2/14.bam 14.bam 16
13 17 FLDR3/15.bam 15.bam 17
14 18 FLDR3/16.bam 16.bam 18
15 19 FLDR3/17.bam 17.bam 19
16 20 FLDR3/18.bam 18.bam 20
17 21 FLDR3/19.bam 19.bam 21
18 22 FLDR4/20.bam 20.bam 22
19 23 FLDR4/21.bam 21.bam 23
20 24 FLDR4/22.bam 22.bam 24
21 25 FLDR4/23.bam 23.bam 25
22 26 FLDR4/24.bam 24.bam 26
23 27 FLDR5/25.bam 25.bam 27
24 28 FLDR5/26.bam 26.bam 28
25 29 FLDR5/27.bam 27.bam 29
26 30 FLDR5/28.bam 28.bam 30
27 31 FLDR5/29.bam 29.bam 31
28 32 FLDR6/30.bam 30.bam 32
29 33 FLDR6/31.bam 31.bam 33
30 34 FLDR6/32.bam 32.bam 34
31 35 FLDR6/33.bam 33.bam 35
32 36 FLDR6/34.bam 34.bam 36
33 37 FLDR7/35.bam 35.bam 37
34 38 FLDR7/36.bam 36.bam 38
35 39 FLDR7/37.bam 37.bam 39
36 40 FLDR7/38.bam 38.bam 40
37 41 FLDR7/39.bam 39.bam 41
38 42 FLDR8/40.bam 40.bam 42
39 43 FLDR8/41.bam 41.bam 43
40 44 FLDR8/42.bam 42.bam 44
41 45 FLDR8/43.bam 43.bam 45
42 46 FLDR8/44.bam 44.bam 46
43 47 FLDR9/45.bam 45.bam 47
44 48 FLDR9/46.bam 46.bam 48
45 49 FLDR9/47.bam 47.bam 49
46 50 FLDR9/48.bam 48.bam 50
47 51 FLDR9/49.bam 49.bam 51
48 52 FLDR10/50.bam 50.bam 52
49 53 FLDR10/51.bam 51.bam 53
50 54 FLDR10/52.bam 52.bam 54
51 55 FLDR10/53.bam 53.bam 55
52 56 FLDR10/54.bam 54.bam 56
53 57 FLDR11/55.bam 55.bam 57
54 58 FLDR11/56.bam 56.bam 58
55 59 FLDR11/57.bam 57.bam 59
56 60 FLDR11/58.bam 58.bam 60
57 61 FLDR11/59.bam 59.bam 61
58 62 FLDR12/60.bam 60.bam 62
59 63 FLDR12/61.bam 61.bam 63
0 4 FLDR0/2.cram 2.cram 4
1 5 FLDR0/3.cram 3.cram 5
2 6 FLDR0/4.cram 4.cram 6
3 7 FLDR1/5.cram 5.cram 7
4 8 FLDR1/6.cram 6.cram 8
5 9 FLDR1/7.cram 7.cram 9
6 10 FLDR1/8.cram 8.cram 10
7 11 FLDR1/9.cram 9.cram 11
8 12 FLDR2/10.cram 10.cram 12
9 13 FLDR2/11.cram 11.cram 13
10 14 FLDR2/12.cram 12.cram 14
11 15 FLDR2/13.cram 13.cram 15
12 16 FLDR2/14.cram 14.cram 16
13 17 FLDR3/15.cram 15.cram 17
14 18 FLDR3/16.cram 16.cram 18
15 19 FLDR3/17.cram 17.cram 19
16 20 FLDR3/18.cram 18.cram 20
17 21 FLDR3/19.cram 19.cram 21
18 22 FLDR4/20.cram 20.cram 22
19 23 FLDR4/21.cram 21.cram 23
20 24 FLDR4/22.cram 22.cram 24
21 25 FLDR4/23.cram 23.cram 25
22 26 FLDR4/24.cram 24.cram 26
23 27 FLDR5/25.cram 25.cram 27
24 28 FLDR5/26.cram 26.cram 28
25 29 FLDR5/27.cram 27.cram 29
26 30 FLDR5/28.cram 28.cram 30
27 31 FLDR5/29.cram 29.cram 31
28 32 FLDR6/30.cram 30.cram 32
29 33 FLDR6/31.cram 31.cram 33
30 34 FLDR6/32.cram 32.cram 34
31 35 FLDR6/33.cram 33.cram 35
32 36 FLDR6/34.cram 34.cram 36
33 37 FLDR7/35.cram 35.cram 37
34 38 FLDR7/36.cram 36.cram 38
35 39 FLDR7/37.cram 37.cram 39
36 40 FLDR7/38.cram 38.cram 40
37 41 FLDR7/39.cram 39.cram 41
38 42 FLDR8/40.cram 40.cram 42
39 43 FLDR8/41.cram 41.cram 43
40 44 FLDR8/42.cram 42.cram 44
41 45 FLDR8/43.cram 43.cram 45
42 46 FLDR8/44.cram 44.cram 46
43 47 FLDR9/45.cram 45.cram 47
44 48 FLDR9/46.cram 46.cram 48
45 49 FLDR9/47.cram 47.cram 49
46 50 FLDR9/48.cram 48.cram 50
47 51 FLDR9/49.cram 49.cram 51
48 52 FLDR10/50.cram 50.cram 52
49 53 FLDR10/51.cram 51.cram 53
50 54 FLDR10/52.cram 52.cram 54
51 55 FLDR10/53.cram 53.cram 55
52 56 FLDR10/54.cram 54.cram 56
53 57 FLDR11/55.cram 55.cram 57
54 58 FLDR11/56.cram 56.cram 58
55 59 FLDR11/57.cram 57.cram 59
56 60 FLDR11/58.cram 58.cram 60
57 61 FLDR11/59.cram 59.cram 61
58 62 FLDR12/60.cram 60.cram 62
59 63 FLDR12/61.cram 61.cram 63
index BIOSPECIMEN|ALIQUOT_ID GENOMIC_FILE|URL_LIST GENOMIC_FILE|FILE_NAME BIOSPECIMEN|ID
0 4 ['FLDR0/2.bam'] 2.bam 4
1 5 ['FLDR0/3.bam'] 3.bam 5
2 6 ['FLDR0/4.bam'] 4.bam 6
3 7 ['FLDR1/5.bam'] 5.bam 7
4 8 ['FLDR1/6.bam'] 6.bam 8
5 9 ['FLDR1/7.bam'] 7.bam 9
6 10 ['FLDR1/8.bam'] 8.bam 10
7 11 ['FLDR1/9.bam'] 9.bam 11
8 12 ['FLDR2/10.bam'] 10.bam 12
9 13 ['FLDR2/11.bam'] 11.bam 13
10 14 ['FLDR2/12.bam'] 12.bam 14
11 15 ['FLDR2/13.bam'] 13.bam 15
12 16 ['FLDR2/14.bam'] 14.bam 16
13 17 ['FLDR3/15.bam'] 15.bam 17
14 18 ['FLDR3/16.bam'] 16.bam 18
15 19 ['FLDR3/17.bam'] 17.bam 19
16 20 ['FLDR3/18.bam'] 18.bam 20
17 21 ['FLDR3/19.bam'] 19.bam 21
18 22 ['FLDR4/20.bam'] 20.bam 22
19 23 ['FLDR4/21.bam'] 21.bam 23
20 24 ['FLDR4/22.bam'] 22.bam 24
21 25 ['FLDR4/23.bam'] 23.bam 25
22 26 ['FLDR4/24.bam'] 24.bam 26
23 27 ['FLDR5/25.bam'] 25.bam 27
24 28 ['FLDR5/26.bam'] 26.bam 28
25 29 ['FLDR5/27.bam'] 27.bam 29
26 30 ['FLDR5/28.bam'] 28.bam 30
27 31 ['FLDR5/29.bam'] 29.bam 31
28 32 ['FLDR6/30.bam'] 30.bam 32
29 33 ['FLDR6/31.bam'] 31.bam 33
30 34 ['FLDR6/32.bam'] 32.bam 34
31 35 ['FLDR6/33.bam'] 33.bam 35
32 36 ['FLDR6/34.bam'] 34.bam 36
33 37 ['FLDR7/35.bam'] 35.bam 37
34 38 ['FLDR7/36.bam'] 36.bam 38
35 39 ['FLDR7/37.bam'] 37.bam 39
36 40 ['FLDR7/38.bam'] 38.bam 40
37 41 ['FLDR7/39.bam'] 39.bam 41
38 42 ['FLDR8/40.bam'] 40.bam 42
39 43 ['FLDR8/41.bam'] 41.bam 43
40 44 ['FLDR8/42.bam'] 42.bam 44
41 45 ['FLDR8/43.bam'] 43.bam 45
42 46 ['FLDR8/44.bam'] 44.bam 46
43 47 ['FLDR9/45.bam'] 45.bam 47
44 48 ['FLDR9/46.bam'] 46.bam 48
45 49 ['FLDR9/47.bam'] 47.bam 49
46 50 ['FLDR9/48.bam'] 48.bam 50
47 51 ['FLDR9/49.bam'] 49.bam 51
48 52 ['FLDR10/50.bam'] 50.bam 52
49 53 ['FLDR10/51.bam'] 51.bam 53
50 54 ['FLDR10/52.bam'] 52.bam 54
51 55 ['FLDR10/53.bam'] 53.bam 55
52 56 ['FLDR10/54.bam'] 54.bam 56
53 57 ['FLDR11/55.bam'] 55.bam 57
54 58 ['FLDR11/56.bam'] 56.bam 58
55 59 ['FLDR11/57.bam'] 57.bam 59
56 60 ['FLDR11/58.bam'] 58.bam 60
57 61 ['FLDR11/59.bam'] 59.bam 61
58 62 ['FLDR12/60.bam'] 60.bam 62
59 63 ['FLDR12/61.bam'] 61.bam 63
0 4 ['FLDR0/2.cram'] 2.cram 4
1 5 ['FLDR0/3.cram'] 3.cram 5
2 6 ['FLDR0/4.cram'] 4.cram 6
3 7 ['FLDR1/5.cram'] 5.cram 7
4 8 ['FLDR1/6.cram'] 6.cram 8
5 9 ['FLDR1/7.cram'] 7.cram 9
6 10 ['FLDR1/8.cram'] 8.cram 10
7 11 ['FLDR1/9.cram'] 9.cram 11
8 12 ['FLDR2/10.cram'] 10.cram 12
9 13 ['FLDR2/11.cram'] 11.cram 13
10 14 ['FLDR2/12.cram'] 12.cram 14
11 15 ['FLDR2/13.cram'] 13.cram 15
12 16 ['FLDR2/14.cram'] 14.cram 16
13 17 ['FLDR3/15.cram'] 15.cram 17
14 18 ['FLDR3/16.cram'] 16.cram 18
15 19 ['FLDR3/17.cram'] 17.cram 19
16 20 ['FLDR3/18.cram'] 18.cram 20
17 21 ['FLDR3/19.cram'] 19.cram 21
18 22 ['FLDR4/20.cram'] 20.cram 22
19 23 ['FLDR4/21.cram'] 21.cram 23
20 24 ['FLDR4/22.cram'] 22.cram 24
21 25 ['FLDR4/23.cram'] 23.cram 25
22 26 ['FLDR4/24.cram'] 24.cram 26
23 27 ['FLDR5/25.cram'] 25.cram 27
24 28 ['FLDR5/26.cram'] 26.cram 28
25 29 ['FLDR5/27.cram'] 27.cram 29
26 30 ['FLDR5/28.cram'] 28.cram 30
27 31 ['FLDR5/29.cram'] 29.cram 31
28 32 ['FLDR6/30.cram'] 30.cram 32
29 33 ['FLDR6/31.cram'] 31.cram 33
30 34 ['FLDR6/32.cram'] 32.cram 34
31 35 ['FLDR6/33.cram'] 33.cram 35
32 36 ['FLDR6/34.cram'] 34.cram 36
33 37 ['FLDR7/35.cram'] 35.cram 37
34 38 ['FLDR7/36.cram'] 36.cram 38
35 39 ['FLDR7/37.cram'] 37.cram 39
36 40 ['FLDR7/38.cram'] 38.cram 40
37 41 ['FLDR7/39.cram'] 39.cram 41
38 42 ['FLDR8/40.cram'] 40.cram 42
39 43 ['FLDR8/41.cram'] 41.cram 43
40 44 ['FLDR8/42.cram'] 42.cram 44
41 45 ['FLDR8/43.cram'] 43.cram 45
42 46 ['FLDR8/44.cram'] 44.cram 46
43 47 ['FLDR9/45.cram'] 45.cram 47
44 48 ['FLDR9/46.cram'] 46.cram 48
45 49 ['FLDR9/47.cram'] 47.cram 49
46 50 ['FLDR9/48.cram'] 48.cram 50
47 51 ['FLDR9/49.cram'] 49.cram 51
48 52 ['FLDR10/50.cram'] 50.cram 52
49 53 ['FLDR10/51.cram'] 51.cram 53
50 54 ['FLDR10/52.cram'] 52.cram 54
51 55 ['FLDR10/53.cram'] 53.cram 55
52 56 ['FLDR10/54.cram'] 54.cram 56
53 57 ['FLDR11/55.cram'] 55.cram 57
54 58 ['FLDR11/56.cram'] 56.cram 58
55 59 ['FLDR11/57.cram'] 57.cram 59
56 60 ['FLDR11/58.cram'] 58.cram 60
57 61 ['FLDR11/59.cram'] 59.cram 61
58 62 ['FLDR12/60.cram'] 60.cram 62
59 63 ['FLDR12/61.cram'] 61.cram 63
Loading