-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathsample_srr
executable file
·133 lines (96 loc) · 4.29 KB
/
sample_srr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
#############################################################################
# Copyright 2020 Simon Andrews
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
############################################################################
import urllib.request
import zlib
from ftplib import FTP
import sys
import argparse
def main():
# SRR12478073 is a good test
options = read_options()
url = get_url(options.accession)
sample_url(url,options.skip,options.collect)
def sample_url(url,skip,sample):
url_sections = url.split("/")
server = url_sections[0]
file = url_sections[-1]
folder = "/".join(url_sections[1:-1])
decompressor = collect_gzip_data(skip,sample)
#print(f"URL is {url} server={server} folder={folder} file={file}")
#print(f"Connecting to {server}")
ftp = FTP(server)
#print(f"Logging in")
ftp.login()
#print(f"Moving to {folder}")
ftp.cwd(folder)
ftp.retrbinary("RETR "+file,decompressor, blocksize=124)
ftp.quit()
def collect_gzip_data(skip,collect):
decompressor = zlib.decompressobj(zlib.MAX_WBITS|16)
newline_count = 0
def accept_data(chunk):
nonlocal decompressor
nonlocal newline_count
new_data = decompressor.decompress(chunk).decode('utf-8')
lines = new_data.split("\n")
# Python ignores sigpipe errors and will generate an exception
# if STDOUT is piped to a program such as head which closes the
# pipe before all data is written. To fix this we need to catch
# the BrokenPipeException and then just exit gracefully.
try:
for i in range(len(lines)):
if newline_count >= 4*skip:
print(lines[i],end='')
if i < len(lines)-1:
print("\n",end='')
if i < len(lines)-1:
newline_count += 1
if newline_count >= 4*(skip+collect):
sys.exit()
except BrokenPipeError:
sys.exit()
return(accept_data)
def get_url(accession):
# Get the FTP locations for the sample
ena_rest_url = f"https://www.ebi.ac.uk/ena/portal/api/filereport?accession={accession}&result=read_run&fields=run_accession,fastq_ftp"
with urllib.request.urlopen(ena_rest_url) as response:
if response.getcode() != 200:
raise IOError(f"[ENA] Got error code {response.getcode()} from ENA REST for accession {sample['accession']}")
url = ""
found_accession_line = False
for line in response:
line = line.decode("UTF-8").strip()
#print(line)
if line.startswith("SRR"):
found_accession_line = True
sections = line.split("\t")
if len(sections) != 2:
raise IOError(f"[ENA] No ENA fastq files found for accession {sample['accession']}")
url = sections[1].split(";")[0] # Just take the first read if there are multiple available
return(url)
raise IOError(f"[ENA] Found no accession in response from ENA REST for accession {sample['accession']}")
def read_options():
parser = argparse.ArgumentParser(description="Sample data from an SRR accession")
parser.add_argument('--skip', type=int, help="Number of reads to skip at the start of the file (default 100,000)", default=100000)
parser.add_argument('--collect', type=int, help="Number of reads to report to STDOUT (default 100,000)", default=100000)
parser.add_argument('accession', type=str, help="The SRR accession to sample")
options = parser.parse_args()
return options
if __name__ == "__main__":
main()