
Commit

Merge pull request #11 from erinyoung/update-2024-01-16
Update 2024 01 16
erinyoung authored Jan 17, 2024
2 parents 4568ea3 + 35b070f commit b62ee87
Showing 10 changed files with 203 additions and 159 deletions.
180 changes: 83 additions & 97 deletions aci/aci.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
# pylint: disable=logging-fstring-interpolation

'''
Author: Erin Young
@@ -13,151 +14,136 @@

# I tried to keep dependencies down...
import argparse
import concurrent.futures
import itertools
import logging
import os
import sys
import tempfile
import pandas as pd

from aci.utils.amplicon_depth import amplicon_depth # pylint: disable=E0401
from aci.utils.column_names import column_names # pylint: disable=E0401
from aci.utils.get_regions import get_regions # pylint: disable=E0401
from aci.utils.genome_depth import genome_depth # pylint: disable=E0401
from aci.utils.plotting_amplicons import plotting_amplicons # pylint: disable=E0401
from aci.utils.plotting_depth import plotting_depth # pylint: disable=E0401
from aci.utils.prep import prep # pylint: disable=E0401
from aci.utils.get_coverage import get_coverage # pylint: disable=E0401
from aci.utils.amplicon_splitting import amplicon_splitting
from aci.utils.genome_depth import genome_depth
from aci.utils.plotting_amplicons import plotting_amplicons
from aci.utils.plotting_depth import plotting_depth
from aci.utils.prep import prep

# about 30 seconds per artic V3 primer on SRR13957125
# $ samtools coverage SRR13957125.sorted.bam
# #rname startpos endpos numreads covbases coverage meandepth meanbaseq meanmapq
# MN908947.3 1 29903 1141595 29827 99.7458 5350.27 37.3 60
# 15000 - 16500

def main(): # pylint: disable=R0914,R0915
def main():
""" Use pysam to get depth for amplicon region and general coverage """

##### ----- ----- ----- ----- ----- #####
##### Part 0. Setup #####
##### ----- ----- ----- ----- ----- #####

version = '1.0.20231229'
version = '1.4.20240116'

parser = argparse.ArgumentParser()
parser.add_argument('-b', '--bam', nargs = '+', required = True, type = str, help = '(required) input bam file(s)') # pylint: disable=C0301
parser.add_argument('-d', '--bed', required = True, type = str, help ='(required) amplicon bedfile') # pylint: disable=C0301
parser.add_argument('-o', '--out', required = False, type = str, help = 'directory for results', default = 'aci') # pylint: disable=C0301
parser.add_argument('-log', '--loglevel', required = False, type = str, help = 'logging level', default = 'INFO') # pylint: disable=C0301
parser.add_argument('-t', '--threads', required = False, type = int, help = 'specifies number of threads to use', default=4) # pylint: disable=C0301
parser.add_argument('-v', '--version', help='print version and exit', action = 'version', version = version) # pylint: disable=C0301
parser.add_argument('-b', '--bam',
nargs = '+',
required = True,
type = str,
help = '(required) input bam file(s)')
parser.add_argument('-d', '--bed',
required = True,
type = str,
help = '(required) amplicon bedfile')
parser.add_argument('-o', '--out',
required = False,
type = str,
help = 'directory for results',
default = 'aci')
parser.add_argument('-log', '--loglevel',
required = False,
type = str,
help = 'logging level',
default = 'INFO')
parser.add_argument('-t', '--threads',
required = False,
type = int,
help = 'specifies number of threads to use',
default=4)
parser.add_argument('-v', '--version',
help='print version and exit',
action = 'version',
version = version)
args = parser.parse_args()

logging.basicConfig(format='%(asctime)s - %(message)s',
datefmt = '%y-%b-%d %H:%M:%S',
level=args.loglevel.upper())

if not os.path.exists(args.bed):
logging.critical('bedfile ' + args.bed + ' does not exist. Exiting') # pylint: disable=W1201
logging.critical(f"bedfile {args.bed} does not exist. Exiting")
sys.exit(2)

if not os.path.exists(args.out):
os.mkdir(args.out)

logging.info('ACI version :\t\t' + str(version)) # pylint: disable=W1201
logging.info('Number of threads :\t' + str(args.threads)) # pylint: disable=W1201
logging.info('Final directory :\t\t' + str(args.out)) # pylint: disable=W1201
logging.info('Input bed file :\t\t' + str(args.bed)) # pylint: disable=W1201
logging.info('Input bam file(s) :\t' + ', '.join(args.bam)) # pylint: disable=W1201

bed = args.bed
out = args.out
threads = args.threads
temp_dir = tempfile.TemporaryDirectory(dir = args.out) # pylint: disable=R1732

meta = {}
filenames = []
for bam in args.bam:
meta[bam] = {}
meta[bam]['initial_bam'] = bam
meta[bam]['out'] = out
meta[bam]['tmp'] = temp_dir.name + '/'
meta[bam]['file_name'] = os.path.basename(bam)
meta[bam]['sorted_bam'] = meta[bam]['tmp'] + os.path.basename(bam)
meta[bam]['sorted_bai'] = meta[bam]['sorted_bam'] + '.bai'
filenames.append(meta[bam]['file_name'])

logging.info('Sorting and indexing ' + meta[bam]['file_name']) # pylint: disable=W1201
prep(meta[bam]['initial_bam'], meta[bam]['sorted_bam'], threads)
logging.info('Finished sorting and indexing')

logging.debug('the filenames for all the bam files are')
logging.debug(filenames)
logging.info(f"ACI version :\t\t{str(version)}")
logging.info(f"Number of threads :\t{str(args.threads)}")
logging.info(f"Final directory :\t\t{str(args.out)}")
logging.info(f"Input bed file :\t\t{str(args.bed)}")
logging.info(f"Input bam file(s) :\t{', '.join(args.bam)}")

##### ----- ----- ----- ----- ----- #####
##### Part 1. Amplicon depths #####
##### ----- ----- ----- ----- ----- #####

# getting regions for parallel processing
regions = get_regions(bed)

logging.info('Getting depth for amplicons')
logging.debug('List for parallel processing:')
logging.debug(list(itertools.product(args.bam, regions)))
with concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
for bam, subregion in list(itertools.product(args.bam, regions)):
results = [executor.submit(amplicon_depth, meta[bam], subregion)]
# keeping the line below for testing
# results = amplicon_depth(df, meta[bam], subregion)

for f in concurrent.futures.as_completed(results):
logging.debug(f.result())
with tempfile.TemporaryDirectory(dir = args.out) as temp_dir:
meta = {}
filenames = []
for bam in args.bam:
meta[bam] = {}
meta[bam]['initial_bam'] = bam
meta[bam]['out'] = args.out
meta['tmp'] = temp_dir + '/'
meta[bam]['tmp'] = temp_dir + '/'
meta[bam]['file_name'] = os.path.basename(bam)
meta[bam]['sorted_bam'] = meta[bam]['tmp'] + os.path.basename(bam)
meta[bam]['sorted_bai'] = meta[bam]['sorted_bam'] + '.bai'
filenames.append(meta[bam]['file_name'])

# setting up the dataframe
columns = column_names(bed)
df = pd.DataFrame(columns= ['bam'] + columns)
df['bam'] = filenames
logging.debug('Initial empty dataframe:')
logging.debug(df)
logging.info(f"Sorting and indexing {meta[bam]['file_name']}")
prep(meta[bam]['initial_bam'], meta[bam]['sorted_bam'], args.threads)
logging.info('Finished sorting and indexing')
meta['filenames'] = filenames

# NOTE : Had to be done outside of concurrent
for bam in args.bam:
bamindex = df.index[df['bam'] == meta[bam]['file_name']]
for subregion in regions:
cov = get_coverage(meta[bam], subregion)
df.loc[bamindex, [subregion.split(':')[3]]] = cov
logging.debug('the filenames for all the bam files are')
logging.debug(filenames)

logging.debug('The final dataframe is:')
logging.debug(df)
##### ----- ----- ----- ----- ----- #####
##### Part 1. Amplicon depths #####
##### ----- ----- ----- ----- ----- #####

plotting_amplicons(df, out)
df = amplicon_splitting(meta, args)

logging.info('Depth for amplicons is saved in ' + args.out + '/amplicon_depth.csv') # pylint: disable=W1201
logging.info('An boxplot of these depths is at ' + args.out + '/amplicon_depth.png') # pylint: disable=W1201
plotting_amplicons(df, args.out)

# ##### ----- ----- ----- ----- ----- #####
# ##### Part 2. Genome/bam depths #####
# ##### ----- ----- ----- ----- ----- #####
logging.info(f"Amplicon depth is saved in {args.out}/amplicon_depth.csv")
logging.info(f"A boxplot of these depths is at {args.out}/amplicon_depth.png")

df_pysam = pd.DataFrame([])
# ##### ----- ----- ----- ----- ----- #####
# ##### Part 2. Genome/bam depths #####
# ##### ----- ----- ----- ----- ----- #####

# TODO : Fix this so that it's concurrent friendly # pylint: disable=W0511
df_pysam = pd.DataFrame([])

for bam in args.bam:
df_pysam_results = genome_depth(meta[bam])
df_pysam = pd.concat([df_pysam, df_pysam_results], ignore_index=True)
# NOTE : Attempted with concurrent and this was just as fast

logging.debug('The final pysam dataframe is:')
logging.debug(df_pysam)
for bam in args.bam:
df_pysam_results = genome_depth(meta[bam])
df_pysam = pd.concat([df_pysam, df_pysam_results], ignore_index=True)

plotting_depth(df_pysam, out)
plotting_depth(df_pysam, args.out)

logging.info('Depth for the genome from the bam file is saved in ' + out + '/genome_depth.csv') # pylint: disable=W1201
logging.info('An boxplot of these depths is at ' + out + '/genome_depth.png') # pylint: disable=W1201
logging.info(f"Genome depth is saved in {args.out}/genome_depth.csv")
logging.info(f"A boxplot of these depths is at {args.out}/genome_depth.png")

logging.info('ACI is complete! (I hope all your primers are behaving as expected!)')
# ##### ----- ----- ----- ----- ----- #####
# ##### Fin #####
# ##### ----- ----- ----- ----- ----- #####

logging.info('ACI is complete! (I hope all your primers are behaving as expected!)')

if __name__ == "__main__":
main()
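
Note: one structural change in aci.py above is that the scratch files now live inside a with tempfile.TemporaryDirectory(...) block instead of a manually created TemporaryDirectory object, so the sorted BAMs and per-amplicon intermediates are cleaned up automatically when the block exits. A minimal sketch of that behavior (not part of this commit; the file name is a placeholder):

import os
import tempfile

# the directory and everything inside it are deleted automatically when the
# with-block exits, even if an exception is raised partway through
with tempfile.TemporaryDirectory() as temp_dir:
    scratch = os.path.join(temp_dir, 'example.sorted.bam')
    open(scratch, 'w').close()
    print(os.path.exists(scratch))   # True while the block is active

print(os.path.exists(temp_dir))      # False: cleanup happened on exit
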
52 changes: 28 additions & 24 deletions aci/utils/amplicon_depth.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
# pylint: disable=logging-fstring-interpolation

""" Use pysam to get depth for amplicon region """

@@ -8,48 +9,51 @@
from .subregion import subregion
from .within import within
from .without import without
from .get_coverage import get_coverage

def amplicon_depth(meta, region):
""" Use pysam to get depth for amplicon region """

name = region.split(':')[3]
# getting subregion information and creating bedfile
subrange, name = subregion(region)

# pylint was complaining about the number of variables
# so they're all dict values instead
meta['after_reduction_bam'] = meta['tmp'] + name + '.1.' + meta['file_name']
meta['removing_outside_matches'] = meta['tmp'] + name + '.2.' + meta['file_name']
meta['junk_bam'] = meta['tmp'] + name + '.3.' + meta['file_name']
meta['final_bam'] = meta['tmp'] + name + '.4.' + meta['file_name']
meta['subregion_bed'] = meta['tmp'] + name + '.' + meta['file_name'] + '.bed' # pylint: disable=C0301
after_reduction_bam = meta['tmp'] + name + '.step1.' + meta['file_name']
removing_outside_matches = meta['tmp'] + name + '.step2.' + meta['file_name']
junk_bam = meta['tmp'] + name + '.step3.' + meta['file_name']
final_bam = meta['tmp'] + name + '.step4.' + meta['file_name']
logging.debug('The filenames are going to be :')
logging.debug(meta)

# getting subregion information and creating bedfile
subrange = subregion(region, meta['subregion_bed'])
# setting the default value
cov = 0.0

# reduce bam file to something smaller
if os.path.exists(meta['sorted_bam']):
logging.debug('Step 1. reducing bam for speed for ' + meta['sorted_bam']) # pylint: disable=W1201
within(meta['sorted_bam'], meta['after_reduction_bam'], subrange)
logging.debug(f"Step 1. reducing bam for speed for {meta['sorted_bam']}")
within(meta['sorted_bam'], after_reduction_bam, subrange)

if os.path.exists(meta['after_reduction_bam']):
pysam.index(meta['after_reduction_bam'])
if os.path.exists(after_reduction_bam):
pysam.index(after_reduction_bam)

# remove all reads that fall outside of region of interest
# warning : this is the slow part of the script
logging.debug('Step 2. reducing bam for speed for ' + meta['sorted_bam']) # pylint: disable=W1201
without(meta['after_reduction_bam'], meta['removing_outside_matches'], meta['junk_bam'], meta['subregion_bed']) # pylint: disable=C0301
logging.debug(f"Step 2. reducing bam for speed for {meta['sorted_bam']}")
without(after_reduction_bam,
removing_outside_matches,
junk_bam,
bed = meta['tmp'] + name + '.bed')

if os.path.exists(meta['removing_outside_matches']):
pysam.index(meta['removing_outside_matches'])
if os.path.exists(removing_outside_matches):
pysam.index(removing_outside_matches)

# get only reads that are within subrange
logging.debug('Step 3. final reduction for ' + meta['sorted_bam']) # pylint: disable=W1201
within(meta['removing_outside_matches'], meta['final_bam'], subrange)
logging.debug(f"Step 3. final reduction for {meta['sorted_bam']}")
within(removing_outside_matches, final_bam, subrange)

if os.path.exists(meta['final_bam']):
pysam.index(meta['final_bam'])
if os.path.exists(final_bam):
pysam.index(final_bam)
cov = get_coverage(final_bam, subrange)

logging.info('Amplicon bam file created for ' + meta['file_name'] + ' over ' + subrange) # pylint: disable=W1201
logging.info(f"Amplicon bam file created for {meta['file_name']} over {subrange}")

return [meta['file_name'], meta['initial_bam'], name]
return [meta['file_name'], name, cov]
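
Note: amplicon_depth previously stored its intermediate file paths in the shared per-bam meta dictionary; because several regions of the same BAM are processed concurrently in a thread pool, local variables (plus returning the coverage value) keep each task's paths independent. That motivation is inferred from the diff rather than stated in the commit; a minimal standard-library sketch of the difference, with placeholder names:

import concurrent.futures

def worker_shared(meta, region):
    # shown only for contrast: every task writes the same key,
    # so concurrently running tasks can overwrite each other
    meta['final_bam'] = region + '.bam'
    return meta['final_bam']

def worker_local(meta, region):
    # a local name is private to this task; the result travels back via return
    final_bam = region + '.bam'
    return (meta['file_name'], region, final_bam)

meta = {'file_name': 'sample.bam'}
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(worker_local, meta, region) for region in ('1', '2', '3')]
    for future in concurrent.futures.as_completed(futures):
        print(future.result())
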
47 changes: 47 additions & 0 deletions aci/utils/amplicon_splitting.py
@@ -0,0 +1,47 @@
#!/usr/bin/env python
# pylint: disable=logging-fstring-interpolation

""" Split processes by region in bedfile for concurrent """

import logging
import concurrent.futures
import pandas as pd

from .amplicon_depth import amplicon_depth
from .get_regions import get_regions
from .column_names import column_names

def amplicon_splitting(meta, args):
""" Split processes by region in bedfile for concurrent """

# getting region for parallel processing
regions = get_regions(meta, args.bed)
#regions = ['MN908947.3:54:385:1', 'MN908947.3:342:704:2', 'MN908947.3:664:1004:3', 'MN908947.3:965:1312:4', 'MN908947.3:1264:1623:5', 'MN908947.3:1595:1942:6' ]

logging.info('Getting depth for amplicons')
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
tasks = []
for bam in args.bam:
for subregion in regions:
future = executor.submit(amplicon_depth, meta[bam], subregion)
tasks.append(future)

completed_tasks, _ = concurrent.futures.wait(tasks,return_when=concurrent.futures.ALL_COMPLETED) # pylint: disable=C0301
results = [task.result() for task in completed_tasks]

# setting up the dataframe
columns = column_names(args.bed)
df = pd.DataFrame(columns= ['bam'] + columns)
df['bam'] = meta['filenames']
logging.debug('Initial empty dataframe:')
logging.debug(df)

# NOTE : Had to be done outside of concurrent
for bam, name, cov in results :
bamindex = df.index[df['bam'] == bam]
df.loc[bamindex, name] = cov

df = df.fillna(0)

return df
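
Note: the new module combines two standard patterns, submitting one future per bam/region pair, waiting for all of them, then folding the (bam, amplicon, coverage) tuples into a wide dataframe. A standalone sketch of the same pattern with a stub worker (the stub function, file names, and coverage value are placeholders, not the real amplicon_depth):

import concurrent.futures
import pandas as pd

def stub_amplicon_depth(bam_name, region):
    # stand-in for amplicon_depth(): returns (bam file name, amplicon name, coverage)
    return (bam_name, region, 100.0)

bams = ['sample1.bam', 'sample2.bam']
regions = ['1', '2', '3']

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    tasks = [executor.submit(stub_amplicon_depth, bam, region)
             for bam in bams for region in regions]
    done, _ = concurrent.futures.wait(tasks, return_when=concurrent.futures.ALL_COMPLETED)
    results = [task.result() for task in done]

# assemble the wide dataframe the same way amplicon_splitting does
df = pd.DataFrame(columns=['bam'] + regions)
df['bam'] = bams
for bam, name, cov in results:
    df.loc[df['bam'] == bam, name] = cov
print(df.fillna(0))
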
4 changes: 2 additions & 2 deletions aci/utils/genome_depth.py
@@ -10,7 +10,7 @@ def genome_depth(meta):
""" Takes a bam file and gets coverage """

# getting depth results from pysam
df = pd.DataFrame([x.split('\t') for x in pysam.depth(meta['sorted_bam']).split('\n')])
df = pd.DataFrame([x.split('\t') for x in pysam.depth(meta['sorted_bam']).split('\n')]) # pylint: disable=E1101
df.columns = ['ref', 'pos', 'cov']

# ensuring that the type is correct
@@ -19,7 +19,7 @@
df['bam'] = meta['file_name']
df = df.dropna()

logging.debug('pysam results for ' + meta['sorted_bam']) # pylint: disable=W1201
logging.debug(f"pysam results for {meta['sorted_bam']}") # pylint: disable=W1203
logging.debug(df)

return df
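
Note: pysam.depth() returns the same text that samtools depth prints, one tab-separated reference/position/depth row per covered position, which is why genome_depth splits on newlines and tabs before typing the columns. A standalone sketch of that parsing step, using a hard-coded string in place of a real BAM file (values are illustrative only):

import pandas as pd

# example of the samtools-depth style text that pysam.depth() is expected to return
depth_text = 'MN908947.3\t1\t10\nMN908947.3\t2\t12\nMN908947.3\t3\t11\n'

df = pd.DataFrame([row.split('\t') for row in depth_text.split('\n') if row],
                  columns=['ref', 'pos', 'cov'])
df['pos'] = df['pos'].astype(int)
df['cov'] = df['cov'].astype(int)
print(df)
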