Skip to content

Commit

Permalink
continue cleaning up breakpoint creation + interval methods
Browse files Browse the repository at this point in the history
  • Loading branch information
suhas-r committed Dec 24, 2024
1 parent e09fe14 commit dd50e4e
Show file tree
Hide file tree
Showing 13 changed files with 1,940 additions and 691 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -236,3 +236,11 @@ chr7 55610095 56049369 + 1 True 82.346163
chr7 54763282 56049369 + 2 False 2.843655
```


## FAQs
- `call_cnvs.sh` didn't produce segmented CN calls in a .cns file?
- `cnvkit.py batch` contains multiple steps detailed in their
[documentation](https://cnvkit.readthedocs.io/en/stable/pipeline.html). The
errors from a particular stage don't always percolate up when running the
complete pipeline via `batch`, so try running each stage separately to
pinpoint the root cause.
4 changes: 3 additions & 1 deletion coral/breakpoint/breakpoint_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@
class BreakpointGraph:
"""A container object for the breakpoint graphs."""

amplicon_intervals: list[datatypes.Interval] = field(default_factory=list)
amplicon_intervals: list[datatypes.AmpliconInterval] = field(
default_factory=list
)
sequence_edges: list[list[Any]] = field(default_factory=list)
concordant_edges: list[list[Any]] = field(default_factory=list)
discordant_edges: list[list[Any]] = field(default_factory=list)
Expand Down
43 changes: 24 additions & 19 deletions coral/breakpoint/breakpoint_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,57 +3,62 @@
from __future__ import annotations

import io
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List

import intervaltree
import numpy as np

from coral.types import CnsInterval
from coral.datatypes import CNSInterval, CNSIntervalTree


@dataclass
class CNSSegData:
"""Container class for mapping CN (copy number) per chromosome segment, as parsed from a .cns or .bed file."""

tree: intervaltree.IntervalTree # IntervalTree mapping chromosome segments to their (per-chromosome) index in input file
intervals: List[
CnsInterval
] # Raw list of all chromosome intervals given, of the form (chromosome, start_idx, end_idx)
intervals_by_chr: Dict[
str, List[List[int]]
] # List of chromosome intervals with their respective CNs, grouped by chromosome number
tree: Dict[
str, CNSIntervalTree
] # IntervalTree mapping chromosome segments to their (per-chromosome) index in input file
# Raw list of all chromosome intervals given, of the form
# (chromosome, start_idx, end_idx)
intervals: List[CNSInterval]
# List of chromosome intervals with their respective CNs,
# grouped by chromosome number
intervals_by_chr: Dict[str, List[CNSInterval]]
log2_cn: List[float] # Log2 of CN for each chromosome segment

@classmethod
def from_file(cls, file: io.TextIOWrapper) -> CNSSegData:
"""Parse CN interval data from a .cns or .bed file."""
tree, log2_cns = {}, []
intervals = []
intervals_by_chr: Dict[str, List[List[Any]]] = {}
intervals_by_chr: Dict[str, List[CNSInterval]] = defaultdict(list)
is_cns = file.name.endswith(".cns")

idx = 0
for line in file:
fields = line.strip().split()
if not fields or fields[0] == "chromosome" or line.startswith("#"): # Skip potential header row
if (
not fields or fields[0] == "chromosome" or line.startswith("#")
): # Skip potential header row
continue
chr_tag, start, end = fields[0], int(fields[1]), int(fields[2])
intervals.append([chr_tag, start, end - 1])
if chr_tag not in tree:
tree[chr_tag] = intervaltree.IntervalTree()
intervals_by_chr[chr_tag] = []
idx = 0
tree[chr_tag][start:end] = idx
idx += 1

# Calc log2 CN for .cns
log2_cn = (
float(fields[4]) if is_cns else np.log2(float(fields[3]) / 2.0)
)
raw_cn = 2 * (2**log2_cn) if is_cns else float(fields[3])

log2_cns.append(log2_cn)
intervals_by_chr[chr_tag].append([chr_tag, start, end - 1, raw_cn])
cns_interval = CNSInterval(chr_tag, start, end - 1, raw_cn)

intervals.append(cns_interval)
intervals_by_chr[chr_tag].append(cns_interval)
if chr_tag not in tree:
tree[chr_tag] = CNSIntervalTree()
idx = 0
tree[chr_tag][start:end] = idx
idx += 1

return cls(tree, intervals, intervals_by_chr, log2_cns)
Loading

0 comments on commit dd50e4e

Please sign in to comment.