add file

f1239fd2 · adaZ-9 · fcf5dbcc · f1239fd2
Commit f1239fd2 authored Jul 25, 2023 by adaZ-9
Show whitespace changes
Inline Side-by-side

Showing with 275 additions and 0 deletions

script_splicingamount.py script_splicingamount.py +275 -0

No files found.
--- a/script_splicingamount.py
+++ b/script_splicingamount.py
+#coding=utf-8
+
+'''
+This script generates splicing amount of given circRNAs in samples based on junction provided by annotation file (gtf).
+Annotation of version gencode.v38lift37 is recommended.
+GTF files of circRNA information generated with CIRIquant and SAM (or BAM) files  generated with BWA are needed.
+
+Usage:
+python ./script_splicingamount.py /path/to/inputfile /output/dir /path/to/gtffile
+The input file contains three tab-separated columns. A header line is required.
+The first column contains the sample index, the second column contains the path to CIRIquant derived gtf file and the third column contains the path to SAM file.
+'''
+
+import os
+from subprocess import *
+import re
+import sys
+import pickle
+
+def gtf_junction_extract(fn, flank=6):
+    '''
+    extract junction (exon boundary) information from a GENCODE gtf
+
+    Every position within `flank` bases of an exon start is registered
+    under 'down' and every position within `flank` bases of an exon end
+    under 'up', together with the gene_id of each exon that put it there.
+
+    fn    : path to the annotation gtf file
+    flank : number of bases on each side of an exon boundary that are
+            still recorded for that boundary (default 6, i.e. 13 positions)
+
+    return junction_dict
+        {chrom: {'up':   {pos: {'gene_id': [gene_id, ...]}},
+                 'down': {pos: {'gene_id': [gene_id, ...]}}}}
+    Only chr1-chr22, chrX, chrY and chrM are tracked. Blank lines, comment
+    lines, non-exon records, records on any other contig and records
+    without a gene_id attribute are skipped.
+    '''
+    CHR = 0
+    GENE_PART = 2
+    EXON_START = 3
+    EXON_END = 4
+    INFO = 8
+
+    # initialize junction_dict
+    chrs = ['chr' + str(i) for i in range(1, 23)]
+    chrs.extend(['chrX', 'chrY', 'chrM'])
+    junction_dict = {}
+    for c in chrs:
+        junction_dict[c] = {'up': {}, 'down': {}}
+
+    # read gtf
+    with open(fn, 'r') as f:
+        for line in f:
+            # Lines read from a file keep their newline, so the stripped
+            # text has to be tested to recognise a blank line.
+            if not line.strip() or line.startswith('#'):
+                continue
+            content = line.strip().split('\t')
+            if len(content) <= INFO:
+                continue
+            if content[GENE_PART] != 'exon':
+                continue
+            chrom = content[CHR]
+            if chrom not in junction_dict:
+                # contig outside the tracked chromosome list
+                continue
+
+            # Exact key match, so another attribute whose name merely
+            # contains "gene_id" cannot be picked up by mistake.
+            gene_id = None
+            for item in content[INFO].split(';'):
+                key, _, value = item.strip().partition(' ')
+                if key == 'gene_id':
+                    gene_id = value.strip().strip('"')
+                    break
+            if gene_id is None:
+                continue
+
+            boundaries = (
+                ('down', int(content[EXON_START])),
+                ('up', int(content[EXON_END])),
+            )
+            for side, boundary in boundaries:
+                positions = junction_dict[chrom][side]
+                for pos in range(boundary - flank, boundary + flank + 1):
+                    entry = positions.get(pos)
+                    if entry is None:
+                        entry = positions[pos] = {'gene_id': []}
+                    if gene_id not in entry['gene_id']:
+                        entry['gene_id'].append(gene_id)
+    return junction_dict
+
+def read_ciri_result(fn):
+    '''
+    extract exonic circRNA ids and their host gene ids from a CIRIquant gtf
+
+    fn : path to the CIRIquant output gtf file
+
+    return (ciri_gene_dict, ciri_circ_dict)
+        ciri_gene_dict : {gene_id: 0}, one splicing amount counter per host
+                         gene, initialised to zero
+        ciri_circ_dict : {circ_id: [gene_id, ...]}, the sorted host gene ids
+                         of every record whose circ_type is "exon"
+    Blank lines, comment lines, records of any other circ_type and records
+    without a circ_id or gene_id attribute are skipped.
+    '''
+    ciri_gene_dict = {}
+    ciri_circ_dict = {}
+
+    # read ciri_quant result
+    with open(fn, 'r') as f:
+        for line in f:
+            # Lines read from a file keep their newline, so the stripped
+            # text has to be tested to recognise a blank line.
+            if not line.strip() or line.startswith('#'):
+                continue
+            content = line.strip().split('\t')
+
+            # Parse the attribute column into exact key -> value pairs.
+            # Splitting on ';' alone also cleans the last attribute, which
+            # would otherwise keep its trailing '";'.
+            attrs = {}
+            for item in content[-1].split(';'):
+                key, _, value = item.strip().partition(' ')
+                if key and key not in attrs:
+                    attrs[key] = value.strip().strip('"')
+
+            if attrs.get('circ_type') != 'exon':
+                continue
+            circ_id = attrs.get('circ_id')
+            gene_field = attrs.get('gene_id')
+            if not circ_id or not gene_field:
+                continue
+
+            gene_ids = sorted(gene_field.split(','))
+            for gene_id in gene_ids:
+                if gene_id not in ciri_gene_dict:
+                    ciri_gene_dict[gene_id] = 0  # geneid: splicing amount
+            ciri_circ_dict[circ_id] = gene_ids  # circid: geneid
+    return ciri_gene_dict, ciri_circ_dict
+
+def extract_juncreads_from_sam(sam_fn, ciri_result, junction_dict, outputdir):
+    '''
+    count split reads whose two alignments both end on annotated junctions
+
+    The SAM file is scanned line by line. A "first" alignment is a record
+    with MAPQ > 5 on chr1-22/X/Y/M whose CIGAR is exactly <n>M<n>[SH]
+    (pattern 'MS') or <n>[SH]<n>M (pattern 'SM') and which touches a
+    junction of a gene present in ciri_result. The very next accepted
+    record is treated as its partner if it has the mirrored CIGAR shape,
+    the same read name, chromosome and total CIGAR length, and a FLAG equal
+    to the first FLAG + 2048 (supplementary alignment). When both records
+    give linear-type evidence, or both give circ-type evidence, the counter
+    of every gene shared by the two records is increased by one.
+
+    sam_fn        : path to a text SAM file
+    ciri_result   : {gene_id: count}, updated in place
+    junction_dict : {chrom: {'up': {pos: {'gene_id': [...]}},
+                             'down': {pos: {'gene_id': [...]}}}}
+    outputdir     : accepted for interface compatibility, not used here
+
+    return None
+    '''
+    NAME_INDEX = 0
+    FLAG_INDEX = 1
+    CHROM_INDEX = 2
+    POS_INDEX = 3
+    MAPQ_INDEX = 4
+    CIGAR_INDEX = 5
+
+    # compiled once instead of on every SAM line
+    match_then_clip = re.compile(r'^\d+M\d+[SH]$')
+    clip_then_match = re.compile(r'^\d+[SH]\d+M$')
+    numbers = re.compile(r'\d+')
+
+    chrs = set('chr' + str(i) for i in range(1, 23))
+    chrs.update(['chrX', 'chrY', 'chrM'])
+
+    # state carried from a pending first alignment to the following line
+    first_align = False
+    align_pattern = None
+    align1_gene_id = []
+    linear_align1 = False
+    circ_align1 = False
+    align1_name = None
+    align1_chr = None
+    align1_length = 0
+    align1_flag = 0
+
+    with open(sam_fn, 'r') as f:
+        for line in f:
+            # Lines read from a file keep their newline, so the stripped
+            # text has to be tested to recognise a blank line.
+            if not line.strip() or line.startswith('@'):
+                continue
+            content = line.strip().split('\t')
+            if len(content) <= CIGAR_INDEX:
+                continue
+            # MAPQ screening (a low quality record also drops a pending
+            # first alignment)
+            if int(content[MAPQ_INDEX]) <= 5:
+                first_align = False
+                continue
+            CIGAR = content[CIGAR_INDEX]
+            chrom = content[CHROM_INDEX]
+            if chrom not in chrs:
+                continue
+
+            if first_align:
+                # ---- candidate partner of the pending first alignment ----
+                first_align = False
+                if align_pattern == 'MS':
+                    expected, seg_index = clip_then_match, 1
+                else:  # SM
+                    expected, seg_index = match_then_clip, 0
+                if not expected.match(CIGAR):
+                    continue
+                lengths = [int(n) for n in numbers.findall(CIGAR)]
+                # supplementary alignment
+                # align name; align chr; align length; align flag
+                if not (align1_name == content[NAME_INDEX]
+                        and align1_chr == chrom
+                        and align1_length == sum(lengths)
+                        and int(content[FLAG_INDEX]) == align1_flag + 2048):
+                    continue
+                align2_pos = int(content[POS_INDEX])
+                align2_pos_plusM = align2_pos + lengths[seg_index] - 1
+                starts = junction_dict[chrom]['down']
+                ends = junction_dict[chrom]['up']
+                at_start = align2_pos in starts
+                at_end = align2_pos_plusM in ends
+                if align_pattern == 'MS':
+                    linear_align2, circ_align2 = at_start, at_end
+                else:
+                    linear_align2, circ_align2 = at_end, at_start
+                align2_gene_id = []
+                if at_start:
+                    align2_gene_id.extend(starts[align2_pos]['gene_id'])
+                if at_end:
+                    align2_gene_id.extend(ends[align2_pos_plusM]['gene_id'])
+                # Both halves must agree on the junction type. The 'SM'
+                # branch used to test linear_align1 twice, which counted
+                # pairs whose second half had no linear evidence.
+                if (linear_align1 and linear_align2) or (circ_align1 and circ_align2):
+                    for align_gene_id in set(align1_gene_id) & set(align2_gene_id):
+                        ciri_result[align_gene_id] += 1
+                continue
+
+            # ---- candidate first alignment ----
+            align1_gene_id = []
+            linear_align1 = False
+            circ_align1 = False
+            align_pattern = None
+            if match_then_clip.match(CIGAR):
+                pattern, seg_index = 'MS', 0
+            elif clip_then_match.match(CIGAR):
+                pattern, seg_index = 'SM', 1
+            else:
+                continue
+            cigar_numbers = [int(n) for n in numbers.findall(CIGAR)]
+            align1_chr = chrom
+            align1_pos = int(content[POS_INDEX])
+            align1_pos_plusM = align1_pos + cigar_numbers[seg_index] - 1
+            starts = junction_dict[align1_chr]['down']
+            ends = junction_dict[align1_chr]['up']
+            at_start = align1_pos in starts
+            at_end = align1_pos_plusM in ends
+            if pattern == 'MS':
+                is_linear, is_circ = at_end, at_start
+            else:
+                is_linear, is_circ = at_start, at_end
+            # A first alignment that touches only the circ-side boundary is
+            # discarded, exactly as the original code did.
+            # NOTE(review): confirm this exclusion is intended.
+            if is_circ and not is_linear:
+                continue
+            linear_align1, circ_align1 = is_linear, is_circ
+            if at_start:
+                for gene_id in starts[align1_pos]['gene_id']:
+                    if gene_id in ciri_result:
+                        align1_gene_id.append(gene_id)
+            if at_end:
+                for gene_id in ends[align1_pos_plusM]['gene_id']:
+                    if gene_id in ciri_result:
+                        align1_gene_id.append(gene_id)
+            if not align1_gene_id:
+                continue
+            first_align = True
+            align_pattern = pattern
+            align1_name = content[NAME_INDEX]
+            align1_length = sum(cigar_numbers)
+            align1_flag = int(content[FLAG_INDEX])
+
+    return
+
+def main():
+    '''
+    command line entry point
+
+    Expects three arguments: the sample table, the output directory and the
+    annotation gtf. The sample table is tab separated with a header line;
+    column 1 is the sample index, column 2 the CIRIquant gtf path and
+    column 3 the SAM path. For every sample one file named
+    <outputdir>/<sample>.splicing_amount.output is written with the columns
+    circ_id, gene_id and splicing_amount.
+    '''
+    # local import keeps the file's import block untouched
+    import csv
+
+    use_list = sys.argv
+    if len(use_list) < 4:
+        sys.exit('Usage: python ./script_splicingamount.py '
+                 '/path/to/inputfile /output/dir /path/to/gtffile')
+    input_file = use_list[1]
+    outputdir = use_list[2]
+    gtf_file = use_list[3]
+
+    if not os.path.isdir(outputdir):
+        os.makedirs(outputdir)
+
+    junction_dict = gtf_junction_extract(gtf_file)
+    print('gtf reading finished.')
+
+    # read the sample table with the standard library (header skipped)
+    samples = []
+    with open(input_file, 'r') as handle:
+        rows = csv.reader(handle, delimiter='\t')
+        next(rows, None)
+        for row in rows:
+            if not any(cell.strip() for cell in row):
+                continue  # blank line
+            if len(row) < 3:
+                raise ValueError(
+                    'expected 3 tab-separated columns, got: %r' % (row,))
+            samples.append(row[:3])
+
+    for sample_id, ciri_fn, sam_fn in samples:
+        ciri_gene_result, ciri_circ_result = read_ciri_result(ciri_fn)
+        print('ciriquant result reading finished.')
+        extract_juncreads_from_sam(sam_fn, ciri_gene_result, junction_dict, outputdir)
+        output = '%s/%s.splicing_amount.output' % (outputdir, sample_id)
+        with open(output, 'w') as f:
+            f.write('circ_id\tgene_id\tsplicing_amount\n')
+            for circ_id in ciri_circ_result:
+                for gene_id in ciri_circ_result[circ_id]:
+                    f.write('{0}\t{1}\t{2}\n'.format(circ_id, gene_id, ciri_gene_result[gene_id]))
+    return
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file