script_splicingamount.py

#coding=utf-8

'''
This script generates splicing amount of given circRNAs in samples based on junction provided by annotation file (gtf).
Annotation of version gencode.v38lift37 is recommended.
GTF files of circRNA information generated with CIRIquant and SAM (or BAM) files  generated with BWA are needed.

Usage:
python ./script_splicingamount.py /path/to/inputfile /output/dir /path/to/gtffile
Input file contains three columns seperated by tab. Header is needed. 
The first column contains the sample index, the second column contains the path to CIRIquant derived gtf file and the third column contains the path to SAM file.
'''

import os
from subprocess import *
import re
import sys
import pickle

def gtf_junction_extract(fn):
    '''
    extract junction information from genecode gtf
    return junction_dict
    '''
    CHR=0
    GENE_PART=2
    EXON_START=3
    EXON_END=4
    INFO=8
    
    # initialize junction_dict
    chrs = ['chr' + str(i) for i in range(1, 23)]
    chrs.extend(['chrX', 'chrY', 'chrM'])
    junction_dict = {}
    for c in chrs:
        junction_dict[c] = {'up':{}, 'down':{}}
    
    # read gtf
    with open(fn, 'r') as f:
        for line in f:
            if len(line) == 0:
                continue
            if line.startswith('#'):
                continue
            content = line.strip().split('\t')
            gene_part = content[GENE_PART]
            if gene_part != 'exon':
                continue
            chrom = content[CHR]
            exon_start = content[EXON_START]
            for pos in range(int(exon_start)-6, int(exon_start)+7):
                if pos not in junction_dict[chrom]['down']:
                    junction_dict[chrom]['down'][pos] = {'gene_id':[]}
            exon_end = content[EXON_END]
            for pos in range(int(exon_end)-6, int(exon_end)+7):
                if pos not in junction_dict[chrom]['up']:
                    junction_dict[chrom]['up'][pos] = {'gene_id':[]}
            info = content[INFO]
            gene_id = [x for x in info.split(';') if 'gene_id' in x][0]
            gene_id = gene_id.strip().split(' ')[1].strip('"')
            for pos in range(int(exon_start)-6, int(exon_start)+7):
                if gene_id not in junction_dict[chrom]['down'][pos]['gene_id']:
                    junction_dict[chrom]['down'][pos]['gene_id'].append(gene_id)
            for pos in range(int(exon_end)-6, int(exon_end)+7):
                if gene_id not in junction_dict[chrom]['up'][pos]['gene_id']:
                    junction_dict[chrom]['up'][pos]['gene_id'].append(gene_id)
    return junction_dict

def read_ciri_result(fn):
    '''
    extract exon circ id and correspond gene name and gene id
    return ciri_dict
    '''
    # initiate ciri_dict
    ciri_gene_dict = {}
    ciri_circ_dict = {}
    
    # read ciri_quant result    
    with open(fn, 'r') as f:
        for line in f:
            if len(line) == 0:
                continue
            if line.startswith('#'):
                continue
            content = line.strip().split('\t')
            info = content[-1].strip().split('; ')
            circ_type = [x for x in info if 'circ_type' in x][0]
            circ_type = circ_type.split(' ')[-1].strip('"')
            if circ_type != 'exon':
                continue
            circ_id = [x for x in info if 'circ_id' in x][0]
            circ_id = circ_id.split(' ')[-1].strip('"')
            gene_ids = [x for x in info if 'gene_id' in x][0]
            gene_ids = sorted(gene_ids.split(' ')[-1].strip('"').split(','))
            for gene_id in gene_ids:
                if gene_id not in ciri_gene_dict:
                    ciri_gene_dict[gene_id] = 0 # geneid: splicing amount
            ciri_circ_dict[circ_id] = gene_ids # circid: geneid
    return ciri_gene_dict, ciri_circ_dict

def extract_juncreads_from_sam(sam_fn, ciri_result, junction_dict, outputdir):
        
    # extract juncreads from sam
    MAPQ_INDEX = 4
    CIGAR_INDEX = 5
    CHROM_INDEX = 2
    POS_INDEX = 3
    NAME_INDEX = 0
    FLAG_INDEX = 1
    first_align = False
    align_pattern = None
    align1_gene_id = []
    align2_gene_id = []
    linear_align1 = False
    circ_align1 = False
    linear_align2 = False
    circ_align2 = False
    
    with open(sam_fn, 'r') as f:
        chrs = ['chr' + str(i) for i in range(1, 23)]
        chrs.extend(['chrX', 'chrY', 'chrM'])
        for line in f:
            if len(line) == 0:
                continue
            if line.startswith('@'):
                continue
            content = line.strip().split('\t')
            # MAPQ screening
            MAPQ = content[MAPQ_INDEX]
            if int(MAPQ) <= 5:
                first_align = False
                continue
            # CIGAR screening
            CIGAR = content[CIGAR_INDEX]
            chrom = content[CHROM_INDEX]
            if chrom not in chrs:
                continue
            if first_align:
                align2_gene_id = []
                linear_align2 = False
                circ_align2 = False
                first_align = False
                if align_pattern == 'MS':
                    if re.match(r'^\d+[SH]\d+M$', CIGAR):
                        align2_name = content[NAME_INDEX]
                        align2_chr = content[CHROM_INDEX]
                        align2_length = sum([int(i) for i in re.findall(r'\d+', CIGAR)])
                        align2_flag = int(content[FLAG_INDEX])
                        align2_pos = int(content[POS_INDEX])
                        # supplementary alignment
                        # align name; align chr; align length; align flag; same junction type; same geneid
                        if align1_name == align2_name and align1_chr == align2_chr and align1_length == align2_length and align2_flag == align1_flag+2048:
                            align2_aligned_seg = int(re.findall(r'\d+', CIGAR)[1])
                            align2_pos_plusM = align2_pos + align2_aligned_seg - 1
                            if align2_pos in junction_dict[align2_chr]['down']:
                                linear_align2 = True
                                align2_gene_id.extend(junction_dict[align2_chr]['down'][align2_pos]['gene_id'])
                            if align2_pos_plusM in junction_dict[align2_chr]['up']:
                                circ_align2 = True
                                align2_gene_id.extend(junction_dict[align2_chr]['up'][align2_pos_plusM]['gene_id'])
                            if (linear_align1 and linear_align2) or (circ_align1 and circ_align2):
                                align_gene_ids = list(set(align1_gene_id) & set(align2_gene_id))
                                if len(align_gene_ids) > 0: 
                                    for align_gene_id in align_gene_ids:
                                        ciri_result[align_gene_id] += 1
                else: # SM
                    if re.match(r'^\d+M\d+[SH]$', CIGAR) is not None:
                        align2_name = content[NAME_INDEX]
                        align2_chr = content[CHROM_INDEX]
                        align2_length = sum([int(i) for i in re.findall(r'\d+', CIGAR)])
                        align2_flag = int(content[FLAG_INDEX])
                        align2_pos = int(content[POS_INDEX])
                        if align1_name == align2_name and align1_chr == align2_chr and align1_length == align2_length and align2_flag == align1_flag+2048:
                            align2_aligned_seg = int(re.findall(r'\d+', CIGAR)[0])
                            align2_pos_plusM = align2_pos + align2_aligned_seg - 1
                            if align2_pos_plusM in junction_dict[align2_chr]['up']:
                                linear_align2 = True
                                align2_gene_id.extend(junction_dict[align2_chr]['up'][align2_pos_plusM]['gene_id'])
                            if align2_pos in junction_dict[align2_chr]['down']:
                                circ_align2 = True
                                align2_gene_id.extend(junction_dict[align2_chr]['down'][align2_pos]['gene_id'])
                            if (linear_align1 and linear_align1) or (circ_align1 and circ_align2):
                                align_gene_ids = list(set(align1_gene_id) & set(align2_gene_id))
                                if len(align_gene_ids) > 0: 
                                    for align_gene_id in align_gene_ids:
                                        ciri_result[align_gene_id] += 1
            else:
                align1_gene_id = []
                linear_align1 = False
                circ_align1 = False
                align_pattern = None
                align1 = ''
                if re.match(r'^\d+M\d+[SH]$', CIGAR):
                    align1_aligned_seg = int(re.findall(r'\d+', CIGAR)[0])
                    align1_pos = int(content[POS_INDEX])
                    align1_pos_plusM = align1_pos + align1_aligned_seg - 1
                    align1_chr = content[CHROM_INDEX]
                    if align1_pos_plusM not in junction_dict[align1_chr]['up'] and align1_pos in junction_dict[align1_chr]['down']:
                        continue
                    if align1_pos_plusM in junction_dict[align1_chr]['up']:
                        linear_align1 = True
                        for gene_id in junction_dict[align1_chr]['up'][align1_pos_plusM]['gene_id']:
                            if gene_id in ciri_result:
                                align1_gene_id.append(gene_id)
                    if align1_pos in junction_dict[align1_chr]['down']:
                        circ_align1 = True
                        for gene_id in junction_dict[align1_chr]['down'][align1_pos]['gene_id']:
                            if gene_id in ciri_result:
                                align1_gene_id.append(gene_id)
                    if len(align1_gene_id) == 0:
                        continue
                    first_align = True
                    align_pattern = 'MS'
                    align1_name = content[NAME_INDEX]
                    align1_length = sum([int(i) for i in re.findall(r'\d+', CIGAR)])
                    align1_flag = int(content[FLAG_INDEX])
                    align1 = line
                    continue
                if re.match(r'^\d+[SH]\d+M$', CIGAR):
                    align1_aligned_seg = int(re.findall(r'\d+', CIGAR)[1])
                    align1_pos = int(content[POS_INDEX])
                    align1_pos_plusM = align1_pos + align1_aligned_seg - 1
                    align1_chr = content[CHROM_INDEX]
                    if align1_pos not in junction_dict[align1_chr]['down'] and align1_pos_plusM in junction_dict[align1_chr]['up']:
                        continue
                    if align1_pos in junction_dict[align1_chr]['down']:
                        linear_align1 = True
                        for gene_id in junction_dict[align1_chr]['down'][align1_pos]['gene_id']:
                            if gene_id in ciri_result:
                                align1_gene_id.append(gene_id)
                    if align1_pos_plusM in junction_dict[align1_chr]['up']:
                        circ_align1 = True
                        for gene_id in junction_dict[align1_chr]['up'][align1_pos_plusM]['gene_id']:
                            if gene_id in ciri_result:
                                align1_gene_id.append(gene_id)
                    if len(align1_gene_id) == 0:
                        continue
                    first_align = True
                    align_pattern = 'SM'
                    align1_name = content[NAME_INDEX]
                    align1_length = sum([int(i) for i in re.findall(r'\d+', CIGAR)])
                    align1_flag = int(content[FLAG_INDEX])
                    align1 = line
                    continue
    
    return 

def main():
    use_list = sys.argv
    input_file = use_list[1]
    outputdir = use_list[2]
    gtffile = use_list[3]

    junction_dict = gtf_junction_extract(gtf_file)
    print('gtf reading finished.')
    input = pd.read_csv(input_file, sep='\t')
    
    for i in range(input.shape[0]):
        sample_id = input.iloc[i, 0]
        ciri_fn = input.iloc[i, 1]
        sam_fn = input.iloc[i, 3]
        ciri_gene_result, ciri_circ_result = read_ciri_result(ciri_fn)
        print('ciriquant result reading finished.')
        extract_juncreads_from_sam(sam_fn, ciri_gene_result, junction_dict, outputdir)
        output = '%s/%s.splicing_amount.output' % (outputdir, sample_id)
        with open(output, 'w') as f:
            f.write('circ_id\tgene_id\tsplicing_amount\n')
            for circ_id in ciri_circ_result:
                gene_ids = ciri_circ_result[circ_id]
                for gene_id in gene_ids:
                    f.write('{0}\t{1}\t{2}\n'.format(circ_id, gene_id, ciri_gene_result[gene_id]))
    return 

if __name__ == '__main__':
    main()