"docs/source/vscode:/vscode.git/clone" did not exist on "5eafa8737f828b7d36d06c269784e7dfb7f51083"
Commit f1239fd2 authored by adaZ-9's avatar adaZ-9
Browse files

add file

parent fcf5dbcc
#coding=utf-8
'''
This script generates splicing amount of given circRNAs in samples based on junction provided by annotation file (gtf).
Annotation of version gencode.v38lift37 is recommended.
GTF files of circRNA information generated with CIRIquant and SAM (or BAM) files generated with BWA are needed.
Usage:
python ./script_splicingamount.py /path/to/inputfile /output/dir /path/to/gtffile
Input file contains three columns seperated by tab. Header is needed.
The first column contains the sample index, the second column contains the path to CIRIquant derived gtf file and the third column contains the path to SAM file.
'''
import os
from subprocess import *
import re
import sys
import pickle
def gtf_junction_extract(fn):
'''
extract junction information from genecode gtf
return junction_dict
'''
CHR=0
GENE_PART=2
EXON_START=3
EXON_END=4
INFO=8
# initialize junction_dict
chrs = ['chr' + str(i) for i in range(1, 23)]
chrs.extend(['chrX', 'chrY', 'chrM'])
junction_dict = {}
for c in chrs:
junction_dict[c] = {'up':{}, 'down':{}}
# read gtf
with open(fn, 'r') as f:
for line in f:
if len(line) == 0:
continue
if line.startswith('#'):
continue
content = line.strip().split('\t')
gene_part = content[GENE_PART]
if gene_part != 'exon':
continue
chrom = content[CHR]
exon_start = content[EXON_START]
for pos in range(int(exon_start)-6, int(exon_start)+7):
if pos not in junction_dict[chrom]['down']:
junction_dict[chrom]['down'][pos] = {'gene_id':[]}
exon_end = content[EXON_END]
for pos in range(int(exon_end)-6, int(exon_end)+7):
if pos not in junction_dict[chrom]['up']:
junction_dict[chrom]['up'][pos] = {'gene_id':[]}
info = content[INFO]
gene_id = [x for x in info.split(';') if 'gene_id' in x][0]
gene_id = gene_id.strip().split(' ')[1].strip('"')
for pos in range(int(exon_start)-6, int(exon_start)+7):
if gene_id not in junction_dict[chrom]['down'][pos]['gene_id']:
junction_dict[chrom]['down'][pos]['gene_id'].append(gene_id)
for pos in range(int(exon_end)-6, int(exon_end)+7):
if gene_id not in junction_dict[chrom]['up'][pos]['gene_id']:
junction_dict[chrom]['up'][pos]['gene_id'].append(gene_id)
return junction_dict
def read_ciri_result(fn):
'''
extract exon circ id and correspond gene name and gene id
return ciri_dict
'''
# initiate ciri_dict
ciri_gene_dict = {}
ciri_circ_dict = {}
# read ciri_quant result
with open(fn, 'r') as f:
for line in f:
if len(line) == 0:
continue
if line.startswith('#'):
continue
content = line.strip().split('\t')
info = content[-1].strip().split('; ')
circ_type = [x for x in info if 'circ_type' in x][0]
circ_type = circ_type.split(' ')[-1].strip('"')
if circ_type != 'exon':
continue
circ_id = [x for x in info if 'circ_id' in x][0]
circ_id = circ_id.split(' ')[-1].strip('"')
gene_ids = [x for x in info if 'gene_id' in x][0]
gene_ids = sorted(gene_ids.split(' ')[-1].strip('"').split(','))
for gene_id in gene_ids:
if gene_id not in ciri_gene_dict:
ciri_gene_dict[gene_id] = 0 # geneid: splicing amount
ciri_circ_dict[circ_id] = gene_ids # circid: geneid
return ciri_gene_dict, ciri_circ_dict
def extract_juncreads_from_sam(sam_fn, ciri_result, junction_dict, outputdir):
# extract juncreads from sam
MAPQ_INDEX = 4
CIGAR_INDEX = 5
CHROM_INDEX = 2
POS_INDEX = 3
NAME_INDEX = 0
FLAG_INDEX = 1
first_align = False
align_pattern = None
align1_gene_id = []
align2_gene_id = []
linear_align1 = False
circ_align1 = False
linear_align2 = False
circ_align2 = False
with open(sam_fn, 'r') as f:
chrs = ['chr' + str(i) for i in range(1, 23)]
chrs.extend(['chrX', 'chrY', 'chrM'])
for line in f:
if len(line) == 0:
continue
if line.startswith('@'):
continue
content = line.strip().split('\t')
# MAPQ screening
MAPQ = content[MAPQ_INDEX]
if int(MAPQ) <= 5:
first_align = False
continue
# CIGAR screening
CIGAR = content[CIGAR_INDEX]
chrom = content[CHROM_INDEX]
if chrom not in chrs:
continue
if first_align:
align2_gene_id = []
linear_align2 = False
circ_align2 = False
first_align = False
if align_pattern == 'MS':
if re.match(r'^\d+[SH]\d+M$', CIGAR):
align2_name = content[NAME_INDEX]
align2_chr = content[CHROM_INDEX]
align2_length = sum([int(i) for i in re.findall(r'\d+', CIGAR)])
align2_flag = int(content[FLAG_INDEX])
align2_pos = int(content[POS_INDEX])
# supplementary alignment
# align name; align chr; align length; align flag; same junction type; same geneid
if align1_name == align2_name and align1_chr == align2_chr and align1_length == align2_length and align2_flag == align1_flag+2048:
align2_aligned_seg = int(re.findall(r'\d+', CIGAR)[1])
align2_pos_plusM = align2_pos + align2_aligned_seg - 1
if align2_pos in junction_dict[align2_chr]['down']:
linear_align2 = True
align2_gene_id.extend(junction_dict[align2_chr]['down'][align2_pos]['gene_id'])
if align2_pos_plusM in junction_dict[align2_chr]['up']:
circ_align2 = True
align2_gene_id.extend(junction_dict[align2_chr]['up'][align2_pos_plusM]['gene_id'])
if (linear_align1 and linear_align2) or (circ_align1 and circ_align2):
align_gene_ids = list(set(align1_gene_id) & set(align2_gene_id))
if len(align_gene_ids) > 0:
for align_gene_id in align_gene_ids:
ciri_result[align_gene_id] += 1
else: # SM
if re.match(r'^\d+M\d+[SH]$', CIGAR) is not None:
align2_name = content[NAME_INDEX]
align2_chr = content[CHROM_INDEX]
align2_length = sum([int(i) for i in re.findall(r'\d+', CIGAR)])
align2_flag = int(content[FLAG_INDEX])
align2_pos = int(content[POS_INDEX])
if align1_name == align2_name and align1_chr == align2_chr and align1_length == align2_length and align2_flag == align1_flag+2048:
align2_aligned_seg = int(re.findall(r'\d+', CIGAR)[0])
align2_pos_plusM = align2_pos + align2_aligned_seg - 1
if align2_pos_plusM in junction_dict[align2_chr]['up']:
linear_align2 = True
align2_gene_id.extend(junction_dict[align2_chr]['up'][align2_pos_plusM]['gene_id'])
if align2_pos in junction_dict[align2_chr]['down']:
circ_align2 = True
align2_gene_id.extend(junction_dict[align2_chr]['down'][align2_pos]['gene_id'])
if (linear_align1 and linear_align1) or (circ_align1 and circ_align2):
align_gene_ids = list(set(align1_gene_id) & set(align2_gene_id))
if len(align_gene_ids) > 0:
for align_gene_id in align_gene_ids:
ciri_result[align_gene_id] += 1
else:
align1_gene_id = []
linear_align1 = False
circ_align1 = False
align_pattern = None
align1 = ''
if re.match(r'^\d+M\d+[SH]$', CIGAR):
align1_aligned_seg = int(re.findall(r'\d+', CIGAR)[0])
align1_pos = int(content[POS_INDEX])
align1_pos_plusM = align1_pos + align1_aligned_seg - 1
align1_chr = content[CHROM_INDEX]
if align1_pos_plusM not in junction_dict[align1_chr]['up'] and align1_pos in junction_dict[align1_chr]['down']:
continue
if align1_pos_plusM in junction_dict[align1_chr]['up']:
linear_align1 = True
for gene_id in junction_dict[align1_chr]['up'][align1_pos_plusM]['gene_id']:
if gene_id in ciri_result:
align1_gene_id.append(gene_id)
if align1_pos in junction_dict[align1_chr]['down']:
circ_align1 = True
for gene_id in junction_dict[align1_chr]['down'][align1_pos]['gene_id']:
if gene_id in ciri_result:
align1_gene_id.append(gene_id)
if len(align1_gene_id) == 0:
continue
first_align = True
align_pattern = 'MS'
align1_name = content[NAME_INDEX]
align1_length = sum([int(i) for i in re.findall(r'\d+', CIGAR)])
align1_flag = int(content[FLAG_INDEX])
align1 = line
continue
if re.match(r'^\d+[SH]\d+M$', CIGAR):
align1_aligned_seg = int(re.findall(r'\d+', CIGAR)[1])
align1_pos = int(content[POS_INDEX])
align1_pos_plusM = align1_pos + align1_aligned_seg - 1
align1_chr = content[CHROM_INDEX]
if align1_pos not in junction_dict[align1_chr]['down'] and align1_pos_plusM in junction_dict[align1_chr]['up']:
continue
if align1_pos in junction_dict[align1_chr]['down']:
linear_align1 = True
for gene_id in junction_dict[align1_chr]['down'][align1_pos]['gene_id']:
if gene_id in ciri_result:
align1_gene_id.append(gene_id)
if align1_pos_plusM in junction_dict[align1_chr]['up']:
circ_align1 = True
for gene_id in junction_dict[align1_chr]['up'][align1_pos_plusM]['gene_id']:
if gene_id in ciri_result:
align1_gene_id.append(gene_id)
if len(align1_gene_id) == 0:
continue
first_align = True
align_pattern = 'SM'
align1_name = content[NAME_INDEX]
align1_length = sum([int(i) for i in re.findall(r'\d+', CIGAR)])
align1_flag = int(content[FLAG_INDEX])
align1 = line
continue
return
def main():
use_list = sys.argv
input_file = use_list[1]
outputdir = use_list[2]
gtffile = use_list[3]
junction_dict = gtf_junction_extract(gtf_file)
print('gtf reading finished.')
input = pd.read_csv(input_file, sep='\t')
for i in range(input.shape[0]):
sample_id = input.iloc[i, 0]
ciri_fn = input.iloc[i, 1]
sam_fn = input.iloc[i, 3]
ciri_gene_result, ciri_circ_result = read_ciri_result(ciri_fn)
print('ciriquant result reading finished.')
extract_juncreads_from_sam(sam_fn, ciri_gene_result, junction_dict, outputdir)
output = '%s/%s.splicing_amount.output' % (outputdir, sample_id)
with open(output, 'w') as f:
f.write('circ_id\tgene_id\tsplicing_amount\n')
for circ_id in ciri_circ_result:
gene_ids = ciri_circ_result[circ_id]
for gene_id in gene_ids:
f.write('{0}\t{1}\t{2}\n'.format(circ_id, gene_id, ciri_gene_result[gene_id]))
return
if __name__ == '__main__':
main()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment