Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
CIRI-deep_tensorflow
Commits
f1239fd2
Commit
f1239fd2
authored
Jul 25, 2023
by
adaZ-9
Browse files
add file
parent
fcf5dbcc
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
275 additions
and
0 deletions
+275
-0
script_splicingamount.py
script_splicingamount.py
+275
-0
No files found.
script_splicingamount.py
0 → 100644
View file @
f1239fd2
#coding=utf-8
'''
This script generates splicing amount of given circRNAs in samples based on junction provided by annotation file (gtf).
Annotation of version gencode.v38lift37 is recommended.
GTF files of circRNA information generated with CIRIquant and SAM (or BAM) files generated with BWA are needed.
Usage:
python ./script_splicingamount.py /path/to/inputfile /output/dir /path/to/gtffile
Input file contains three columns seperated by tab. Header is needed.
The first column contains the sample index, the second column contains the path to CIRIquant derived gtf file and the third column contains the path to SAM file.
'''
import
os
from
subprocess
import
*
import
re
import
sys
import
pickle
def
gtf_junction_extract
(
fn
):
'''
extract junction information from genecode gtf
return junction_dict
'''
CHR
=
0
GENE_PART
=
2
EXON_START
=
3
EXON_END
=
4
INFO
=
8
# initialize junction_dict
chrs
=
[
'chr'
+
str
(
i
)
for
i
in
range
(
1
,
23
)]
chrs
.
extend
([
'chrX'
,
'chrY'
,
'chrM'
])
junction_dict
=
{}
for
c
in
chrs
:
junction_dict
[
c
]
=
{
'up'
:{},
'down'
:{}}
# read gtf
with
open
(
fn
,
'r'
)
as
f
:
for
line
in
f
:
if
len
(
line
)
==
0
:
continue
if
line
.
startswith
(
'#'
):
continue
content
=
line
.
strip
().
split
(
'
\t
'
)
gene_part
=
content
[
GENE_PART
]
if
gene_part
!=
'exon'
:
continue
chrom
=
content
[
CHR
]
exon_start
=
content
[
EXON_START
]
for
pos
in
range
(
int
(
exon_start
)
-
6
,
int
(
exon_start
)
+
7
):
if
pos
not
in
junction_dict
[
chrom
][
'down'
]:
junction_dict
[
chrom
][
'down'
][
pos
]
=
{
'gene_id'
:[]}
exon_end
=
content
[
EXON_END
]
for
pos
in
range
(
int
(
exon_end
)
-
6
,
int
(
exon_end
)
+
7
):
if
pos
not
in
junction_dict
[
chrom
][
'up'
]:
junction_dict
[
chrom
][
'up'
][
pos
]
=
{
'gene_id'
:[]}
info
=
content
[
INFO
]
gene_id
=
[
x
for
x
in
info
.
split
(
';'
)
if
'gene_id'
in
x
][
0
]
gene_id
=
gene_id
.
strip
().
split
(
' '
)[
1
].
strip
(
'"'
)
for
pos
in
range
(
int
(
exon_start
)
-
6
,
int
(
exon_start
)
+
7
):
if
gene_id
not
in
junction_dict
[
chrom
][
'down'
][
pos
][
'gene_id'
]:
junction_dict
[
chrom
][
'down'
][
pos
][
'gene_id'
].
append
(
gene_id
)
for
pos
in
range
(
int
(
exon_end
)
-
6
,
int
(
exon_end
)
+
7
):
if
gene_id
not
in
junction_dict
[
chrom
][
'up'
][
pos
][
'gene_id'
]:
junction_dict
[
chrom
][
'up'
][
pos
][
'gene_id'
].
append
(
gene_id
)
return
junction_dict
def
read_ciri_result
(
fn
):
'''
extract exon circ id and correspond gene name and gene id
return ciri_dict
'''
# initiate ciri_dict
ciri_gene_dict
=
{}
ciri_circ_dict
=
{}
# read ciri_quant result
with
open
(
fn
,
'r'
)
as
f
:
for
line
in
f
:
if
len
(
line
)
==
0
:
continue
if
line
.
startswith
(
'#'
):
continue
content
=
line
.
strip
().
split
(
'
\t
'
)
info
=
content
[
-
1
].
strip
().
split
(
'; '
)
circ_type
=
[
x
for
x
in
info
if
'circ_type'
in
x
][
0
]
circ_type
=
circ_type
.
split
(
' '
)[
-
1
].
strip
(
'"'
)
if
circ_type
!=
'exon'
:
continue
circ_id
=
[
x
for
x
in
info
if
'circ_id'
in
x
][
0
]
circ_id
=
circ_id
.
split
(
' '
)[
-
1
].
strip
(
'"'
)
gene_ids
=
[
x
for
x
in
info
if
'gene_id'
in
x
][
0
]
gene_ids
=
sorted
(
gene_ids
.
split
(
' '
)[
-
1
].
strip
(
'"'
).
split
(
','
))
for
gene_id
in
gene_ids
:
if
gene_id
not
in
ciri_gene_dict
:
ciri_gene_dict
[
gene_id
]
=
0
# geneid: splicing amount
ciri_circ_dict
[
circ_id
]
=
gene_ids
# circid: geneid
return
ciri_gene_dict
,
ciri_circ_dict
def
extract_juncreads_from_sam
(
sam_fn
,
ciri_result
,
junction_dict
,
outputdir
):
# extract juncreads from sam
MAPQ_INDEX
=
4
CIGAR_INDEX
=
5
CHROM_INDEX
=
2
POS_INDEX
=
3
NAME_INDEX
=
0
FLAG_INDEX
=
1
first_align
=
False
align_pattern
=
None
align1_gene_id
=
[]
align2_gene_id
=
[]
linear_align1
=
False
circ_align1
=
False
linear_align2
=
False
circ_align2
=
False
with
open
(
sam_fn
,
'r'
)
as
f
:
chrs
=
[
'chr'
+
str
(
i
)
for
i
in
range
(
1
,
23
)]
chrs
.
extend
([
'chrX'
,
'chrY'
,
'chrM'
])
for
line
in
f
:
if
len
(
line
)
==
0
:
continue
if
line
.
startswith
(
'@'
):
continue
content
=
line
.
strip
().
split
(
'
\t
'
)
# MAPQ screening
MAPQ
=
content
[
MAPQ_INDEX
]
if
int
(
MAPQ
)
<=
5
:
first_align
=
False
continue
# CIGAR screening
CIGAR
=
content
[
CIGAR_INDEX
]
chrom
=
content
[
CHROM_INDEX
]
if
chrom
not
in
chrs
:
continue
if
first_align
:
align2_gene_id
=
[]
linear_align2
=
False
circ_align2
=
False
first_align
=
False
if
align_pattern
==
'MS'
:
if
re
.
match
(
r
'^\d+[SH]\d+M$'
,
CIGAR
):
align2_name
=
content
[
NAME_INDEX
]
align2_chr
=
content
[
CHROM_INDEX
]
align2_length
=
sum
([
int
(
i
)
for
i
in
re
.
findall
(
r
'\d+'
,
CIGAR
)])
align2_flag
=
int
(
content
[
FLAG_INDEX
])
align2_pos
=
int
(
content
[
POS_INDEX
])
# supplementary alignment
# align name; align chr; align length; align flag; same junction type; same geneid
if
align1_name
==
align2_name
and
align1_chr
==
align2_chr
and
align1_length
==
align2_length
and
align2_flag
==
align1_flag
+
2048
:
align2_aligned_seg
=
int
(
re
.
findall
(
r
'\d+'
,
CIGAR
)[
1
])
align2_pos_plusM
=
align2_pos
+
align2_aligned_seg
-
1
if
align2_pos
in
junction_dict
[
align2_chr
][
'down'
]:
linear_align2
=
True
align2_gene_id
.
extend
(
junction_dict
[
align2_chr
][
'down'
][
align2_pos
][
'gene_id'
])
if
align2_pos_plusM
in
junction_dict
[
align2_chr
][
'up'
]:
circ_align2
=
True
align2_gene_id
.
extend
(
junction_dict
[
align2_chr
][
'up'
][
align2_pos_plusM
][
'gene_id'
])
if
(
linear_align1
and
linear_align2
)
or
(
circ_align1
and
circ_align2
):
align_gene_ids
=
list
(
set
(
align1_gene_id
)
&
set
(
align2_gene_id
))
if
len
(
align_gene_ids
)
>
0
:
for
align_gene_id
in
align_gene_ids
:
ciri_result
[
align_gene_id
]
+=
1
else
:
# SM
if
re
.
match
(
r
'^\d+M\d+[SH]$'
,
CIGAR
)
is
not
None
:
align2_name
=
content
[
NAME_INDEX
]
align2_chr
=
content
[
CHROM_INDEX
]
align2_length
=
sum
([
int
(
i
)
for
i
in
re
.
findall
(
r
'\d+'
,
CIGAR
)])
align2_flag
=
int
(
content
[
FLAG_INDEX
])
align2_pos
=
int
(
content
[
POS_INDEX
])
if
align1_name
==
align2_name
and
align1_chr
==
align2_chr
and
align1_length
==
align2_length
and
align2_flag
==
align1_flag
+
2048
:
align2_aligned_seg
=
int
(
re
.
findall
(
r
'\d+'
,
CIGAR
)[
0
])
align2_pos_plusM
=
align2_pos
+
align2_aligned_seg
-
1
if
align2_pos_plusM
in
junction_dict
[
align2_chr
][
'up'
]:
linear_align2
=
True
align2_gene_id
.
extend
(
junction_dict
[
align2_chr
][
'up'
][
align2_pos_plusM
][
'gene_id'
])
if
align2_pos
in
junction_dict
[
align2_chr
][
'down'
]:
circ_align2
=
True
align2_gene_id
.
extend
(
junction_dict
[
align2_chr
][
'down'
][
align2_pos
][
'gene_id'
])
if
(
linear_align1
and
linear_align1
)
or
(
circ_align1
and
circ_align2
):
align_gene_ids
=
list
(
set
(
align1_gene_id
)
&
set
(
align2_gene_id
))
if
len
(
align_gene_ids
)
>
0
:
for
align_gene_id
in
align_gene_ids
:
ciri_result
[
align_gene_id
]
+=
1
else
:
align1_gene_id
=
[]
linear_align1
=
False
circ_align1
=
False
align_pattern
=
None
align1
=
''
if
re
.
match
(
r
'^\d+M\d+[SH]$'
,
CIGAR
):
align1_aligned_seg
=
int
(
re
.
findall
(
r
'\d+'
,
CIGAR
)[
0
])
align1_pos
=
int
(
content
[
POS_INDEX
])
align1_pos_plusM
=
align1_pos
+
align1_aligned_seg
-
1
align1_chr
=
content
[
CHROM_INDEX
]
if
align1_pos_plusM
not
in
junction_dict
[
align1_chr
][
'up'
]
and
align1_pos
in
junction_dict
[
align1_chr
][
'down'
]:
continue
if
align1_pos_plusM
in
junction_dict
[
align1_chr
][
'up'
]:
linear_align1
=
True
for
gene_id
in
junction_dict
[
align1_chr
][
'up'
][
align1_pos_plusM
][
'gene_id'
]:
if
gene_id
in
ciri_result
:
align1_gene_id
.
append
(
gene_id
)
if
align1_pos
in
junction_dict
[
align1_chr
][
'down'
]:
circ_align1
=
True
for
gene_id
in
junction_dict
[
align1_chr
][
'down'
][
align1_pos
][
'gene_id'
]:
if
gene_id
in
ciri_result
:
align1_gene_id
.
append
(
gene_id
)
if
len
(
align1_gene_id
)
==
0
:
continue
first_align
=
True
align_pattern
=
'MS'
align1_name
=
content
[
NAME_INDEX
]
align1_length
=
sum
([
int
(
i
)
for
i
in
re
.
findall
(
r
'\d+'
,
CIGAR
)])
align1_flag
=
int
(
content
[
FLAG_INDEX
])
align1
=
line
continue
if
re
.
match
(
r
'^\d+[SH]\d+M$'
,
CIGAR
):
align1_aligned_seg
=
int
(
re
.
findall
(
r
'\d+'
,
CIGAR
)[
1
])
align1_pos
=
int
(
content
[
POS_INDEX
])
align1_pos_plusM
=
align1_pos
+
align1_aligned_seg
-
1
align1_chr
=
content
[
CHROM_INDEX
]
if
align1_pos
not
in
junction_dict
[
align1_chr
][
'down'
]
and
align1_pos_plusM
in
junction_dict
[
align1_chr
][
'up'
]:
continue
if
align1_pos
in
junction_dict
[
align1_chr
][
'down'
]:
linear_align1
=
True
for
gene_id
in
junction_dict
[
align1_chr
][
'down'
][
align1_pos
][
'gene_id'
]:
if
gene_id
in
ciri_result
:
align1_gene_id
.
append
(
gene_id
)
if
align1_pos_plusM
in
junction_dict
[
align1_chr
][
'up'
]:
circ_align1
=
True
for
gene_id
in
junction_dict
[
align1_chr
][
'up'
][
align1_pos_plusM
][
'gene_id'
]:
if
gene_id
in
ciri_result
:
align1_gene_id
.
append
(
gene_id
)
if
len
(
align1_gene_id
)
==
0
:
continue
first_align
=
True
align_pattern
=
'SM'
align1_name
=
content
[
NAME_INDEX
]
align1_length
=
sum
([
int
(
i
)
for
i
in
re
.
findall
(
r
'\d+'
,
CIGAR
)])
align1_flag
=
int
(
content
[
FLAG_INDEX
])
align1
=
line
continue
return
def
main
():
use_list
=
sys
.
argv
input_file
=
use_list
[
1
]
outputdir
=
use_list
[
2
]
gtffile
=
use_list
[
3
]
junction_dict
=
gtf_junction_extract
(
gtf_file
)
print
(
'gtf reading finished.'
)
input
=
pd
.
read_csv
(
input_file
,
sep
=
'
\t
'
)
for
i
in
range
(
input
.
shape
[
0
]):
sample_id
=
input
.
iloc
[
i
,
0
]
ciri_fn
=
input
.
iloc
[
i
,
1
]
sam_fn
=
input
.
iloc
[
i
,
3
]
ciri_gene_result
,
ciri_circ_result
=
read_ciri_result
(
ciri_fn
)
print
(
'ciriquant result reading finished.'
)
extract_juncreads_from_sam
(
sam_fn
,
ciri_gene_result
,
junction_dict
,
outputdir
)
output
=
'%s/%s.splicing_amount.output'
%
(
outputdir
,
sample_id
)
with
open
(
output
,
'w'
)
as
f
:
f
.
write
(
'circ_id
\t
gene_id
\t
splicing_amount
\n
'
)
for
circ_id
in
ciri_circ_result
:
gene_ids
=
ciri_circ_result
[
circ_id
]
for
gene_id
in
gene_ids
:
f
.
write
(
'{0}
\t
{1}
\t
{2}
\n
'
.
format
(
circ_id
,
gene_id
,
ciri_gene_result
[
gene_id
]))
return
if
__name__
==
'__main__'
:
main
()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment