"vscode:/vscode.git/clone" did not exist on "61e1cee05afcf6c0672f9706b15c029bbfe40cc4"
Unverified Commit b7502586 authored by adaZ-9's avatar adaZ-9 Committed by GitHub
Browse files

Add files via upload

parents
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Dropout, Activation, Input, Lambda, LeakyReLU, Concatenate, BatchNormalization
from keras.optimizers import Adam
import keras.backend as K
import pandas as pd
import os
import numpy as np
from sklearn import metrics
import random
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import argparse as ap
from keras.utils import np_utils
import math
import config
class SiameseNet():
    """Feed-forward network used by CIRI-deep / CIRI-deep(A).

    Despite the name, the visible architecture is a single multi-layer
    perceptron: an input layer followed by Dense -> BatchNorm -> ReLU
    blocks (with optional dropout) and either a sigmoid head (CIRI-deep)
    or a 3-way softmax head (CIRI-deep(A)).
    """

    def __init__(self, n_in=6523, hidden=[1200, 500, 300, 200], drop_out=[0, 0, 0, 0, 0], batch_size=512, kernel_initializer='glorot_normal', learning_rate=0.001, splicing_amount=True, CIRIdeepA=False):
        # NOTE(review): mutable default lists (hidden/drop_out) are shared
        # across calls; they are only read here, so this is benign.
        if splicing_amount:
            # two extra input features: splicing amount of both samples
            n_in += 2
        input = Input(shape=(n_in, ), dtype='float32', name='input')
        firstlayer = True
        if drop_out[0] > 0:
            # optional dropout applied directly to the input features
            h = Dropout(drop_out[0])(input)
            firstlayer = False
        for i in range(len(hidden)):
            if firstlayer:
                # first hidden block connects straight to the input tensor
                h = Dense(hidden[i], activation=None, kernel_initializer=kernel_initializer)(input)
                h = BatchNormalization()(h)
                h = Activation('relu')(h)
                firstlayer = False
            else:
                h = Dense(hidden[i], activation=None, kernel_initializer=kernel_initializer)(h)
                h = BatchNormalization()(h)
                h = Activation('relu')(h)
            # drop_out[i+1] is the dropout rate applied after hidden layer i
            if drop_out[i+1] > 0:
                h = Dropout(drop_out[i+1])(h)
        if not CIRIdeepA:
            # binary head: single sigmoid score per circRNA
            output = Dense(1, activation='sigmoid', name='output')(h)
            model = Model(inputs=input, outputs=output)
            adam = Adam(lr=learning_rate)
            model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
        else:
            # 3-class head for CIRI-deep(A) (labels built in construct_data_from_label)
            output = Dense(3, activation='softmax', name='output')(h)
            model = Model(inputs=input, outputs=output)
            adam = Adam(lr=learning_rate)
            model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
        self.model = model
        self.batch_size = batch_size
        self.n_in = n_in
        self.hidden = hidden
        self.drop_out = drop_out
        self.learning_rate = learning_rate
        self.CIRIdeepA = CIRIdeepA

    def fit(self, inputdata):
        """Train on one prepared train/validation split; returns the metrics dict from train_on_balanced_batch."""
        best_metrics = train_on_balanced_batch(self.model, inputdata, batch_size=self.batch_size, CIRIdeepA=self.CIRIdeepA)
        return best_metrics

    def predict(self, test_data):
        """Run the model on test_data['X'] and return the raw predictions."""
        Y_pred = self.model.predict([test_data['X']])
        return Y_pred
#######################################
def generate_data_batch(train_list, test_data, seqFeature_df, geneExp_absmax, geneExp_colnames, odir, RBP_dir, splicing_max='', splicing_dir='', splicing_amount=False, n_epoch=100, CIRIdeepA=False):
    '''
    Generator yielding one training batch (up to 4 sample comparisons) at a
    time, while appending the held-out test split of each batch into
    `test_data` (dict of lists) during epoch 0 only.

    Yields dicts with keys 'X', 'Y', 'rownames', 'colnames'.
    '''
    # NOTE(review): the n_epoch argument is immediately overridden by
    # config.n_epoch, so the parameter has no effect — confirm intended.
    n_epoch = config.n_epoch
    for i in range(n_epoch):
        sample_list = np.array(list(train_list.index))
        # one epoch: consume all sample comparisons in groups of up to 4
        while len(sample_list) > 0:
            # randomly select 4 sample comparisons
            # NOTE(review): reseeding with the same constant every batch makes
            # the selection deterministic across batches — confirm intended.
            random.seed(config.random_seed_eval)
            idx_inbatch = random.sample(range(len(sample_list)), min(4, len(sample_list)))
            sample_inbatch = sample_list[idx_inbatch]
            sample_list = np.delete(sample_list, idx_inbatch)
            print('epoch: %i; sample in batch: %s' %(i, ', '.join(sample_inbatch)))
            X_train_list = []
            Y_train_list = []
            rownames_train_list = []
            X_test_list = []
            Y_test_list = []
            rownames_test_list = []
            for sample in sample_inbatch:
                label_fn = train_list.loc[sample].label_fn
                # NOTE(review): splicing_amount is hard-coded to False here,
                # ignoring this function's splicing_amount parameter — verify.
                X_tmp, Y_tmp, rownames_tmp, colnames = \
                    construct_data_from_label(label_fn, sample, seqFeature_df, geneExp_absmax, geneExp_colnames, RBP_dir, splicing_dir=splicing_dir, splicing_max=splicing_max, splicing_amount=False, phase='train', CIRIdeepA=CIRIdeepA)
                inputdata = {'X':X_tmp, 'Y':Y_tmp, 'rownames':rownames_tmp}
                # hold out a fraction of each comparison as test data
                X_train_tmp, Y_train_tmp, X_test_tmp, Y_test_tmp, rownames_train_tmp, rownames_test_tmp = \
                    split_testdata_out(inputdata, CIRIdeepA=CIRIdeepA)
                X_train_list.append(X_train_tmp)
                Y_train_list.append(Y_train_tmp)
                rownames_train_list.append(rownames_train_tmp)
                X_test_list.append(X_test_tmp)
                Y_test_list.append(Y_test_tmp)
                rownames_test_list.append(rownames_test_tmp)
            # concatenate the (up to) 4 comparisons into one batch
            X_train = np.vstack(X_train_list)
            if not CIRIdeepA:
                # binary labels are 1-D vectors
                Y_train = np.hstack(Y_train_list)
                Y_test = np.hstack(Y_test_list)
            else:
                # CIRIdeepA one-hot labels are 2-D matrices
                Y_train = np.vstack(Y_train_list)
                Y_test = np.vstack(Y_test_list)
            rownames_train = np.hstack(rownames_train_list)
            X_test = np.vstack(X_test_list)
            rownames_test = np.hstack(rownames_test_list)
            # accumulate the held-out test split only during epoch 0
            if i == 0:
                test_data['X'].append(X_test)
                test_data['Y'].append(Y_test)
                test_data['rownames'].append(rownames_test)
            # yield the training data for these 4 samples
            train_data = {'X':X_train, 'Y':Y_train, 'rownames':rownames_train, 'colnames':colnames}
            yield train_data
    return
###################################
def split_testdata_out(inputdata, circid_test=[], test_prop=0.05, random_seed=12345, CIRIdeepA=False):
    '''
    Split one comparison's data into a held-out test set and a train set.

    For binary CIRI-deep the test set is class-balanced (half positives,
    half negatives); for CIRI-deep(A) a plain random fraction is taken.
    Returns (X_train, Y_train, X_test, Y_test, rownames_train, rownames_test).
    '''
    # the passed-in proportion/seed are overridden by the config module
    test_prop = config.test_prop
    random_seed = config.random_seed_test
    X, Y = inputdata['X'], inputdata['Y']
    rownames = inputdata['rownames']
    if not CIRIdeepA:
        # same number of positives and negatives in the test split
        n_test_per_class = int(len(rownames) * test_prop * 0.5)
        idx_pos = np.where(Y == 1)[0]
        idx_neg = np.where(Y == 0)[0]
        # shuffle each class with the same fixed seed for reproducibility
        np.random.seed(random_seed)
        np.random.shuffle(idx_pos)
        np.random.seed(random_seed)
        np.random.shuffle(idx_neg)
        pos_test, pos_train = idx_pos[:n_test_per_class], idx_pos[n_test_per_class:]
        neg_test, neg_train = idx_neg[:n_test_per_class], idx_neg[n_test_per_class:]
        X_test = np.vstack((X[pos_test, ], X[neg_test, ]))
        X_train = np.vstack((X[pos_train, ], X[neg_train, ]))
        # labels are reconstructed from the split sizes (positives first)
        Y_test = np.asarray([1.]*len(pos_test) + [0.]*len(neg_test))
        Y_train = np.asarray([1.]*len(pos_train) + [0.]*len(neg_train))
        rownames_test = np.hstack((rownames[pos_test], rownames[neg_test]))
        rownames_train = np.hstack((rownames[pos_train], rownames[neg_train]))
    else:
        n_test = int(len(rownames) * test_prop)
        np.random.seed(random_seed)
        order = list(range(len(rownames)))
        for _ in range(5):
            np.random.shuffle(order)
        test_sel, train_sel = order[:n_test], order[n_test:]
        X_test, X_train = X[test_sel, ], X[train_sel, ]
        Y_test, Y_train = Y[test_sel, ], Y[train_sel, ]
        rownames_test, rownames_train = rownames[test_sel], rownames[train_sel]
    return X_train, Y_train, X_test, Y_test, rownames_train, rownames_test
######################
def split_eval_train_data(inputdata, n_fold=5):
    '''
    Split a training batch into training and validation folds.

    A random 1/n_fold share (seeded by config.random_seed_eval) becomes the
    validation set; the remainder is kept for training.
    Returns (X_train, X_val, Y_train, Y_val, rownames_train, rownames_val).
    '''
    X, Y = inputdata['X'], inputdata['Y']
    rownames = inputdata['rownames']
    n_val = int(len(rownames)/n_fold)
    order = np.asarray(range(0, len(rownames)))
    np.random.seed(config.random_seed_eval)
    np.random.shuffle(order)
    val_sel, train_sel = order[:n_val], order[n_val:]
    X_val, X_train = X[val_sel, ], X[train_sel, ]
    Y_val, Y_train = Y[val_sel, ], Y[train_sel, ]
    rownames_val = rownames[val_sel]
    rownames_train = rownames[train_sel]
    return X_train, X_val, Y_train, Y_val, rownames_train, rownames_val
###################################
# idx_inbatch_list = split_data_into_balanced_minibatch(X_a_train, X_b_train, Y_train, rownames_train)
def split_data_into_balanced_minibatch(Y_train, batch_size=64, pos_prop=0.5, CIRIdeepA=False):
    '''
    Build index lists for class-balanced mini-batches.

    Class index pools are shuffled and tiled so the smaller class is reused
    until the larger one is consumed; each returned array holds the shuffled
    indices of one mini-batch.
    '''
    # the pos_prop argument is overridden by the config module
    pos_prop = config.pos_prop
    if not CIRIdeepA:
        pos_idx = np.where(Y_train == 1)[0]
        neg_idx = np.where(Y_train == 0)[0]
    else:
        # one-hot labels: column 0 == 1 marks the negative (no-difference) class
        neg_idx = np.where(Y_train[:,0] == 1)[0]
        pos_idx = np.where(Y_train[:,0] == 0)[0]
    if len(pos_idx) + len(neg_idx) < batch_size:
        # shrink the batch so both classes can still be represented
        batch_size = 2 * min(len(pos_idx), len(neg_idx))
    # fixed per-batch class quota
    pos_size = int(batch_size * pos_prop)
    neg_size = batch_size - pos_size
    n_batch = max(len(pos_idx)//pos_size, len(neg_idx)//neg_size)
    for _ in range(5):
        np.random.shuffle(pos_idx)
    pos_idx = np.tile(pos_idx, n_batch)
    for _ in range(5):
        np.random.shuffle(neg_idx)
    neg_idx = np.tile(neg_idx, n_batch)
    batches = []
    for b in range(0, n_batch):
        combined = np.hstack((pos_idx[b*pos_size:(b+1)*pos_size],
                              neg_idx[b*neg_size:(b+1)*neg_size]))
        np.random.shuffle(combined)
        batches.append(combined)
    return batches
###################################
def train_on_balanced_batch(model, inputdata, batch_size=64, validation_freq=10, CIRIdeepA=False):
    '''
    Train `model` on class-balanced mini-batches built from `inputdata`,
    then evaluate once after the final mini-batch.

    inputdata keys: X_train, Y_train, X_val, Y_val, rownames_train, rownames_val.
    Returns a metrics dict: for binary mode roc/aupr plus evaluate() outputs;
    for CIRIdeepA mode losses/accuracies plus macro/weighted precision,
    recall and F1.
    '''
    # construct balanced minibatch index lists
    Y_train = inputdata['Y_train']
    idx_inbatch_list = split_data_into_balanced_minibatch(Y_train, batch_size=batch_size, CIRIdeepA=CIRIdeepA)
    # initiate
    best_roc = 0
    best_loss = float('Inf')
    n_batch = len(idx_inbatch_list)
    # NOTE(review): if n_batch == 0 the variables returned below are never
    # assigned and this function raises NameError — confirm callers always
    # provide non-empty data.
    # train on batch for n times
    for i in range(0, n_batch):
        X_train, Y_train, X_val, Y_val, rownames_train, rownames_val = \
            inputdata['X_train'], inputdata['Y_train'], inputdata['X_val'], inputdata['Y_val'], inputdata['rownames_train'], inputdata['rownames_val']
        loss_on_batch = model.train_on_batch(X_train[idx_inbatch_list[i], ], Y_train[idx_inbatch_list[i], ])
        print('loss on batch%i: %.3f' % (i, loss_on_batch[0]))
        print('accuracy on batch%i: %.3f' % (i, loss_on_batch[1]))
        # validation happens only after the final mini-batch
        if i == n_batch - 1:
            Y_predict = model.predict(X_val)
            eval_val = model.evaluate(X_val, Y_val, verbose=1)
            eval_train = model.evaluate(X_train, Y_train, verbose=1)
            if not CIRIdeepA:
                roc = metrics.roc_auc_score(Y_val, Y_predict)
                pr = metrics.average_precision_score(Y_val, Y_predict)
            else:
                # row-wise argmax converted to one-hot so sklearn's
                # multi-class metrics can be applied
                Y_predict_categorical = (Y_predict == Y_predict.max(axis=1, keepdims=True)).astype('float')
                precision_score_macro = metrics.precision_score(Y_val, Y_predict_categorical, average="macro")
                precision_score_weighted = metrics.precision_score(Y_val, Y_predict_categorical, average="weighted")
                recall_macro = metrics.recall_score(Y_val, Y_predict_categorical, average='macro')
                recall_weighted = metrics.recall_score(Y_val, Y_predict_categorical, average="weighted")
                F1_macro = metrics.f1_score(Y_val, Y_predict_categorical, average='macro')
                F1_weighted = metrics.f1_score(Y_val, Y_predict_categorical, average='weighted')
            print('*' * 50)
            if not CIRIdeepA:
                print('batch %i: loss_val - %.3f; accuracy_val - %.3f; loss_train - %.3f; accuracy_train - %.3f; auroc_val - %.3f; aupr_val - %.3f' % (i, eval_val[0], eval_val[1], eval_train[0], eval_train[1], roc, pr))
            else:
                print('batch %i: loss_val - %.3f; accuracy_val - %.3f; loss_train - %.3f; accuracy_train - %.3f' % (i, eval_val[0], eval_val[1], eval_train[0], eval_train[1]))
            print('*' * 50)
    if not CIRIdeepA:
        return {'roc':roc, 'loss_eval':eval_val, 'loss_train':eval_train, 'aupr':pr}
    else:
        return {'loss_train':eval_train[0], 'acc_train':eval_train[1], 'loss_val':eval_val[0], 'acc_val':eval_val[1],
                'precision_score_macro':precision_score_macro, 'precision_score_weighted':precision_score_weighted,
                'recall_macro':recall_macro, 'recall_weighted':recall_weighted,
                'F1_macro':F1_macro, 'F1_weighted':F1_weighted}
###################################
def read_label_fn(label_fn, min_read_cov=20, significance=0.1, CIRIdeepA=False):
    '''
    Parse one tab-separated label file for a sample comparison.

    Rows whose read coverage (I+S) in either sample is at most min_read_cov
    are skipped. For CIRI-deep returns (pos, neg) dicts; for CIRI-deep(A)
    returns (no_diff, a_higher, b_higher) dicts keyed by the direction of
    delta.mle. Each dict maps circRNA ID -> posterior probability.
    '''
    if not CIRIdeepA:
        pos, neg = {}, {}
        with open(label_fn, 'r') as fh:
            header = None
            for line in fh:
                fields = line.strip().split('\t')
                if header is None:
                    # map column name -> column index from the first line
                    header = {name: idx for idx, name in enumerate(fields)}
                    continue
                post_pr = float(fields[header['post_pr']])
                eid = fields[header['ID']]
                I1 = int(fields[header['I1']])
                I2 = int(fields[header['I2']])
                S1 = int(fields[header['S1']])
                S2 = int(fields[header['S2']])
                delta = float(fields[header['delta.mle']])
                if I1 + S1 <= min_read_cov or I2 + S2 <= min_read_cov:
                    continue  # insufficient coverage in one of the samples
                if post_pr > 1 - significance and min(S1, S2) > 2:
                    pos[eid] = post_pr
                elif post_pr < significance and min(I1, S1) > 2 and min(I2, S2) > 2:
                    neg[eid] = post_pr
        return pos, neg
    else:
        no_diff, a_higher, b_higher = {}, {}, {}
        with open(label_fn, 'r') as fh:
            header = None
            for line in fh:
                fields = line.strip().split('\t')
                if header is None:
                    header = {name: idx for idx, name in enumerate(fields)}
                    continue
                post_pr = float(fields[header['post_pr']])
                eid = fields[header['ID']]
                I1 = int(fields[header['I1']])
                I2 = int(fields[header['I2']])
                S1 = int(fields[header['S1']])
                S2 = int(fields[header['S2']])
                # the sign of delta.mle decides the direction of the change
                delta = float(fields[header['delta.mle']])
                if I1 + S1 <= min_read_cov or I2 + S2 <= min_read_cov:
                    continue
                if post_pr > 1 - significance and min(S1, S2) > 2 and delta < 0:
                    a_higher[eid] = post_pr
                if post_pr > 1 - significance and min(S1, S2) > 2 and delta > 0:
                    b_higher[eid] = post_pr
                elif post_pr < significance and min(I1, S1) > 2 and min(I2, S2) > 2:
                    no_diff[eid] = post_pr
        return no_diff, a_higher, b_higher
def read_label_fn_predict(label_fn):
    """Read a prediction list file (one circRNA ID per line) into a list."""
    with open(label_fn, 'r') as fh:
        return fh.read().strip().split('\n')
def read_train_list(fn):
    '''
    Load the sample-comparison list as a DataFrame indexed by sample ID.

    Expected tab-separated columns include at least label_fn.
    '''
    return pd.read_table(fn, index_col=0)
def read_sequence_feature(seqFeature_fn):
    '''
    Load the cis sequence-feature table (tab-separated, first column is the
    circRNA ID index); the first 13 feature columns are discarded.
    '''
    table = pd.read_csv(seqFeature_fn, sep='\t', index_col=0)
    return table.iloc[:, 13:]
def read_geneExp_absmax(fn):
    '''
    Read the per-gene maximum expression table ('max' column).

    Returns (values array, list of gene names).
    '''
    series = pd.read_csv(fn, index_col=0, sep='\t')['max']
    return series.values, list(series.index)
def read_geneExp(sample, geneExp_absmax, nrow=1, phase='train', RBPexp_dir=''):
    '''
    Load one sample's RBP expression from <RBPexp_dir>/<sample>_rpb.csv,
    scale each gene by its maximum and tile the vector into nrow rows.
    '''
    expr_fn = os.path.join(RBPexp_dir, sample + '_rpb.csv')
    expr = pd.read_csv(expr_fn, index_col=0, sep='\t').transpose()
    # each gene's expression relative to that gene's maximum
    scaled = expr.values.flatten() / geneExp_absmax
    if phase == 'predict':
        # clip values above the training-time maximum
        scaled[scaled > 1] = 1
    return np.tile(scaled, (nrow, 1))
def read_splicing_amount(sample1, sample2, splicing_max, eid_list, phase='train', splicing_dir=''):
    '''
    Load normalized splicing-amount columns for both samples.

    Each <splicing_dir>/<sample>.output table is restricted to eid_list and
    its splicing_amount is divided by the per-circRNA max_splicing_amount
    from `splicing_max`. Returns (col1, col2, colnames) where each column is
    an (n, 1) array.
    '''
    max_per_eid = splicing_max.loc[eid_list]

    def _load_one(sample):
        # one normalized column for one sample, aligned to eid_list
        fn = os.path.join(splicing_dir, sample + '.output')
        col = pd.read_csv(fn, index_col=0, sep='\t').loc[eid_list, 'splicing_amount']
        col = col / max_per_eid['max_splicing_amount']
        if phase == 'predict':
            # clip values above the training-time maximum
            col[col > 1] = 1
        return np.asarray(col).reshape(-1, 1)

    return _load_one(sample1), _load_one(sample2), ['splicing_amount']
def construct_data_from_label(label_fn, sample, seqFeature_df, geneExp_absmax, geneExp_colnames, RBPexp_dir, splicing_max='', splicing_dir='', splicing_amount=False, sep='_', CIRIdeepA=False, phase='train'):
    '''
    Build the feature matrix for one sample comparison "<a><sep><b>".

    X stacks cis sequence features with both samples' RBP expression
    (plus, in the non-CIRIdeepA branch, both splicing-amount columns).
    Returns (X, Y, rownames, colnames) in 'train' phase and
    (X, rownames, colnames) in 'predict' phase.
    '''
    if CIRIdeepA:
        if phase == 'train':
            no_diff, a_higher, b_higher = read_label_fn(label_fn, CIRIdeepA=CIRIdeepA)
            # keep only circRNAs that have sequence features
            no_diff_eid = list(seqFeature_df.index.intersection(no_diff.keys()))
            a_higher_eid = list(seqFeature_df.index.intersection(a_higher.keys()))
            b_higher_eid = list(seqFeature_df.index.intersection(b_higher.keys()))
            # each circRNA appears twice: once per sample order (a,b) and (b,a)
            eid_list = no_diff_eid + a_higher_eid + b_higher_eid + no_diff_eid + a_higher_eid + b_higher_eid
        elif phase == 'predict':
            eid_list = list(seqFeature_df.index.intersection(read_label_fn_predict(label_fn)))
        seqFeature_df = seqFeature_df.loc[eid_list]
        # trans-feature
        # NOTE(review): this branch splits on a literal '_' and ignores the
        # sep parameter, unlike the non-CIRIdeepA branch — confirm intended.
        sample_a = sample.split('_')[0]
        sample_b = sample.split('_')[1]
        if phase == 'train':
            geneExp_a = read_geneExp(sample_a, geneExp_absmax, nrow=int(len(eid_list)/2), RBPexp_dir=RBPexp_dir)
            geneExp_b = read_geneExp(sample_b, geneExp_absmax, nrow=int(len(eid_list)/2), RBPexp_dir=RBPexp_dir)
            # augmentation: second half of the rows uses the swapped sample order
            geneExp_left = np.vstack((geneExp_a, geneExp_b))
            geneExp_right = np.vstack((geneExp_b, geneExp_a))
        elif phase == 'predict':
            geneExp_a = read_geneExp(sample_a, geneExp_absmax, nrow=len(eid_list), phase='predict', RBPexp_dir=RBPexp_dir)
            geneExp_b = read_geneExp(sample_b, geneExp_absmax, nrow=len(eid_list), phase='predict', RBPexp_dir=RBPexp_dir)
        if phase == 'train':
            X = np.hstack((seqFeature_df, geneExp_left, geneExp_right))
            # classes: 0 = no_diff, 1 = a_higher, 2 = b_higher; the swapped
            # duplicate half gets classes 1 and 2 exchanged
            Y = np_utils.to_categorical(
                np.asarray([0.]*len(no_diff_eid) + [1.]*len(a_higher_eid) + [2.]*len(b_higher_eid) +
                           [0.]*len(no_diff_eid) + [2.]*len(a_higher_eid) + [1.]*len(b_higher_eid)))
            rownames = np.asarray([sample + '-' + x for x in eid_list])
            geneExp_simple_colnames = [gene for gene in geneExp_colnames]
            colnames = np.hstack([seqFeature_df.columns.values, geneExp_colnames])
        elif phase == 'predict':
            X = np.hstack((seqFeature_df, geneExp_a, geneExp_b))
            rownames = np.asarray([sample + '-' + x for x in eid_list])
            geneExp_simple_colnames = [gene for gene in geneExp_colnames]
            colnames = np.hstack([seqFeature_df.columns.values, geneExp_colnames])
        if phase == 'train':
            return X, Y, rownames, colnames
        elif phase == 'predict':
            return X, rownames, colnames
    else:
        if phase == 'train':
            pos, neg = read_label_fn(label_fn, CIRIdeepA=CIRIdeepA)
            pos_eid = list(seqFeature_df.index.intersection(pos.keys()))
            neg_eid = list(seqFeature_df.index.intersection(neg.keys()))
            eid_list = pos_eid + neg_eid
        elif phase == 'predict':
            eid_list = list(seqFeature_df.index.intersection(read_label_fn_predict(label_fn)))
        # cis-feature
        seqFeature_df = seqFeature_df.loc[eid_list]
        # trans-feature
        sample_a = sample.split(sep)[0]
        sample_b = sample.split(sep)[1]
        geneExp_a = read_geneExp(sample_a, geneExp_absmax, nrow=len(eid_list), phase=phase, RBPexp_dir=RBPexp_dir)
        geneExp_b = read_geneExp(sample_b, geneExp_absmax, nrow=len(eid_list), phase=phase, RBPexp_dir=RBPexp_dir)
        # splicing amount
        splicing_amount_a, splicing_amount_b, splicing_amount_colnames = read_splicing_amount(sample_a, sample_b, splicing_max, eid_list, phase=phase, splicing_dir=splicing_dir)
        X = np.hstack((seqFeature_df, geneExp_a, geneExp_b, splicing_amount_a, splicing_amount_b))
        if phase == 'train':
            # binary labels: positives first, matching eid_list order
            Y = np.asarray([1.]*len(pos_eid) + [0.]*len(neg_eid))
        rownames = np.asarray([sample + '-' + x for x in eid_list])
        geneExp_simple_colnames = [sample + '-' + gene for sample in [sample_a, sample_b] for gene in geneExp_colnames]
        colnames = np.hstack([seqFeature_df.columns.values, geneExp_simple_colnames, ['splicing_a', 'splicing_b']])
        if phase == 'train':
            return X, Y, rownames, colnames
        elif phase == 'predict':
            return X, rownames, colnames
def write_current_pred(test_data, y_pred, fn, CIRIdeepA, phase='train'):
    '''
    Dump per-circRNA predictions to a tab-separated file.

    In 'train' phase each row also carries the true label(s) from
    test_data['Y'] before the predicted scores; in 'predict' phase only
    the predicted scores are written. CIRIdeepA rows carry three scores,
    binary rows a single score.
    '''
    if phase not in ('train', 'predict'):
        return
    rownames = test_data['rownames']
    with open(fn, 'w') as fo:
        for i in range(len(rownames)):
            if phase == 'predict':
                if CIRIdeepA:
                    fields = [rownames[i], str(y_pred[i, 0]), str(y_pred[i, 1]), str(y_pred[i, 2])]
                else:
                    fields = [rownames[i], str(y_pred[i, 0])]
            else:
                if CIRIdeepA:
                    fields = [rownames[i],
                              str(test_data['Y'][i, 0]), str(test_data['Y'][i, 1]), str(test_data['Y'][i, 2]),
                              str(y_pred[i, 0]), str(y_pred[i, 1]), str(y_pred[i, 2])]
                else:
                    fields = [rownames[i], str(test_data['Y'][i]), str(y_pred[i, 0])]
            fo.write('\t'.join(fields) + '\n')
    return
def plot_auroc(roc_val, roc_test, loss_training, loss_eval, loss_test, test_freq, odir):
    '''
    Save two diagnostic plots into odir: the validation/test AUROC curves
    ("evaluation test roc.png") and the train/eval/test loss curves
    ("loss_training_eval_test.png"). Test series are sampled every
    test_freq steps.
    '''
    # AUROC: validation at every step, test at every test_freq-th step
    plt.figure(figsize=(6,6))
    plt.title('Evaluation/Test ROC')
    val_steps = range(1, len(roc_val) + 1)
    plt.plot(val_steps, roc_val)
    plt.plot(range(test_freq, len(roc_val) + 1, test_freq), roc_test)
    plt.ylabel('Auroc')
    plt.xlabel('Step')
    plt.savefig(os.path.join(odir, 'evaluation test roc.png'))
    # losses: training, evaluation and (sparser) test
    plt.figure(figsize=(6,6))
    plt.title('Training/Evaluation/Test loss')
    plt.plot(range(1, len(loss_training) + 1), loss_training)
    plt.plot(range(1, len(loss_eval) + 1), loss_eval)
    plt.plot(range(test_freq, len(loss_eval) + 1, test_freq), loss_test)
    plt.ylabel('Loss')
    plt.xlabel('Step')
    plt.savefig(os.path.join(odir, 'loss_training_eval_test.png'))
    return
def read_splicing_amount_max(fn):
    """Read the per-circRNA maximum splicing-amount table (tab-separated, indexed by ID)."""
    return pd.read_csv(fn, index_col=0, sep='\t')
def eval_continuous_label_auc(y_true, y_pred, outputdir, prefix):
    '''
    Evaluate final test AUROC: saves the raw labels/predictions as .npy
    files and writes an ROC-curve plot to <outputdir>/<prefix>.png.
    '''
    auroc = metrics.roc_auc_score(y_true, y_pred)
    fpr, tpr, threshold = metrics.roc_curve(y_true, y_pred)
    # persist the raw arrays for later inspection
    np.save(os.path.join(outputdir, 'y_true.npy'), y_true)
    np.save(os.path.join(outputdir, 'y_pred.npy'), y_pred)
    area = metrics.auc(fpr, tpr)
    plt.figure(figsize=(6,6))
    plt.title('Test AUROC')
    plt.plot(fpr, tpr, 'b', label = 'Test AUC = %0.3f' % area)
    plt.legend(loc = 'lower right')
    # diagonal reference line for a random classifier
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.savefig(os.path.join(outputdir, prefix+'.png'))
    return
def read_roc(odir):
    '''
    Parse <odir>/validation_test_auroc.txt.

    Lines starting with 'v' (the header) are skipped; returns
    (roc_val, roc_test) as lists of strings, where test entries equal to
    'none' are dropped.
    '''
    roc_val, roc_test = [], []
    path = os.path.join(odir, 'validation_test_auroc.txt')
    with open(path, 'r') as fh:
        for line in fh:
            if line.startswith('v'):
                continue  # header row
            cols = line.strip().split('\t')
            roc_val.append(cols[0])
            if cols[1] != 'none':
                roc_test.append(cols[1])
    return roc_val, roc_test
def train_model(fn, odir, geneExp_absmax, seqFeature, RBP_dir, splicing_max='', splicing_dir='', splicing_amount=True, CIRIdeepA=False):
    '''
    Full training driver: loads features, streams batches from
    generate_data_batch, trains/evaluates a SiameseNet per step, tests every
    test_freq steps with early stopping on test loss, and writes logs,
    per-step model weights, predictions and plots into odir.
    '''
    if not os.path.isdir(odir):
        os.mkdir(odir)
    print('output directory: %s' % odir)
    print('input train list: %s' % fn)
    train_list = read_train_list(fn)
    siam_model = SiameseNet(**config.architecture, splicing_amount=splicing_amount, CIRIdeepA=CIRIdeepA)
    # NOTE(review): test_data is initialized empty and never filled — the
    # final evaluation below uses it instead of test_data_lst, which looks
    # like a bug; confirm against the intended behavior.
    test_data = {'X':np.empty((0, siam_model.n_in), dtype='float32'), 'Y':np.asarray([], dtype='float32'), 'rownames':np.asarray([], dtype='str')}
    # filled by generate_data_batch during epoch 0, concatenated at first test
    test_data_lst = {'X':[], 'Y':[], 'rownames':[]}
    print('Loading features...')
    seqFeature_df = read_sequence_feature(seqFeature)
    geneExp_absmax, geneExp_colnames = read_geneExp_absmax(geneExp_absmax)
    if splicing_amount:
        splicing_max = read_splicing_amount_max(splicing_max)
    else:
        splicing_max = []
    print('Training start...')
    data_batch_generator = generate_data_batch(
        train_list, test_data_lst, seqFeature_df, geneExp_absmax, geneExp_colnames, odir, RBP_dir, splicing_dir=splicing_dir, splicing_max=splicing_max, splicing_amount=splicing_amount, CIRIdeepA=CIRIdeepA)
    # initiate bookkeeping
    roc_val = []
    roc_test = []
    loss_train_lst = []
    acc_train_lst = []
    loss_eval_lst = []
    acc_eval_lst = []
    loss_test_lst = []
    acc_test_lst = []
    best_roc = 0
    best_loss = float('Inf')
    best_loss_mean = float('Inf')
    patience_loss_mean = 0
    patience_loss_mean_limit = 10
    patience_limit = config.patience_limit
    n_batch = 0
    patience = 0
    step = 0
    # test every (#samples / 4) steps, i.e. roughly once per epoch
    test_freq = math.ceil(len(train_list.index)/4)
    # log file headers
    print('step', 'auroc_eval', 'aupr_eval', 'loss_eval', 'acc_eval', 'loss_train', 'acc_train', sep='\t', file=open(odir + '/roc_pr_losseval_losstrain.log', 'a+'))
    print('step', 'patience', 'auroc_test', 'aupr_test', 'loss_test', 'acc_test', sep='\t', file=open(odir + '/roc_pr_loss_test.log', 'a+'))
    # training start
    for data_batch in data_batch_generator:
        step += 1
        X = data_batch['X']
        Y = data_batch['Y']
        rownames = data_batch['rownames']
        # train and evaluation on a fresh train/validation split
        inputdata = {'X':X, 'Y':Y, 'rownames':rownames}
        X_train, X_val, Y_train, Y_val, rownames_train, rownames_val = split_eval_train_data(inputdata, n_fold=config.n_fold)
        inputdata = {'X_train':X_train, 'X_val':X_val, 'Y_train':Y_train, 'Y_val':Y_val, 'rownames_train':rownames_train, 'rownames_val':rownames_val}
        print('step %i optimization start.' % step)
        best_metrics = siam_model.fit(inputdata)
        if not CIRIdeepA:
            # evaluate() returns [loss, accuracy]
            loss_train_lst.append(best_metrics['loss_train'][0])
            acc_train_lst.append(best_metrics['loss_train'][1])
            loss_eval_lst.append(best_metrics['loss_eval'][0])
            acc_eval_lst.append(best_metrics['loss_eval'][1])
            print(step, best_metrics['roc'], best_metrics['aupr'], best_metrics['loss_eval'][0], best_metrics['loss_eval'][1], best_metrics['loss_train'][0], best_metrics['loss_train'][1], sep='\t', file=open(odir + '/roc_pr_losseval_losstrain.log', 'a+'))
            roc_val.append(best_metrics['roc'])
        else:
            loss_train_lst.append(best_metrics['loss_train'])
            acc_train_lst.append(best_metrics['acc_train'])
            loss_eval_lst.append(best_metrics['loss_val'])
            acc_eval_lst.append(best_metrics['acc_val'])
            print('%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' %
                  (step, best_metrics['precision_score_macro'], best_metrics['precision_score_weighted'],
                   best_metrics['recall_macro'], best_metrics['recall_weighted'],
                   best_metrics['F1_macro'], best_metrics['F1_weighted'],
                   best_metrics['loss_val'], best_metrics['acc_val'],
                   best_metrics['loss_train'], best_metrics['acc_train']),
                  file=open(odir + '/roc_pr_losseval_losstrain.log', 'a+'))
        # metrics of test dataset, every test_freq steps
        if step % test_freq == 0:
            if step <= test_freq:
                # first test: collapse the per-batch test lists into arrays
                test_data_lst['X'] = np.concatenate(test_data_lst['X'])
                test_data_lst['Y'] = np.concatenate(test_data_lst['Y'])
                test_data_lst['rownames'] = np.concatenate(test_data_lst['rownames'])
            loss_test = siam_model.model.evaluate(test_data_lst['X'], test_data_lst['Y'], verbose=1)
            Y_pred = siam_model.predict(test_data_lst)
            if not CIRIdeepA:
                auroc_test = metrics.roc_auc_score(test_data_lst['Y'], Y_pred)
                aupr_test = metrics.average_precision_score(test_data_lst['Y'], Y_pred)
                roc_test.append(auroc_test)
                loss_test_lst.append(loss_test[0])
                acc_test_lst.append(loss_test[1])
                # write predict result
                write_current_pred(test_data_lst, Y_pred, os.path.join(odir, 'step_{0}_pred.txt'.format(step)), CIRIdeepA, phase='train')
                # plot auroc
                plot_auroc(roc_val, roc_test, loss_train_lst, loss_eval_lst, loss_test_lst, test_freq, odir)
            else:
                # argmax -> one-hot for sklearn multi-class metrics
                Y_predict_categorical = (Y_pred == Y_pred.max(axis=1, keepdims=True)).astype('float')
                precision_score_macro = metrics.precision_score(test_data_lst['Y'], Y_predict_categorical, average="macro")
                precision_score_weighted = metrics.precision_score(test_data_lst['Y'], Y_predict_categorical, average="weighted")
                recall_macro = metrics.recall_score(test_data_lst['Y'], Y_predict_categorical, average='macro')
                recall_weighted = metrics.recall_score(test_data_lst['Y'], Y_predict_categorical, average="weighted")
                F1_macro = metrics.f1_score(test_data_lst['Y'], Y_predict_categorical, average='macro')
                F1_weighted = metrics.f1_score(test_data_lst['Y'], Y_predict_categorical, average='weighted')
                loss_test_lst.append(loss_test[0])
                acc_test_lst.append(loss_test[1])
                write_current_pred(test_data_lst, Y_pred, os.path.join(odir, 'step_{0}_pred.txt'.format(step)), CIRIdeepA, phase='train')
                # plot accuracy curves in place of AUROC for the 3-class model
                plot_auroc(acc_eval_lst, acc_test_lst, loss_train_lst, loss_eval_lst, loss_test_lst, test_freq, odir)
            # early stopping on test loss; weights are saved either way
            if loss_test[0] < best_loss:
                best_loss = loss_test[0]
                siam_model.model.save(os.path.join(odir, str(step)+'_model_weight.h5'))
                patience = 0
            else:
                patience += 1
                siam_model.model.save(os.path.join(odir, str(step)+'_model_weight.h5'))
                if patience > patience_limit:
                    print('patience > patience_limit. Early Stop.')
                    break
            # rebuild the model and reload weights to reset Keras session state
            del siam_model
            K.clear_session()
            siam_model = SiameseNet(**config.architecture, splicing_amount=splicing_amount, CIRIdeepA=CIRIdeepA)
            siam_model.model.load_weights(os.path.join(odir, str(step)+'_model_weight.h5'))
            # print test result
            print('*' * 50)
            print('test result:')
            if not CIRIdeepA:
                print('step %i: auroc-%.3f; aupr-%.3f; best_loss=%.3f patience-%i' % (step, auroc_test, aupr_test, best_loss, patience))
                print('*' * 50)
                print(step, patience, auroc_test, aupr_test, loss_test[0], loss_test[1], sep='\t', file=open(odir + '/roc_pr_loss_test.log', 'a+'))
            else:
                print('step %i: acc-%.3f; loss=%.3f; best loss=%.3f; patience-%i' % (step, loss_test[1], loss_test[0], best_loss, patience))
                print('*' * 50)
                print('%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' %
                      (step, patience, precision_score_macro, precision_score_weighted,
                       recall_macro, recall_weighted,
                       F1_macro, F1_weighted,
                       loss_test[0], loss_test[1]),
                      file=open(odir + '/roc_pr_loss_test.log', 'a+'))
    # final save and evaluation
    siam_model.model.save(os.path.join(odir, 'final_model_weight.h5'))
    Y_pred = siam_model.predict(test_data)
    auroc_test = metrics.roc_auc_score(test_data['Y'], Y_pred)
    print('auroc after 10 epoch:', auroc_test)
    eval_continuous_label_auc(test_data['Y'], Y_pred, odir, 'auroc')
    return
def main():
    '''
    Command-line entry point with two subcommands:
    `train` (fit CIRI-deep / CIRI-deep(A) from a train list) and
    `predict` (score circRNA lists with a saved model).
    '''
    prog = 'CIRI-deep'
    description = 'CIRI-deep predicts differentially spliced circRNAs between biological samples. This program can be used for training and predicting.'
    epilog = 'Zihan Zhou <zhouzihan2018m@big.ac.cn>'
    argparser = ap.ArgumentParser(prog=prog, description=description, epilog=epilog)
    subparser = argparser.add_subparsers(dest='subcommand')
    train = subparser.add_parser('train', help='Train CIRI-deep or CIRI-deep(A) using customized data.')
    predict = subparser.add_parser('predict', help='Predict differentially spliced circRNA using CIRI-deep.')
    predict.add_argument('-predict_list', help='file to predict.', dest='predict_list', required=True)
    predict.add_argument('-model_path', help='model path.', dest='model_path', required=True)
    predict.add_argument('-outdir', help='output directory.', dest='outdir', required=True)
    predict.add_argument('-geneExp_absmax', help='gene expression max value.', dest='geneExp_absmax', required=True)
    predict.add_argument('-seqFeature', help='cis feature for all circular RNA.', dest='seqFeature', required=True)
    predict.add_argument('-RBP_dir', help='RBP directory.', dest='RBP_dir', required=True)
    predict.add_argument('-splicing_max', help='splicing amount max value.', default='', dest='splicing_max')
    predict.add_argument('-splicing_dir', help='splicing_amount directory.', default='', dest='splicing_dir')
    predict.add_argument('-sep', help='seperate two samples.', default='_', dest='sep')
    predict.add_argument('--CIRIdeepA', action='store_true', dest='CIRIdeepA')
    # NOTE(review): the help text for -train_list says 'file to predict.' —
    # looks copy-pasted from the predict subparser.
    train.add_argument('-train_list', help='file to predict.', dest='train_list', required=True)
    train.add_argument('-outdir', help='output directory.', dest='outdir', required=True)
    train.add_argument('-geneExp_absmax', help='gene expression max value.', dest='geneExp_absmax', required=True)
    train.add_argument('-seqFeature', help='cis feature for all circular RNA.', dest='seqFeature', required=True)
    train.add_argument('-splicing_max', help='splicing amount max value.', dest='splicing_max', default='')
    train.add_argument('-RBP_dir', help='RBP directory.', dest='RBP_dir', required=True)
    train.add_argument('-splicing_dir', help='splicing_amount directory.', dest='splicing_dir', default='')
    train.add_argument('--CIRIdeepA', action='store_true', dest='CIRIdeepA')
    args = argparser.parse_args()
    subcommand = args.subcommand
    if subcommand == 'train':
        train_list = args.train_list
        outdir = args.outdir
        geneExp_absmax = args.geneExp_absmax
        seqFeature = args.seqFeature
        splicing_max = args.splicing_max
        splicing_dir = args.splicing_dir
        RBP_dir = args.RBP_dir
        CIRIdeepA = args.CIRIdeepA
        # splicing-amount features are only used by the binary CIRI-deep model
        if CIRIdeepA:
            splicing_amount = False
        else:
            splicing_amount = True
        train_model(train_list, outdir, geneExp_absmax, seqFeature, RBP_dir, splicing_max=splicing_max, splicing_dir=splicing_dir, splicing_amount=splicing_amount, CIRIdeepA=CIRIdeepA)
    if subcommand == 'predict':
        geneExp_absmax = args.geneExp_absmax
        seqFeature = args.seqFeature
        splicing_max = args.splicing_max
        test_fn = args.predict_list
        model_path = args.model_path
        out_dir = args.outdir
        RBP_dir = args.RBP_dir
        splicing_dir = args.splicing_dir
        sep = args.sep
        CIRIdeepA = args.CIRIdeepA
        if CIRIdeepA:
            splicing_amount = False
        else:
            splicing_amount = True
        phase = 'predict'
        print('Loading file from %s' % seqFeature)
        seqFeature_df = read_sequence_feature(seqFeature)
        print('Loading gene expression from %s' % geneExp_absmax)
        geneExp_absmax, geneExp_colnames = read_geneExp_absmax(geneExp_absmax)
        if splicing_amount:
            print('Loading splicing amount feature from %s' % splicing_max)
            splicing_max = read_splicing_amount_max(splicing_max)
        print('Loading model weight from %s' % model_path)
        clf = load_model(model_path)
        test_list = read_train_list(test_fn)
        print('Prediction...')
        # one output file per sample comparison in the predict list
        for i in range(len(test_list)):
            inputfile = test_list.label_fn[i]
            sample = test_list.index[i]
            X_tmp, rownames_tmp, colnames = construct_data_from_label(inputfile,
                sample, seqFeature_df, geneExp_absmax, geneExp_colnames, RBP_dir, splicing_max=splicing_max, splicing_amount=splicing_amount, phase=phase, splicing_dir=splicing_dir, sep=sep, CIRIdeepA=CIRIdeepA)
            test_data = {'X':X_tmp, 'rownames':rownames_tmp}
            y_pred = clf.predict([test_data['X']])
            write_current_pred(test_data, y_pred, '%s/%s.txt' % (out_dir, sample), CIRIdeepA, phase='predict')
    return


if __name__ == '__main__':
    main()
\ No newline at end of file
# config.py — central hyper-parameters imported as `config` by the training script.
import os

# fraction of each sample comparison held out as test data (split_testdata_out)
test_prop = 0.01
# RNG seed for the train/test split (split_testdata_out)
random_seed_test = 12345
# RNG seed for batch sampling and the train/validation split
random_seed_eval = 12345
# 1/n_fold of each batch becomes the validation set (split_eval_train_data)
n_fold = 10
# fraction of positives in each balanced mini-batch (split_data_into_balanced_minibatch)
pos_prop = 0.7
# NOTE(review): class_weight is defined here but not referenced in the visible code
class_weight = {0:1, 1:1.2, 2:1.2}
# keyword arguments expanded into SiameseNet(**architecture)
architecture = {
'n_in': 6523,
'hidden': [1200, 500, 300, 200],
'drop_out': [0, 0.5, 0.3, 0.2, 0.1],
'batch_size': 512,
'learning_rate': 0.001
}
# NOTE(review): train_model recomputes test_freq from the train list, so this value is unused there
test_freq = 6000
# early-stopping patience (test evaluations without test-loss improvement)
patience_limit = 200
# number of training epochs (overrides generate_data_batch's n_epoch argument)
n_epoch = 100
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment