|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119 |
# Module metadata.
__author__ = "Alireza Tajmirriahi"
__version__ = "1.0.0"
__maintainer__ = "Alireza Tajmirriahi"
# NOTE(review): this value looks like an obfuscation placeholder rather than
# a real address — confirm against the original file.
__email__ = "[email protected]"
__project__ = "CancerGenomics"
-
- import argparse
- import os
-
- import numpy as np
- import pandas as pd
-
- from tqdm import tqdm
-
# Default locations and thresholds. The command-line options declared in the
# ``__main__`` section use these same values as their defaults.
DATA_DIR = '/PROJECTS/Taj/1_PreprocessData/data'  # root with one sub-directory of omics files per cancer type
MATRICES_PATH = '/PROJECTS/Taj/1_PreprocessData/output'  # directory holding the preprocessed matrices
OUT_DIR = 'ICGC'  # directory that receives the train/test CSV files
MIN_SAMPLES = 10  # minimum number of rows a matrix needs to be included; "50" was noted as an alternative
TRAIN_SPLIT = 0.70  # probability with which a sample is assigned to the training set
-
-
def aggregate_dataframes(data):
    """Stack per-label DataFrames onto one shared, sorted set of columns.

    Args:
        data: Mapping from label to DataFrame. The frames may have
            different columns.

    Returns:
        One DataFrame containing the rows of every input frame, in mapping
        order. Its columns are the sorted union of all input columns, with
        cells for columns a frame did not have set to 0, followed by a
        ``label`` column holding the mapping key each row came from.
    """
    column_sets = [set(frame.columns) for frame in data.values()]
    all_columns = sorted(set.union(*column_sets))

    def _aligned(frame, label):
        # reindex returns a new frame, so the caller's frame is not modified.
        widened = frame.reindex(columns=all_columns, fill_value=0)
        widened['label'] = label
        return widened

    return pd.concat([_aligned(frame, label) for label, frame in data.items()])
-
-
def agg_mutation(mat, cancer_type, file_name):
    """Replace the 1-entries of a donor-by-gene matrix with mutation counts.

    The mutation table is read (tab-separated) from
    ``{args.DATA_DIR}/{cancer_type}/{file_name}`` and the number of rows per
    ``(icgc_donor_id, gene_symbol)`` pair is counted. Relies on the
    module-level ``args`` namespace created when the script is run.

    Args:
        mat: DataFrame whose index values are looked up as ``icgc_donor_id``
            and whose column names are looked up as ``gene_symbol``; cells
            equal to 1 mark the (donor, gene) pairs to fill in.
        cancer_type: Name of the sub-directory of ``args.DATA_DIR`` to read.
        file_name: Name of the mutation file inside that sub-directory.

    Returns:
        A copy of ``mat`` in which every cell that was 1 holds the number of
        mutation rows for that donor and gene; other cells are unchanged.

    Raises:
        KeyError: If a marked (donor, gene) pair does not occur in the file.
    """
    df = pd.read_csv(f'{args.DATA_DIR}/{cancer_type}/{file_name}', sep='\t')
    mutation_count = df.groupby(['icgc_donor_id', 'gene_symbol']).size()

    result = mat.copy()
    for row in tqdm(result.index):
        marked = (mat.loc[row] == 1).to_numpy()
        for col in mat.columns[marked]:
            # Assign in one .loc step. The chained form
            # ``result.loc[row][col] = ...`` may write to a temporary copy
            # and leave ``result`` untouched.
            result.loc[row, col] = mutation_count.loc[(row, col)]
    return result
-
-
def agg_expression(mat, cancer_type, file_name):
    """Replace the 1-entries of a donor-by-gene matrix with expression values.

    The expression table is read from
    ``{args.DATA_DIR}/{cancer_type}/{file_name}``. Relies on the module-level
    ``args`` namespace created when the script is run.

    NOTE(review): the file is parsed with the default comma separator even
    though the caller passes a ``.tsv`` name — confirm the on-disk format.

    Args:
        mat: DataFrame whose index values are looked up as ``icgc_donor_id``
            and whose column names are looked up as ``gene_id``; cells equal
            to 1 mark the (donor, gene) pairs to fill in.
        cancer_type: Name of the sub-directory of ``args.DATA_DIR`` to read.
        file_name: Name of the expression file inside that sub-directory.

    Returns:
        A float64 copy of ``mat`` in which every cell that was 1 holds the
        ``normalized_expression_value`` of the first record for that donor
        and gene; other cells keep their (float-converted) value.

    Raises:
        KeyError: If a marked (donor, gene) pair does not occur in the file,
            or the file lacks one of the expected columns.
    """
    df = pd.read_csv(f'{args.DATA_DIR}/{cancer_type}/{file_name}')
    df = df.drop(columns=['Unnamed: 0'])
    df = df.set_index(['icgc_donor_id', 'gene_id'])
    # Keep the first record per (donor, gene) so each lookup yields a scalar.
    df = df[~df.index.duplicated(keep='first')]
    expression = df['normalized_expression_value']

    result = mat.copy().astype(np.float64)
    for row in tqdm(result.index):
        marked = (mat.loc[row] == 1).to_numpy()
        for col in mat.columns[marked]:
            # Assign in one .loc step. The chained form
            # ``result.loc[row][col] = ...`` may write to a temporary copy
            # and leave ``result`` untouched.
            result.loc[row, col] = expression.loc[(row, col)]
    return result
-
-
def main():
    """Build per-omics train/test CSV files from the preprocessed matrices.

    Steps:
      1. Read every top-level file in ``args.MATRICES_PATH`` as a
         tab-separated matrix and keep those with at least
         ``args.MIN_SAMPLES`` rows.
      2. For each kept matrix, fill in mutation counts and expression values
         via ``agg_mutation`` / ``agg_expression``.
      3. Merge the per-label results, randomly assign each row to train or
         test, and write labels, feature names and feature matrices to
         ``args.OUT_DIR``.

    Relies on the module-level ``args`` namespace created when the script is
    run.

    Raises:
        ValueError: If no matrix satisfies ``args.MIN_SAMPLES``, or if the
            merged mutation and expression tables differ in shape.
    """
    # Only the top-level files are needed, so take the first os.walk entry.
    # The fallback keeps a missing directory from raising StopIteration.
    _, _, file_names = next(os.walk(args.MATRICES_PATH), (None, [], []))

    omic_paths = ['symbol_mutation.tsv', 'expression_data.tsv']
    mut_data, exp_data = {}, {}

    # Sorted so the row order of the merged tables does not depend on the
    # file system's listing order.
    for file_name in sorted(file_names):
        matrix = pd.read_csv(f'{args.MATRICES_PATH}/{file_name}', index_col=0, delimiter='\t')
        if len(matrix) < args.MIN_SAMPLES:
            continue
        # The label is the file name without its first 7 and last 4
        # characters (presumably a fixed prefix and the extension — confirm).
        label = file_name[7:-4]
        print('Aggregating', label, '...')
        mut_data[label] = agg_mutation(matrix, label, omic_paths[0])
        exp_data[label] = agg_expression(matrix, label, omic_paths[1])

    if not mut_data:
        raise ValueError(
            f'No matrix in {args.MATRICES_PATH!r} has at least {args.MIN_SAMPLES} samples'
        )

    print('Merging dataframes', end=' ')
    merged_mut = aggregate_dataframes(mut_data)
    merged_exp = aggregate_dataframes(exp_data)
    print('Done')

    # Explicit check instead of ``assert`` so it also runs under ``python -O``.
    if merged_mut.shape != merged_exp.shape:
        raise ValueError(
            f'Mutation and expression tables differ in shape: '
            f'{merged_mut.shape} vs {merged_exp.shape}'
        )

    # Separate the target from the features. Leaving the ``label`` column in
    # the matrices would write the class into the feature files and list
    # "label" as a feature name.
    labels = merged_mut['label']
    mut_features = merged_mut.drop(columns='label')
    exp_features = merged_exp.drop(columns='label')

    # Each row is assigned independently, so the realised train fraction is
    # only approximately TRAIN_SPLIT and varies between runs (no seed is set).
    train_indices = np.random.choice(
        [True, False], len(labels), p=[args.TRAIN_SPLIT, 1 - args.TRAIN_SPLIT]
    )

    os.makedirs(args.OUT_DIR, exist_ok=True)

    def write_csv(table, name):
        table.to_csv(f'{args.OUT_DIR}/{name}', index=False, header=False)

    write_csv(labels[train_indices], 'labels_tr.csv')
    write_csv(labels[~train_indices], 'labels_te.csv')
    write_csv(mut_features.columns.to_frame(), '1_featname.csv')
    write_csv(exp_features.columns.to_frame(), '2_featname.csv')
    write_csv(mut_features[train_indices], '1_tr.csv')
    write_csv(mut_features[~train_indices], '1_te.csv')
    write_csv(exp_features[train_indices], '2_tr.csv')
    write_csv(exp_features[~train_indices], '2_te.csv')

    print('num classes=', len(mut_data))
    # run_mogonet(num_class=len(mut_data.keys()))
-
-
if __name__ == '__main__':
    # Command-line entry point. The parsed namespace is bound to the
    # module-level name ``args`` because the functions above read it directly.
    parser = argparse.ArgumentParser()

    # Defaults come from the module-level constants so the two cannot drift.
    parser.add_argument('--data-dir', dest='DATA_DIR', type=str,
                        default=DATA_DIR, help='Path to data')
    parser.add_argument('--matrices-path', dest='MATRICES_PATH', type=str,
                        default=MATRICES_PATH, help='Path to preprocessed matrices')
    parser.add_argument('--out-dir', dest='OUT_DIR', type=str, default=OUT_DIR,
                        help='The output directory')
    parser.add_argument('--min-samples', dest='MIN_SAMPLES', type=int, default=MIN_SAMPLES,
                        help='Minimum samples required to include a cancer type')
    parser.add_argument('--train-split', dest='TRAIN_SPLIT', type=float, default=TRAIN_SPLIT,
                        help='train/test split. By default set to 0.70')

    args = parser.parse_args()
    # The split is used as a probability, so reject values outside [0, 1]
    # here with a usage error instead of failing later inside numpy.
    if not 0 <= args.TRAIN_SPLIT <= 1:
        parser.error('--train-split must be between 0 and 1')
    main()
|