|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
|
|
|
import argparse |
|
|
|
|
|
|
|
|
|
|
|
import numpy as np |
|
|
|
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
|
|
|
tqdm.pandas() |
|
|
|
|
|
|
|
|
|
|
|
DATA_DIR = './data' |
|
|
|
|
|
OUTPUT_DIR = './output' |
|
|
|
|
|
|
|
|
|
|
|
cancers_with_ILMN = ['Pancreas'] # gene_id: ILMN_ |
|
|
|
|
|
cancers_with_NM = ['Nervous System'] # gene_id: NM_/NR/_ |
|
|
|
|
|
cancers_with_ENS_version = ['Blood'] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_data(folder: str, file_path: str): |
|
|
|
|
|
if file_path.endswith('.tsv'): |
|
|
|
|
|
return pd.read_csv(f'{DATA_DIR}/{folder}/{file_path}', sep='\t', header=0, low_memory=False) |
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_data_csv(folder: str, file_path: str): |
|
|
|
|
|
return pd.read_csv(f'{DATA_DIR}/{folder}/{file_path}', header=0, low_memory=False) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_chunk_by_chunk(folder: str, file_path: str, columns=None): |
|
|
|
|
|
df = pd.DataFrame() |
|
|
|
|
|
for chunk in pd.read_csv(f"{DATA_DIR}/{folder}/{file_path}", sep='\t', header=0, low_memory=False, chunksize=1e6): |
|
|
|
|
|
df = pd.concat([df, chunk[columns] if columns else chunk], ignore_index=True) |
|
|
|
|
|
return df |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_tsv(df, output_path, file_path): |
|
|
|
|
|
if not os.path.exists(output_path): |
|
|
|
|
|
os.makedirs(output_path) |
|
|
|
|
|
df.to_csv(f'{output_path}/{file_path}', sep='\t') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_mutation_data(cancer_type, mutation_path, mutation_type=None): |
|
|
|
|
|
columns = ['icgc_donor_id', 'gene_affected', 'mutation_type'] |
|
|
|
|
|
data = read_chunk_by_chunk(cancer_type, mutation_path, columns) |
|
|
|
|
|
if mutation_type: |
|
|
|
|
|
data = data[data['mutation_type'] == mutation_type] \ |
|
|
|
|
|
.drop(columns=['mutation_type']) |
|
|
|
|
|
return data.dropna() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_expression_data(cancer_type, expression_path): |
|
|
|
|
|
columns = ['icgc_donor_id', 'gene_id'] |
|
|
|
|
|
data = read_chunk_by_chunk(cancer_type, expression_path, columns) |
|
|
|
|
|
return data.dropna() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_genes(genes_path, gene_class=None): |
|
|
|
|
|
genes = pd.read_csv(genes_path, sep='\t', header=0) |
|
|
|
|
|
genes.gene_symbol = list(map(lambda g: g[1:-1], genes.gene_symbol)) |
|
|
|
|
|
if gene_class: |
|
|
|
|
|
genes = genes[genes['gene_class'] == gene_class] |
|
|
|
|
|
genes = genes[['gene_name', 'gene_symbol']] \ |
|
|
|
|
|
.rename({'gene_name': 'gene_ensembl_id'}, axis=1) |
|
|
|
|
|
return genes |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def store_summary(df, output_path, file_name): |
|
|
|
|
|
if not os.path.exists(output_path): |
|
|
|
|
|
os.makedirs(output_path) |
|
|
|
|
|
save_tsv(df, OUTPUT_DIR, f'result-{file_name}.tsv') |
|
|
|
|
|
print(f'>>> Summary for {file_name} (considering mutation and expression):') |
|
|
|
|
|
print('\tDonors in Common:', df.shape[0]) |
|
|
|
|
|
print('\tGenes in Common:', df.shape[1]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def perform_analysis(args, cancer_type): |
|
|
|
|
|
genes = get_genes(args.genes_path) |
|
|
|
|
|
|
|
|
|
|
|
### Mutation |
|
|
|
|
|
mut = get_mutation_data(cancer_type, args.mutation_path, mutation_type='single base substitution') |
|
|
|
|
|
mut_data = mut.rename({'gene_affected': 'gene_ensembl_id'}, axis=1) |
|
|
|
|
|
sign_mut_samples = pd.merge(genes, mut_data, how='left', on='gene_ensembl_id') \ |
|
|
|
|
|
.drop(columns=['gene_ensembl_id']) \ |
|
|
|
|
|
.drop_duplicates() \ |
|
|
|
|
|
.dropna() |
|
|
|
|
|
|
|
|
|
|
|
### Expression |
|
|
|
|
|
expr = get_expression_data(cancer_type, args.expression_path) |
|
|
|
|
|
|
|
|
|
|
|
#### Before this part the R script needs to have been run to convert Illumina probe to gene |
|
|
|
|
|
if cancer_type in cancers_with_ILMN: |
|
|
|
|
|
ILMN_genes = pd.read_csv(f'./{DATA_DIR}/{cancer_type}/converted_genes.csv')['Gene'] |
|
|
|
|
|
expr['gene_symbol'] = ILMN_genes |
|
|
|
|
|
expr = expr.dropna() |
|
|
|
|
|
if cancer_type in cancers_with_ENS_version + cancers_with_NM: |
|
|
|
|
|
converted_genes = pd.read_csv(f'./{DATA_DIR}/{cancer_type}/converted_genes.csv') |
|
|
|
|
|
expr = pd.merge(expr, converted_genes, how='left', left_on="gene_id", right_on='initial_id') |
|
|
|
|
|
expr = expr.rename({'Gene': 'gene_symbol'}, axis=1) |
|
|
|
|
|
expr = expr.dropna() |
|
|
|
|
|
else: |
|
|
|
|
|
expr = expr.rename({'gene_id': 'gene_symbol'}, axis=1) |
|
|
|
|
|
|
|
|
|
|
|
## Merge datasets |
|
|
|
|
|
### Find intersection |
|
|
|
|
|
cols = ['icgc_donor_id', 'gene_symbol'] |
|
|
|
|
|
mut_data = sign_mut_samples[cols] |
|
|
|
|
|
expr_data = expr[cols] |
|
|
|
|
|
|
|
|
|
|
|
common_donors = np.intersect1d(mut_data[['icgc_donor_id']], expr_data[['icgc_donor_id']]) |
|
|
|
|
|
print('Initial common donors:', common_donors.shape) |
|
|
|
|
|
common_genes = np.intersect1d(mut_data[['gene_symbol']], expr_data[['gene_symbol']]) |
|
|
|
|
|
print('Initial common genes:', common_genes.shape) |
|
|
|
|
|
|
|
|
|
|
|
# Narrow down both datasets |
|
|
|
|
|
final_mut = pd.merge(pd.Series(common_genes, name='gene_symbol'), mut_data, how='left', on='gene_symbol') |
|
|
|
|
|
final_mut = pd.merge(pd.Series(common_donors, name='icgc_donor_id'), final_mut, how='left', on='icgc_donor_id') |
|
|
|
|
|
final_expr = pd.merge(pd.Series(common_genes, name='gene_symbol'), expr_data, how='left', on='gene_symbol') |
|
|
|
|
|
final_expr = pd.merge(pd.Series(common_donors, name='icgc_donor_id'), final_expr, how='left', on='icgc_donor_id') |
|
|
|
|
|
|
|
|
|
|
|
updated_common_genes = np.intersect1d(final_expr.gene_symbol.unique(), final_mut.gene_symbol.unique()) |
|
|
|
|
|
updated_common_donor = np.intersect1d(final_expr.icgc_donor_id.unique(), final_mut.icgc_donor_id.unique()) |
|
|
|
|
|
final_mut = pd.merge(pd.Series(updated_common_genes, name='gene_symbol'), final_mut, how='left', on='gene_symbol') |
|
|
|
|
|
final_mut = pd.merge(pd.Series(updated_common_donor, name='icgc_donor_id'), final_mut, how='left', |
|
|
|
|
|
on='icgc_donor_id') |
|
|
|
|
|
final_expr = pd.merge(pd.Series(updated_common_genes, name='gene_symbol'), final_expr, how='left', on='gene_symbol') |
|
|
|
|
|
final_expr = pd.merge(pd.Series(updated_common_donor, name='icgc_donor_id'), final_expr, how='left', |
|
|
|
|
|
on='icgc_donor_id') |
|
|
|
|
|
|
|
|
|
|
|
final_mut.sort_values(by='gene_symbol', inplace=True) |
|
|
|
|
|
final_expr.sort_values(by='gene_symbol', inplace=True) |
|
|
|
|
|
sorted_common_genes = np.sort(updated_common_genes) |
|
|
|
|
|
|
|
|
|
|
|
### Matrix generation |
|
|
|
|
|
result_mut = pd.DataFrame(index=updated_common_donor, columns=updated_common_genes).fillna(0) |
|
|
|
|
|
for idx, row in tqdm(final_mut.drop_duplicates().groupby('icgc_donor_id')['gene_symbol'].apply(list).iteritems()): |
|
|
|
|
|
result_mut.loc[idx, row] = 1 |
|
|
|
|
|
|
|
|
|
|
|
result_expr = pd.DataFrame(index=updated_common_donor, columns=updated_common_genes).fillna(0) |
|
|
|
|
|
for idx, row in tqdm(final_expr.drop_duplicates().groupby('icgc_donor_id')['gene_symbol'].apply(list).iteritems()): |
|
|
|
|
|
result_expr.loc[idx, row] = 1 |
|
|
|
|
|
|
|
|
|
|
|
result_values = result_expr.values * result_mut.values |
|
|
|
|
|
result = pd.DataFrame(data=result_values, index=updated_common_donor, columns=updated_common_genes) |
|
|
|
|
|
|
|
|
|
|
|
#### Store results |
|
|
|
|
|
store_summary(result, OUTPUT_DIR, f'result-{cancer_type}.csv') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run(args): |
|
|
|
|
|
if not args.cancer_type: |
|
|
|
|
|
if args.run_all: |
|
|
|
|
|
sub_folders = [f.name for f in os.scandir(args.data_path) if f.is_dir()] |
|
|
|
|
|
for cancer_type in sub_folders: |
|
|
|
|
|
perform_analysis(args, cancer_type) |
|
|
|
|
|
else: |
|
|
|
|
|
raise Exception('Either set --cancer-type or set run_all to True') |
|
|
|
|
|
if not os.path.exists(f'{args.data_path}/{args.cancer_type}'): |
|
|
|
|
|
raise Exception('arg --cancer-type is not a valid directory') |
|
|
|
|
|
perform_analysis(args, args.cancer_type) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__': |
|
|
|
|
|
parser = argparse.ArgumentParser() |
|
|
|
|
|
|
|
|
|
|
|
parser.add_argument('--cancer-type', type=str, default='Test') |
|
|
|
|
|
parser.add_argument('--run-all', type=bool, default=False) |
|
|
|
|
|
|
|
|
|
|
|
parser.add_argument('--data-path', type=str, default='./data') |
|
|
|
|
|
parser.add_argument('--genes-path', type=str, default='./data/genes_list.tsv') |
|
|
|
|
|
|
|
|
|
|
|
parser.add_argument('--expression-path', type=str, default='exp_array.tsv') |
|
|
|
|
|
parser.add_argument('--mutation-path', type=str, default='simple_somatic_mutation.open.tsv') |
|
|
|
|
|
parser.add_argument('--output-path', type=str, default='./output') |
|
|
|
|
|
|
|
|
|
|
|
run(parser.parse_args()) |