Browse Source

Add code for the 1st stage

master
AlirezaT99 1 year ago
parent
commit
a71923b550

+ 3
- 0
1-Preprocessing/README.md View File

@@ -0,0 +1,3 @@
### Preprocess data

TODO

+ 23
- 0
1-Preprocessing/expressionSummarizer.r View File

@@ -0,0 +1,23 @@
# Data folders (one per cancer type) that the driver loop below processes.
cancer_types <- c(
  'Brain',
  'Breast',
  'Colorectal',
  'Lung',
  'Nervous System',
  'Pancreas',
  'Uterus'
)

# Folders whose gene ids are remapped to gene symbols in prepare_expression().
# NOTE(review): 'Blood' is listed here but not in cancer_types - confirm.
trouble_maker <- c(
  'Blood',
  'Pancreas'
)

# Build the expression table of one data folder and store it as
# '<data_folder>/expression_data.tsv'.
#
# Reads '<data_folder>/exp_array.tsv' and keeps the donor id, gene id and
# normalized expression value columns. For folders listed in `trouble_maker`
# the gene ids are additionally mapped to gene symbols through
# '<data_folder>/<data_folder>_gene_mapper.csv'; rows without a mapping are
# dropped and the symbol column is stored under the name 'gene_id'.
#
# Args:
#   data_folder: folder name, relative to the current working directory.
prepare_expression <- function(data_folder) {
  exp_path <- sprintf('%s/exp_array.tsv', data_folder)
  exp_array <- read.csv(exp_path, sep = '\t')[c('icgc_donor_id', 'gene_id', 'normalized_expression_value')]

  if (data_folder %in% trouble_maker) {
    mapper_path <- sprintf("%s/%s_gene_mapper.csv", data_folder, data_folder)
    mapper <- read.csv(mapper_path)[c('gene_id', 'gene_symbol')]

    # Drop the first ".<digits>" suffix so the ids match the mapper's keys.
    exp_array$gene_id <- sub("[.][0-9]*", "", exp_array$gene_id)

    mapped <- merge(x = exp_array, y = mapper, by = 'gene_id', all.x = TRUE)
    mapped <- na.omit(mapped)
    exp_array <- mapped[c('icgc_donor_id', 'gene_symbol', 'normalized_expression_value')]
    colnames(exp_array)[colnames(exp_array) == 'gene_symbol'] <- 'gene_id'
  }

  # write.csv() ignores `sep` (with a warning) and always writes commas, so the
  # ".tsv" file was comma-separated. write.table() honours the tab separator;
  # col.names = NA and qmethod = "double" keep the layout write.csv() produced.
  write.table(
    x = exp_array,
    file = sprintf("%s/expression_data.tsv", data_folder),
    sep = "\t",
    col.names = NA,
    qmethod = "double"
  )
}

# Every data folder name used below is resolved relative to this directory.
setwd("../../1_PreprocessData/data")

# Handle the cancer types one after another, announcing each on the console.
for (folder_name in cancer_types) {
  progress_message <- sprintf("Working on %s", folder_name)
  print(progress_message)
  prepare_expression(data_folder = folder_name)
}

+ 47
- 0
1-Preprocessing/geneIdConverter.r View File

@@ -0,0 +1,47 @@
#### Install and library ####

# install.packages("biotools")

# Install every dependency that is actually missing. Previously the
# Bioconductor packages were only installed when BiocManager itself was absent,
# so the library() calls below failed on machines that already had BiocManager
# but not the annotation packages.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

required_packages <- c("illuminaHumanv4.db", "clusterProfiler", "org.Hs.eg.db")
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    BiocManager::install(pkg)
  }
}

library("illuminaHumanv4.db")
library("clusterProfiler")
library("org.Hs.eg.db")

#### read genes ####
exp_path <- '../../1_PreprocessData/data/Blood/exp_array.tsv'
genes <- read.csv(file = exp_path, sep = '\t')[['gene_id']]

#### Preprocess the genes ####

# Default conversion direction; `src` is switched below for RefSeq ids.
src <- "ENSEMBL"
dst <- "SYMBOL"

# The id family is decided from the first three characters of the first id only.
id_prefix <- substr(genes[1], 1, 3)
if (id_prefix == "ENS") { # ENSEMBL + version
  # Remove the first ".<digits>" part of every id.
  genes <- sub("[.][0-9]*", "", genes)
} else if (id_prefix %in% c("NM_", "NR_")) { # RefSeq
  src <- "REFSEQ"
}

#### Convert probe Id to gene symbol ####

# Map the ids in `genes` from the `src` id type to the `dst` id type.
id_mapping <- bitr(genes, fromType = src, toType = dst, OrgDb = org.Hs.eg.db, drop = TRUE)
df <- data.frame(id_mapping)
names(df) <- c('initial_id', 'Gene')

# For Illumina (Such as Pancreas)
# df <- data.frame(Gene=unlist(mget(x = genes, envir = illuminaHumanv4SYMBOL)))

# Store the mapping next to the script.
write.csv(x = df, file = './converted_genes.csv')

+ 98
- 0
1-Preprocessing/prepare_mutation.py View File

@@ -0,0 +1,98 @@
import os
import argparse

import numpy as np
import pandas as pd

from tqdm import tqdm

tqdm.pandas()

# Root folder of the input data; the read helpers below build
# '<DATA_DIR>/<folder>/<file>' paths from it.
DATA_DIR = './data'


def read_data(folder: str, file_path: str):
    """Load a tab-separated table stored under ``DATA_DIR/folder``.

    Args:
        folder: Sub-folder of ``DATA_DIR`` that contains the file.
        file_path: File name inside *folder*.

    Returns:
        The parsed DataFrame, or ``None`` when *file_path* does not end
        with ``.tsv``.
    """
    if not file_path.endswith('.tsv'):
        return None
    source = f'{DATA_DIR}/{folder}/{file_path}'
    return pd.read_csv(source, sep='\t', header=0, low_memory=False)


def read_data_csv(folder: str, file_path: str):
    """Load a comma-separated table stored under ``DATA_DIR/folder``.

    Args:
        folder: Sub-folder of ``DATA_DIR`` that contains the file.
        file_path: File name inside *folder*.

    Returns:
        The parsed DataFrame; the first line of the file is the header.
    """
    source = f'{DATA_DIR}/{folder}/{file_path}'
    table = pd.read_csv(source, header=0, low_memory=False)
    return table


def read_chunk_by_chunk(folder: str, file_path: str, columns=None):
    """Read a tab-separated file from ``DATA_DIR/folder`` in chunks.

    Args:
        folder: Sub-folder of ``DATA_DIR`` that contains the file.
        file_path: File name inside *folder*.
        columns: Optional list of column names to keep from every chunk;
            all columns are kept when it is ``None`` or empty.

    Returns:
        A single DataFrame with a fresh 0..n-1 index holding the (optionally
        column-filtered) rows of all chunks; an empty DataFrame when the
        reader yields no chunk.
    """
    reader = pd.read_csv(
        f"{DATA_DIR}/{folder}/{file_path}",
        sep='\t',
        header=0,
        low_memory=False,
        chunksize=1_000_000,
    )
    # Collect the pieces and concatenate once: concatenating inside the loop
    # re-copies every previously read row for each new chunk.
    pieces = [chunk[columns] if columns else chunk for chunk in reader]
    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces, ignore_index=True)


def save_tsv(df, output_path, file_path):
    """Write *df* as a tab-separated file to ``output_path/file_path``.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame in this script).
        output_path: Target directory; created, including parents, if missing.
        file_path: File name inside *output_path*.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_path, exist_ok=True)
    df.to_csv(f'{output_path}/{file_path}', sep='\t')


def get_mutation_data(cancer_type, mutation_path, mutation_type=None):
    """Load donor/gene mutation records of one cancer type.

    Args:
        cancer_type: Data sub-folder to read from.
        mutation_path: Name of the mutation file inside that folder.
        mutation_type: When given, only rows with exactly this value in the
            ``mutation_type`` column are kept and the column is dropped.

    Returns:
        DataFrame without rows containing missing values.
    """
    wanted = ['icgc_donor_id', 'gene_affected', 'mutation_type']
    records = read_chunk_by_chunk(cancer_type, mutation_path, wanted)
    if not mutation_type:
        return records.dropna()
    matching = records['mutation_type'] == mutation_type
    return records[matching].drop(columns=['mutation_type']).dropna()


def get_genes(genes_path, gene_class=None):
    """Load the gene list and return its Ensembl-id/symbol pairs.

    Args:
        genes_path: Path of a tab-separated file with at least the columns
            ``gene_name`` and ``gene_symbol`` (and ``gene_class`` when
            *gene_class* is used).
        gene_class: When given, only rows whose ``gene_class`` equals this
            value are kept.

    Returns:
        DataFrame with the columns ``gene_ensembl_id`` (the file's
        ``gene_name``) and ``gene_symbol``.
    """
    genes = pd.read_csv(genes_path, sep='\t', header=0)
    # Drop the first and last character of every symbol. The vectorized
    # accessor leaves missing symbols as NaN instead of raising TypeError as
    # the previous per-element slicing did.
    genes['gene_symbol'] = genes['gene_symbol'].str[1:-1]
    if gene_class:
        genes = genes[genes['gene_class'] == gene_class]
    return genes[['gene_name', 'gene_symbol']].rename({'gene_name': 'gene_ensembl_id'}, axis=1)


def perform_analysis(args, cancer_type):
    """Write the donor/gene-symbol mutation table of one cancer type.

    Single-base-substitution records are joined with the gene list on the
    Ensembl id; the unique, complete rows are stored as
    ``DATA_DIR/<cancer_type>/symbol_mutation.tsv``.

    Args:
        args: Parsed command-line arguments; ``genes_path`` and
            ``mutation_path`` are read.
        cancer_type: Data sub-folder to process.
    """
    gene_table = get_genes(args.genes_path)
    print('Converting', cancer_type, end='...')

    ### Mutation
    substitutions = get_mutation_data(cancer_type, args.mutation_path, mutation_type='single base substitution')
    substitutions = substitutions.rename({'gene_affected': 'gene_ensembl_id'}, axis=1)

    annotated = pd.merge(gene_table, substitutions, how='left', on='gene_ensembl_id')
    annotated = annotated.drop(columns=['gene_ensembl_id'])
    annotated = annotated.drop_duplicates()
    annotated = annotated.dropna()

    target = f'{DATA_DIR}/{cancer_type}/symbol_mutation.tsv'
    annotated.to_csv(target, sep='\t')
    print('done')


def run(args):
    """Process one cancer type, or every sub-folder of the data path.

    Args:
        args: Parsed command-line arguments; ``cancer_type``, ``run_all`` and
            ``data_path`` are read here.

    Raises:
        ValueError: When neither a cancer type nor ``run_all`` is given, or
            when the cancer type is not an existing entry of ``data_path``.
    """
    if not args.cancer_type:
        if not args.run_all:
            raise ValueError('Either set --cancer-type or set run_all to True')
        with os.scandir(args.data_path) as entries:
            sub_folders = [entry.name for entry in entries if entry.is_dir()]
        for cancer_type in sub_folders:
            perform_analysis(args, cancer_type)
        # Done: without this return the code fell through to the single
        # cancer-type check below and raised after processing every folder.
        return

    if not os.path.exists(f'{args.data_path}/{args.cancer_type}'):
        raise ValueError('arg --cancer-type is not a valid directory')
    perform_analysis(args, args.cancer_type)


def _str_to_bool(value):
    """Parse a command-line boolean such as ``true``/``false``/``1``/``0``."""
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('--cancer-type', type=str)  # , default='Test'
    # type=bool turned every non-empty string (including "False") into True.
    # Accept a bare "--run-all" as well as an explicit "--run-all true/false".
    parser.add_argument('--run-all', type=_str_to_bool, nargs='?', const=True, default=False)

    parser.add_argument('--data-path', type=str, default='./data')
    parser.add_argument('--genes-path', type=str, default='./data/genes_list.tsv')

    parser.add_argument('--expression-path', type=str, default='exp_array.tsv')
    parser.add_argument('--mutation-path', type=str, default='simple_somatic_mutation.open.tsv')
    parser.add_argument('--output-path', type=str, default='./output')

    run(parser.parse_args())

+ 175
- 0
1-Preprocessing/preprocessing.py View File

@@ -0,0 +1,175 @@
import os
import argparse

import numpy as np
import pandas as pd

from tqdm import tqdm

tqdm.pandas()

# Root folder of the input data; the read helpers below build
# '<DATA_DIR>/<folder>/<file>' paths from it.
DATA_DIR = './data'
# Folder the result tables are written to.
OUTPUT_DIR = './output'

# Cancer types whose expression gene ids are converted to gene symbols through
# a converted_genes.csv file (see perform_analysis).
cancers_with_ILMN = ['Pancreas']  # gene_id: ILMN_
cancers_with_NM = ['Nervous System']  # gene_id: NM_/NR_
cancers_with_ENS_version = ['Blood']  # presumably versioned Ensembl ids - confirm


def read_data(folder: str, file_path: str):
    """Read ``DATA_DIR/folder/file_path`` as a tab-separated table.

    Args:
        folder: Sub-folder of ``DATA_DIR`` holding the file.
        file_path: Name of the file within *folder*.

    Returns:
        The DataFrame read from the file, or ``None`` if the name does not
        end with ``.tsv``.
    """
    is_tsv = file_path.endswith('.tsv')
    if not is_tsv:
        return None
    location = f'{DATA_DIR}/{folder}/{file_path}'
    return pd.read_csv(location, sep='\t', header=0, low_memory=False)


def read_data_csv(folder: str, file_path: str):
    """Read ``DATA_DIR/folder/file_path`` as a comma-separated table.

    Args:
        folder: Sub-folder of ``DATA_DIR`` holding the file.
        file_path: Name of the file within *folder*.

    Returns:
        The DataFrame read from the file, using its first line as header.
    """
    location = f'{DATA_DIR}/{folder}/{file_path}'
    frame = pd.read_csv(location, header=0, low_memory=False)
    return frame


def read_chunk_by_chunk(folder: str, file_path: str, columns=None):
    """Read a tab-separated file from ``DATA_DIR/folder`` in chunks.

    Args:
        folder: Sub-folder of ``DATA_DIR`` that contains the file.
        file_path: File name inside *folder*.
        columns: Optional list of column names to keep from every chunk;
            all columns are kept when it is ``None`` or empty.

    Returns:
        A single DataFrame with a fresh 0..n-1 index holding the (optionally
        column-filtered) rows of all chunks; an empty DataFrame when the
        reader yields no chunk.
    """
    reader = pd.read_csv(
        f"{DATA_DIR}/{folder}/{file_path}",
        sep='\t',
        header=0,
        low_memory=False,
        chunksize=1_000_000,
    )
    # Collect the pieces and concatenate once: concatenating inside the loop
    # re-copies every previously read row for each new chunk.
    pieces = [chunk[columns] if columns else chunk for chunk in reader]
    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces, ignore_index=True)


def save_tsv(df, output_path, file_path):
    """Write *df* as a tab-separated file to ``output_path/file_path``.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame in this script).
        output_path: Target directory; created, including parents, if missing.
        file_path: File name inside *output_path*.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_path, exist_ok=True)
    df.to_csv(f'{output_path}/{file_path}', sep='\t')


def get_mutation_data(cancer_type, mutation_path, mutation_type=None):
    """Return the donor/gene mutation rows of one cancer type.

    Args:
        cancer_type: Data sub-folder to read from.
        mutation_path: Name of the mutation file inside that folder.
        mutation_type: Optional value of the ``mutation_type`` column to
            filter on; when used, the column is removed from the result.

    Returns:
        DataFrame from which rows with missing values have been removed.
    """
    selected = ['icgc_donor_id', 'gene_affected', 'mutation_type']
    table = read_chunk_by_chunk(cancer_type, mutation_path, selected)
    if mutation_type:
        of_requested_type = table['mutation_type'] == mutation_type
        table = table[of_requested_type]
        table = table.drop(columns=['mutation_type'])
    return table.dropna()


def get_expression_data(cancer_type, expression_path):
    """Return the donor/gene pairs found in an expression file.

    Args:
        cancer_type: Data sub-folder to read from.
        expression_path: Name of the expression file inside that folder.

    Returns:
        DataFrame with the columns ``icgc_donor_id`` and ``gene_id``, without
        rows containing missing values.
    """
    pair_columns = ['icgc_donor_id', 'gene_id']
    pairs = read_chunk_by_chunk(cancer_type, expression_path, pair_columns)
    complete_pairs = pairs.dropna()
    return complete_pairs


def get_genes(genes_path, gene_class=None):
    """Load the gene list and return its Ensembl-id/symbol pairs.

    Args:
        genes_path: Path of a tab-separated file with at least the columns
            ``gene_name`` and ``gene_symbol`` (and ``gene_class`` when
            *gene_class* is used).
        gene_class: When given, only rows whose ``gene_class`` equals this
            value are kept.

    Returns:
        DataFrame with the columns ``gene_ensembl_id`` (the file's
        ``gene_name``) and ``gene_symbol``.
    """
    genes = pd.read_csv(genes_path, sep='\t', header=0)
    # Drop the first and last character of every symbol. The vectorized
    # accessor leaves missing symbols as NaN instead of raising TypeError as
    # the previous per-element slicing did.
    genes['gene_symbol'] = genes['gene_symbol'].str[1:-1]
    if gene_class:
        genes = genes[genes['gene_class'] == gene_class]
    return genes[['gene_name', 'gene_symbol']].rename({'gene_name': 'gene_ensembl_id'}, axis=1)


def store_summary(df, output_path, file_name):
    """Save the result matrix and print its dimensions.

    The table is written to ``<output_path>/result-<file_name>.tsv``.

    Args:
        df: Result DataFrame; rows are reported as donors, columns as genes.
        output_path: Directory to write to (created by ``save_tsv`` if
            missing).
        file_name: Label used in the output file name and in the printed
            summary.
    """
    # Write to the directory the caller asked for; previously the directory
    # was created but the file always went to OUTPUT_DIR.
    save_tsv(df, output_path, f'result-{file_name}.tsv')
    print(f'>>> Summary for {file_name} (considering mutation and expression):')
    print('\tDonors in Common:', df.shape[0])
    print('\tGenes in Common:', df.shape[1])


def _attach_gene_symbols(expr, cancer_type):
    """Return *expr* with a ``gene_symbol`` column for the given cancer type."""
    #### Before this part the R script needs to have been run to convert Illumina probe to gene
    if cancer_type in cancers_with_ILMN:
        ilmn_symbols = pd.read_csv(f'./{DATA_DIR}/{cancer_type}/converted_genes.csv')['Gene']
        # Index-aligned assignment: row i of the converted file belongs to
        # row i of the expression file.
        return expr.assign(gene_symbol=ilmn_symbols).dropna()

    if cancer_type in cancers_with_ENS_version + cancers_with_NM:
        converted_genes = pd.read_csv(f'./{DATA_DIR}/{cancer_type}/converted_genes.csv')
        if cancer_type in cancers_with_ENS_version:
            # geneIdConverter.r strips the first ".<digits>" part before
            # converting, so 'initial_id' is version-less; apply the same
            # transformation here or no id would match in the merge.
            stripped = expr['gene_id'].str.replace(r'[.][0-9]*', '', n=1, regex=True)
            expr = expr.assign(gene_id=stripped)
        expr = pd.merge(expr, converted_genes, how='left', left_on="gene_id", right_on='initial_id')
        return expr.rename({'Gene': 'gene_symbol'}, axis=1).dropna()

    # The gene ids are used as symbols unchanged.
    return expr.rename({'gene_id': 'gene_symbol'}, axis=1)


def _restrict_to_shared(mut_pairs, expr_pairs):
    """Keep only rows whose donor and gene occur in both tables."""
    shared_donors = np.intersect1d(mut_pairs['icgc_donor_id'], expr_pairs['icgc_donor_id'])
    shared_genes = np.intersect1d(mut_pairs['gene_symbol'], expr_pairs['gene_symbol'])

    def narrow(pairs):
        keep = pairs['gene_symbol'].isin(shared_genes) & pairs['icgc_donor_id'].isin(shared_donors)
        return pairs[keep]

    return narrow(mut_pairs), narrow(expr_pairs), shared_donors, shared_genes


def _presence_matrix(pairs, donors, genes):
    """Build a donors x genes 0/1 matrix marking the pairs present in *pairs*."""
    matrix = pd.DataFrame(0, index=donors, columns=genes)
    genes_per_donor = pairs.drop_duplicates().groupby('icgc_donor_id')['gene_symbol'].apply(list)
    # Series.iteritems() was removed in pandas 2.0; items() is the replacement.
    for donor, symbols in tqdm(genes_per_donor.items()):
        matrix.loc[donor, symbols] = 1
    return matrix


def perform_analysis(args, cancer_type):
    """Build and store the donor x gene matrix of one cancer type.

    A cell is 1 when the donor has a single-base-substitution record for the
    gene and the gene also appears in the donor's expression data, else 0.
    Only donors and genes present in both data sets are included.

    Args:
        args: Parsed command-line arguments; ``genes_path``,
            ``mutation_path`` and ``expression_path`` are read.
        cancer_type: Data sub-folder to process.
    """
    genes = get_genes(args.genes_path)

    ### Mutation
    mut = get_mutation_data(cancer_type, args.mutation_path, mutation_type='single base substitution')
    mut_data = mut.rename({'gene_affected': 'gene_ensembl_id'}, axis=1)
    sign_mut_samples = pd.merge(genes, mut_data, how='left', on='gene_ensembl_id') \
        .drop(columns=['gene_ensembl_id']) \
        .drop_duplicates() \
        .dropna()

    ### Expression
    # The conversion branches are mutually exclusive. Previously an Illumina
    # cancer type also ran the plain rename branch and ended up with two
    # 'gene_symbol' columns.
    expr = get_expression_data(cancer_type, args.expression_path)
    expr = _attach_gene_symbols(expr, cancer_type)

    ## Merge datasets
    ### Find intersection
    cols = ['icgc_donor_id', 'gene_symbol']
    mut_data = sign_mut_samples[cols]
    expr_data = expr[cols]

    # Narrow down both datasets. Filtering with isin() (instead of left merges)
    # cannot introduce NaN rows for donors left without any shared gene, which
    # made the second intersection fail on mixed str/float values.
    final_mut, final_expr, common_donors, common_genes = _restrict_to_shared(mut_data, expr_data)
    print('Initial common donors:', common_donors.shape)
    print('Initial common genes:', common_genes.shape)

    # Second pass: the first filter can remove a donor's or gene's last row in
    # one of the tables, so the shared sets are recomputed once more.
    final_mut, final_expr, updated_common_donor, updated_common_genes = _restrict_to_shared(final_mut, final_expr)

    ### Matrix generation
    result_mut = _presence_matrix(final_mut, updated_common_donor, updated_common_genes)
    result_expr = _presence_matrix(final_expr, updated_common_donor, updated_common_genes)

    # Both matrices share the same labels, so this is an element-wise AND.
    result = result_expr * result_mut

    #### Store results
    # NOTE(review): store_summary adds its own 'result-' prefix and '.tsv'
    # suffix, so the file becomes 'result-result-<cancer>.csv.tsv'. Left
    # unchanged because later stages may read that name - confirm and simplify.
    store_summary(result, OUTPUT_DIR, f'result-{cancer_type}.csv')


def run(args):
    """Process one cancer type, or every sub-folder of the data path.

    Args:
        args: Parsed command-line arguments; ``cancer_type``, ``run_all`` and
            ``data_path`` are read here.

    Raises:
        ValueError: When neither a cancer type nor ``run_all`` is given, or
            when the cancer type is not an existing entry of ``data_path``.
    """
    if not args.cancer_type:
        if not args.run_all:
            raise ValueError('Either set --cancer-type or set run_all to True')
        with os.scandir(args.data_path) as entries:
            sub_folders = [entry.name for entry in entries if entry.is_dir()]
        for cancer_type in sub_folders:
            perform_analysis(args, cancer_type)
        # Done: without this return the code fell through to the single
        # cancer-type check below and raised after processing every folder.
        return

    if not os.path.exists(f'{args.data_path}/{args.cancer_type}'):
        raise ValueError('arg --cancer-type is not a valid directory')
    perform_analysis(args, args.cancer_type)


def _str_to_bool(value):
    """Parse a command-line boolean such as ``true``/``false``/``1``/``0``."""
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # NOTE(review): with this non-empty default, run() only reaches the
    # run-all branch when --cancer-type is passed as an empty string - confirm.
    parser.add_argument('--cancer-type', type=str, default='Test')
    # type=bool turned every non-empty string (including "False") into True.
    # Accept a bare "--run-all" as well as an explicit "--run-all true/false".
    parser.add_argument('--run-all', type=_str_to_bool, nargs='?', const=True, default=False)

    parser.add_argument('--data-path', type=str, default='./data')
    parser.add_argument('--genes-path', type=str, default='./data/genes_list.tsv')

    parser.add_argument('--expression-path', type=str, default='exp_array.tsv')
    parser.add_argument('--mutation-path', type=str, default='simple_somatic_mutation.open.tsv')
    parser.add_argument('--output-path', type=str, default='./output')

    run(parser.parse_args())

Loading…
Cancel
Save