The codes and documentation for my BSc project in the area of Cancer Genomics
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

preprocessing.py 7.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. import os
  2. import argparse
  3. import numpy as np
  4. import pandas as pd
  5. from tqdm import tqdm
  6. tqdm.pandas()
  7. DATA_DIR = './data'
  8. OUTPUT_DIR = './output'
  9. cancers_with_ILMN = ['Pancreas'] # gene_id: ILMN_
  10. cancers_with_NM = ['Nervous System'] # gene_id: NM_/NR/_
  11. cancers_with_ENS_version = ['Blood']
  12. def read_data(folder: str, file_path: str):
  13. if file_path.endswith('.tsv'):
  14. return pd.read_csv(f'{DATA_DIR}/{folder}/{file_path}', sep='\t', header=0, low_memory=False)
  15. return None
  16. def read_data_csv(folder: str, file_path: str):
  17. return pd.read_csv(f'{DATA_DIR}/{folder}/{file_path}', header=0, low_memory=False)
  18. def read_chunk_by_chunk(folder: str, file_path: str, columns=None):
  19. df = pd.DataFrame()
  20. for chunk in pd.read_csv(f"{DATA_DIR}/{folder}/{file_path}", sep='\t', header=0, low_memory=False, chunksize=1e6):
  21. df = pd.concat([df, chunk[columns] if columns else chunk], ignore_index=True)
  22. return df
  23. def save_tsv(df, output_path, file_path):
  24. if not os.path.exists(output_path):
  25. os.makedirs(output_path)
  26. df.to_csv(f'{output_path}/{file_path}', sep='\t')
  27. def get_mutation_data(cancer_type, mutation_path, mutation_type=None):
  28. columns = ['icgc_donor_id', 'gene_affected', 'mutation_type']
  29. data = read_chunk_by_chunk(cancer_type, mutation_path, columns)
  30. if mutation_type:
  31. data = data[data['mutation_type'] == mutation_type] \
  32. .drop(columns=['mutation_type'])
  33. return data.dropna()
  34. def get_expression_data(cancer_type, expression_path):
  35. columns = ['icgc_donor_id', 'gene_id']
  36. data = read_chunk_by_chunk(cancer_type, expression_path, columns)
  37. return data.dropna()
  38. def get_genes(genes_path, gene_class=None):
  39. genes = pd.read_csv(genes_path, sep='\t', header=0)
  40. genes.gene_symbol = list(map(lambda g: g[1:-1], genes.gene_symbol))
  41. if gene_class:
  42. genes = genes[genes['gene_class'] == gene_class]
  43. genes = genes[['gene_name', 'gene_symbol']] \
  44. .rename({'gene_name': 'gene_ensembl_id'}, axis=1)
  45. return genes
  46. def store_summary(df, output_path, file_name):
  47. if not os.path.exists(output_path):
  48. os.makedirs(output_path)
  49. save_tsv(df, OUTPUT_DIR, f'result-{file_name}.tsv')
  50. print(f'>>> Summary for {file_name} (considering mutation and expression):')
  51. print('\tDonors in Common:', df.shape[0])
  52. print('\tGenes in Common:', df.shape[1])
  53. def perform_analysis(args, cancer_type):
  54. genes = get_genes(args.genes_path)
  55. ### Mutation
  56. mut = get_mutation_data(cancer_type, args.mutation_path, mutation_type='single base substitution')
  57. mut_data = mut.rename({'gene_affected': 'gene_ensembl_id'}, axis=1)
  58. sign_mut_samples = pd.merge(genes, mut_data, how='left', on='gene_ensembl_id') \
  59. .drop(columns=['gene_ensembl_id']) \
  60. .drop_duplicates() \
  61. .dropna()
  62. ### Expression
  63. expr = get_expression_data(cancer_type, args.expression_path)
  64. #### Before this part the R script needs to have been run to convert Illumina probe to gene
  65. if cancer_type in cancers_with_ILMN:
  66. ILMN_genes = pd.read_csv(f'./{DATA_DIR}/{cancer_type}/converted_genes.csv')['Gene']
  67. expr['gene_symbol'] = ILMN_genes
  68. expr = expr.dropna()
  69. if cancer_type in cancers_with_ENS_version + cancers_with_NM:
  70. converted_genes = pd.read_csv(f'./{DATA_DIR}/{cancer_type}/converted_genes.csv')
  71. expr = pd.merge(expr, converted_genes, how='left', left_on="gene_id", right_on='initial_id')
  72. expr = expr.rename({'Gene': 'gene_symbol'}, axis=1)
  73. expr = expr.dropna()
  74. else:
  75. expr = expr.rename({'gene_id': 'gene_symbol'}, axis=1)
  76. ## Merge datasets
  77. ### Find intersection
  78. cols = ['icgc_donor_id', 'gene_symbol']
  79. mut_data = sign_mut_samples[cols]
  80. expr_data = expr[cols]
  81. common_donors = np.intersect1d(mut_data[['icgc_donor_id']], expr_data[['icgc_donor_id']])
  82. print('Initial common donors:', common_donors.shape)
  83. common_genes = np.intersect1d(mut_data[['gene_symbol']], expr_data[['gene_symbol']])
  84. print('Initial common genes:', common_genes.shape)
  85. # Narrow down both datasets
  86. final_mut = pd.merge(pd.Series(common_genes, name='gene_symbol'), mut_data, how='left', on='gene_symbol')
  87. final_mut = pd.merge(pd.Series(common_donors, name='icgc_donor_id'), final_mut, how='left', on='icgc_donor_id')
  88. final_expr = pd.merge(pd.Series(common_genes, name='gene_symbol'), expr_data, how='left', on='gene_symbol')
  89. final_expr = pd.merge(pd.Series(common_donors, name='icgc_donor_id'), final_expr, how='left', on='icgc_donor_id')
  90. updated_common_genes = np.intersect1d(final_expr.gene_symbol.unique(), final_mut.gene_symbol.unique())
  91. updated_common_donor = np.intersect1d(final_expr.icgc_donor_id.unique(), final_mut.icgc_donor_id.unique())
  92. final_mut = pd.merge(pd.Series(updated_common_genes, name='gene_symbol'), final_mut, how='left', on='gene_symbol')
  93. final_mut = pd.merge(pd.Series(updated_common_donor, name='icgc_donor_id'), final_mut, how='left',
  94. on='icgc_donor_id')
  95. final_expr = pd.merge(pd.Series(updated_common_genes, name='gene_symbol'), final_expr, how='left', on='gene_symbol')
  96. final_expr = pd.merge(pd.Series(updated_common_donor, name='icgc_donor_id'), final_expr, how='left',
  97. on='icgc_donor_id')
  98. final_mut.sort_values(by='gene_symbol', inplace=True)
  99. final_expr.sort_values(by='gene_symbol', inplace=True)
  100. sorted_common_genes = np.sort(updated_common_genes)
  101. ### Matrix generation
  102. result_mut = pd.DataFrame(index=updated_common_donor, columns=updated_common_genes).fillna(0)
  103. for idx, row in tqdm(final_mut.drop_duplicates().groupby('icgc_donor_id')['gene_symbol'].apply(list).iteritems()):
  104. result_mut.loc[idx, row] = 1
  105. result_expr = pd.DataFrame(index=updated_common_donor, columns=updated_common_genes).fillna(0)
  106. for idx, row in tqdm(final_expr.drop_duplicates().groupby('icgc_donor_id')['gene_symbol'].apply(list).iteritems()):
  107. result_expr.loc[idx, row] = 1
  108. result_values = result_expr.values * result_mut.values
  109. result = pd.DataFrame(data=result_values, index=updated_common_donor, columns=updated_common_genes)
  110. #### Store results
  111. store_summary(result, OUTPUT_DIR, f'result-{cancer_type}.csv')
  112. def run(args):
  113. if not args.cancer_type:
  114. if args.run_all:
  115. sub_folders = [f.name for f in os.scandir(args.data_path) if f.is_dir()]
  116. for cancer_type in sub_folders:
  117. perform_analysis(args, cancer_type)
  118. else:
  119. raise Exception('Either set --cancer-type or set run_all to True')
  120. if not os.path.exists(f'{args.data_path}/{args.cancer_type}'):
  121. raise Exception('arg --cancer-type is not a valid directory')
  122. perform_analysis(args, args.cancer_type)
  123. if __name__ == '__main__':
  124. parser = argparse.ArgumentParser()
  125. parser.add_argument('--cancer-type', type=str, default='Test')
  126. parser.add_argument('--run-all', type=bool, default=False)
  127. parser.add_argument('--data-path', type=str, default='./data')
  128. parser.add_argument('--genes-path', type=str, default='./data/genes_list.tsv')
  129. parser.add_argument('--expression-path', type=str, default='exp_array.tsv')
  130. parser.add_argument('--mutation-path', type=str, default='simple_somatic_mutation.open.tsv')
  131. parser.add_argument('--output-path', type=str, default='./output')
  132. run(parser.parse_args())