The code and documentation for my BSc project in the area of cancer genomics.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

integrate_matrices.py 4.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
"""Integrate per-cancer-type matrices into train/test CSV files.

For every matrix file found in the matrices directory, the 1-entries of the
matrix are replaced by mutation counts and by normalized expression values
read from the per-cancer-type data directory. The per-type results are then
merged, randomly split into train and test rows, and written as CSV files
to the output directory.
"""
__author__ = "Alireza Tajmirriahi"
__version__ = "1.0.0"
__maintainer__ = "Alireza Tajmirriahi"
# NOTE(review): the address below looks redacted in this copy of the file;
# confirm against the original source.
__email__ = "[email protected]"
__project__ = "CancerGenomics"
  6. import argparse
  7. import os
  8. import numpy as np
  9. import pandas as pd
  10. from tqdm import tqdm
# Module-level defaults. The same values are repeated as the defaults of the
# command-line options declared in the ``__main__`` block.

# Root directory holding one sub-directory of raw omics files per cancer type.
DATA_DIR = '/PROJECTS/Taj/1_PreprocessData/data'
# Directory holding the preprocessed matrices (one file per cancer type).
MATRICES_PATH = '/PROJECTS/Taj/1_PreprocessData/output'
# Directory the train/test CSV files are written to.
OUT_DIR = 'ICGC'
# Minimum number of rows a matrix needs for its cancer type to be included.
# NOTE(review): the trailing "50" looks like a previously used threshold;
# confirm before relying on it.
MIN_SAMPLES = 10 # 50
# Probability with which each row is assigned to the training set.
TRAIN_SPLIT = 0.70
  16. def aggregate_dataframes(data):
  17. cols = sorted(set.union(*[set(df.columns) for df in data.values()]))
  18. to_merge = []
  19. for label, df in data.items():
  20. new_df = df.reindex(columns=cols, fill_value=0)
  21. new_df['label'] = label
  22. to_merge.append(new_df)
  23. return pd.concat(to_merge)
  24. def agg_mutation(mat, cancer_type, file_name):
  25. df = pd.read_csv(f'{args.DATA_DIR}/{cancer_type}/{file_name}', sep='\t')
  26. mutation_count = df.groupby(['icgc_donor_id', 'gene_symbol']).size()
  27. result = mat.copy()
  28. for row in tqdm(result.index):
  29. cols = mat.columns[np.where(mat.loc[row] == 1)]
  30. for col in cols:
  31. result.loc[row][col] = mutation_count[row][col]
  32. return result
  33. def agg_expression(mat, cancer_type, file_name):
  34. df = pd.read_csv(f'{args.DATA_DIR}/{cancer_type}/{file_name}')
  35. df = df.drop(columns=['Unnamed: 0'], axis=1)
  36. df = df.set_index(['icgc_donor_id', 'gene_id'])
  37. df = df[~df.index.duplicated(keep='first')]
  38. result = mat.copy().astype(np.float64)
  39. for row in tqdm(result.index):
  40. cols = mat.columns[np.where(mat.loc[row] == 1)]
  41. for col in cols:
  42. result.loc[row][col] = df.loc[row, col].normalized_expression_value
  43. return result
  44. def main():
  45. file_paths = [*os.walk(MATRICES_PATH)][0][2]
  46. all_dfs = list(map(lambda p: pd.read_csv(f'{args.MATRICES_PATH}/{p}', index_col=0, delimiter='\t'), file_paths))
  47. all_labels = list(map(lambda p: p[7:-4], file_paths))
  48. omic_paths = ['symbol_mutation.tsv', 'expression_data.tsv']
  49. mut_data, exp_data = dict(), dict()
  50. for label, df in zip(all_labels, all_dfs):
  51. if len(df) >= args.MIN_SAMPLES:
  52. print('Aggregating', label, '...')
  53. mut_data[label] = agg_mutation(df, label, omic_paths[0])
  54. exp_data[label] = agg_expression(df, label, omic_paths[1])
  55. print('Merging dataframes', end=' ')
  56. merged_mut = aggregate_dataframes(mut_data)
  57. merged_exp = aggregate_dataframes(exp_data)
  58. print(f'Done')
  59. assert merged_mut.shape == merged_exp.shape
  60. labels = merged_mut.label
  61. train_indices = np.random.choice([True, False], len(merged_mut), p=[args.TRAIN_SPLIT, 1 - args.TRAIN_SPLIT])
  62. labels_train = labels[train_indices]
  63. labels_test = labels[~train_indices]
  64. mut_train = merged_mut[train_indices]
  65. mut_test = merged_mut[~train_indices]
  66. exp_train = merged_exp[train_indices]
  67. exp_test = merged_exp[~train_indices]
  68. if not os.path.exists(args.OUT_DIR):
  69. os.mkdir(args.OUT_DIR)
  70. labels_train.to_csv(f'{args.OUT_DIR}/labels_tr.csv', index=False, header=False)
  71. labels_test.to_csv(f'{args.OUT_DIR}/labels_te.csv', index=False, header=False)
  72. merged_mut.columns.to_frame().to_csv(f'{args.OUT_DIR}/1_featname.csv', index=False, header=False)
  73. merged_exp.columns.to_frame().to_csv(f'{args.OUT_DIR}/2_featname.csv', index=False, header=False)
  74. mut_train.to_csv(f'{args.OUT_DIR}/1_tr.csv', index=False, header=False)
  75. mut_test.to_csv(f'{args.OUT_DIR}/1_te.csv', index=False, header=False)
  76. exp_train.to_csv(f'{args.OUT_DIR}/2_tr.csv', index=False, header=False)
  77. exp_test.to_csv(f'{args.OUT_DIR}/2_te.csv', index=False, header=False)
  78. print('num classes=', len(mut_data.keys()))
  79. # run_mogonet(num_class=len(mut_data.keys()))
  80. if __name__ == '__main__':
  81. parser = argparse.ArgumentParser()
  82. parser.add_argument('--data-dir', action='store', dest='DATA_DIR', type=str,
  83. default='/PROJECTS/Taj/1_PreprocessData/data', help='Path to data')
  84. parser.add_argument('--matrices-path', action='store', dest='MATRICES_PATH', type=str,
  85. default='/PROJECTS/Taj/1_PreprocessData/output', help='Path to preprocessed matrices')
  86. parser.add_argument('--out-dir', action='store', dest='OUT_DIR', type=str, default='ICGC',
  87. help='The output directory')
  88. parser.add_argument('--min-samples', action='store', dest='MIN_SAMPLES', type=int, default=10,
  89. help='Minimum samples required to include a cancer type')
  90. parser.add_argument('--train-split', action='store', dest='TRAIN_SPLIT', type=float, default=0.70,
  91. help='train/test split. By default set to 0.70')
  92. args = parser.parse_args()
  93. main()