The code and documentation for my BSc project in the area of cancer genomics.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

prepare_mutation.py 3.4KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. import os
  2. import argparse
  3. import numpy as np
  4. import pandas as pd
  5. from tqdm import tqdm
  6. tqdm.pandas()
  7. DATA_DIR = './data'
  def read_data(folder: str, file_path: str):
      """Load a tab-separated file from ``DATA_DIR/<folder>/<file_path>``.

      Only file names ending in ``.tsv`` are read, with the first row used as
      the header. Any other file name yields ``None``.
      """
      is_tsv = file_path.endswith('.tsv')
      if not is_tsv:
          return None

      full_path = f'{DATA_DIR}/{folder}/{file_path}'
      return pd.read_csv(full_path, sep='\t', header=0, low_memory=False)
  def read_data_csv(folder: str, file_path: str):
      """Load a comma-separated file from ``DATA_DIR/<folder>/<file_path>``.

      The first row of the file is used as the header.
      """
      csv_location = f'{DATA_DIR}/{folder}/{file_path}'
      frame = pd.read_csv(csv_location, header=0, low_memory=False)
      return frame
  def read_chunk_by_chunk(folder: str, file_path: str, columns=None):
      """Read a large TSV from ``DATA_DIR/<folder>/<file_path>`` in chunks.

      Args:
          folder: Sub-directory of ``DATA_DIR`` that holds the file.
          file_path: Name of the tab-separated file; its first row is the
              header.
          columns: Optional list of column names to keep, in the order they
              should appear in the result. ``None`` or an empty list keeps
              every column.

      Returns:
          One ``DataFrame`` holding all rows, with a fresh 0..n-1 index.

      Raises:
          ValueError: If a requested column is not present in the file.
      """
      wanted = columns if columns else None
      reader = pd.read_csv(
          f"{DATA_DIR}/{folder}/{file_path}",
          sep='\t',
          header=0,
          low_memory=False,
          # Do not parse columns that would be discarded right away.
          usecols=wanted,
          chunksize=1_000_000,
      )
      # Collect the pieces and concatenate once: growing a frame inside the
      # loop re-copies every previously read row on each iteration.
      # Indexing with `wanted` restores the caller's column order, because
      # `usecols` returns columns in file order.
      pieces = [chunk[wanted] if wanted else chunk for chunk in reader]
      if not pieces:
          return pd.DataFrame()
      return pd.concat(pieces, ignore_index=True)
  def save_tsv(df, output_path, file_path):
      """Write ``df`` as a tab-separated file at ``<output_path>/<file_path>``.

      Args:
          df: The ``DataFrame`` to write; its index is written as well.
          output_path: Destination directory; created (with any missing
              parents) when it does not exist yet.
          file_path: File name inside ``output_path``.
      """
      # exist_ok avoids the check-then-create race of testing for the
      # directory first and creating it afterwards.
      os.makedirs(output_path, exist_ok=True)
      df.to_csv(f'{output_path}/{file_path}', sep='\t')
  def get_mutation_data(cancer_type, mutation_path, mutation_type=None):
      """Load donor / affected-gene mutation rows for one cancer type.

      Reads the ``icgc_donor_id``, ``gene_affected`` and ``mutation_type``
      columns of the mutation file. When ``mutation_type`` is given, only rows
      of that type are kept and the ``mutation_type`` column is removed.
      Rows containing missing values are dropped in either case.
      """
      wanted_columns = ['icgc_donor_id', 'gene_affected', 'mutation_type']
      mutations = read_chunk_by_chunk(cancer_type, mutation_path, wanted_columns)

      if not mutation_type:
          return mutations.dropna()

      matches_type = mutations['mutation_type'] == mutation_type
      selected = mutations[matches_type].drop(columns=['mutation_type'])
      return selected.dropna()
  def get_genes(genes_path, gene_class=None):
      """Load the gene list and return Ensembl-id / symbol pairs.

      Args:
          genes_path: Path to a tab-separated file whose header row contains
              at least ``gene_name`` and ``gene_symbol`` (and ``gene_class``
              when filtering).
          gene_class: Optional value; when given, only rows whose
              ``gene_class`` equals it are kept.

      Returns:
          A ``DataFrame`` with the columns ``gene_ensembl_id`` (renamed from
          ``gene_name``) and ``gene_symbol``.
      """
      genes = pd.read_csv(genes_path, sep='\t', header=0)
      # Drop the first and last character of every symbol (presumably
      # wrapping quotes -- confirm against the gene list). The vectorised
      # accessor leaves a missing symbol as NaN instead of raising.
      genes['gene_symbol'] = genes['gene_symbol'].str[1:-1]
      if gene_class:
          genes = genes[genes['gene_class'] == gene_class]
      pairs = genes[['gene_name', 'gene_symbol']]
      return pairs.rename(columns={'gene_name': 'gene_ensembl_id'})
  def perform_analysis(args, cancer_type):
      """Write the gene-symbol / donor mutation table for one cancer type.

      Loads the gene list from ``args.genes_path`` and the single-base-
      substitution mutations from ``args.mutation_path`` for ``cancer_type``,
      joins them on the Ensembl gene id, and writes the distinct, complete
      rows to ``DATA_DIR/<cancer_type>/symbol_mutation.tsv``.

      Args:
          args: Parsed CLI namespace; reads ``genes_path`` and
              ``mutation_path``.
          cancer_type: Name of the sub-folder of ``DATA_DIR`` to process.
      """
      genes = get_genes(args.genes_path)
      # Flush so the progress message is visible while the slow steps run;
      # without a trailing newline it may otherwise stay buffered until the
      # final 'done'.
      print('Converting', cancer_type, end='...', flush=True)

      mutations = get_mutation_data(
          cancer_type,
          args.mutation_path,
          mutation_type='single base substitution',
      ).rename(columns={'gene_affected': 'gene_ensembl_id'})

      # The left join keeps every gene; genes without a matching mutation get
      # missing values and are removed again by the final dropna().
      symbol_mutations = (
          pd.merge(genes, mutations, how='left', on='gene_ensembl_id')
          .drop(columns=['gene_ensembl_id'])
          .drop_duplicates()
          .dropna()
      )
      symbol_mutations.to_csv(f'{DATA_DIR}/{cancer_type}/symbol_mutation.tsv', sep='\t')
      print('done')
  def run(args):
      """Process one cancer type, or every sub-folder of the data directory.

      Args:
          args: Parsed CLI namespace; reads ``cancer_type``, ``run_all`` and
              ``data_path``.

      Raises:
          ValueError: If neither ``--cancer-type`` nor ``--run-all`` is set.
          NotADirectoryError: If ``<data_path>/<cancer_type>`` is not a
              directory.
      """
      if args.cancer_type:
          if not os.path.isdir(f'{args.data_path}/{args.cancer_type}'):
              raise NotADirectoryError('arg --cancer-type is not a valid directory')
          perform_analysis(args, args.cancer_type)
          return

      if not args.run_all:
          raise ValueError('Either set --cancer-type or set run_all to True')

      # Run-all mode ends here. It must not fall through to the single
      # cancer-type check, which would fail because cancer_type is unset.
      with os.scandir(args.data_path) as entries:
          sub_folders = sorted(entry.name for entry in entries if entry.is_dir())
      for cancer_type in sub_folders:
          perform_analysis(args, cancer_type)
  def str_to_bool(text):
      """Convert a command-line word such as ``True``, ``false``, ``1`` or ``no`` to a bool.

      Raises:
          argparse.ArgumentTypeError: If ``text`` is not a recognised boolean
              word.
      """
      word = text.strip().lower()
      if word in ('true', 't', 'yes', 'y', '1'):
          return True
      if word in ('false', 'f', 'no', 'n', '0', ''):
          return False
      raise argparse.ArgumentTypeError(f'expected a boolean value, got {text!r}')


  if __name__ == '__main__':
      parser = argparse.ArgumentParser()
      parser.add_argument('--cancer-type', type=str)
      # `type=bool` would turn any non-empty word, including "False", into
      # True. Parse the word explicitly; a bare `--run-all` also enables it.
      parser.add_argument('--run-all', type=str_to_bool, nargs='?', const=True, default=False)
      parser.add_argument('--data-path', type=str, default='./data')
      parser.add_argument('--genes-path', type=str, default='./data/genes_list.tsv')
      parser.add_argument('--expression-path', type=str, default='exp_array.tsv')
      parser.add_argument('--mutation-path', type=str, default='simple_somatic_mutation.open.tsv')
      parser.add_argument('--output-path', type=str, default='./output')
      run(parser.parse_args())