The code and documentation for my BSc project in the area of cancer genomics.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

expressionSummarizer.r 1.1KB

1234567891011121314151617181920212223
  # Data folders to process, one per cancer type. Each name is passed as
  # `data_folder` to prepare_expression() by the loop at the end of the script.
  cancer_types <- c(
    "Brain",
    "Breast",
    "Colorectal",
    "Lung",
    "Nervous System",
    "Pancreas",
    "Uterus"
  )

  # Folders whose gene IDs are not used as-is: prepare_expression() strips the
  # ".<digits>" part of each ID and replaces the ID with the gene symbol from
  # "<folder>/<folder>_gene_mapper.csv".
  # NOTE(review): "Blood" is listed here but is not in `cancer_types`, so it is
  # never processed by this script -- confirm whether that is intentional.
  trouble_maker <- c(
    "Blood",
    "Pancreas"
  )
  #' Write the trimmed expression table for one data folder.
  #'
  #' Reads `<data_folder>/exp_array.tsv` (tab-separated), keeps the donor ID,
  #' gene ID and normalized expression columns, and writes the result to
  #' `<data_folder>/expression_data.tsv`. For folders listed in the global
  #' `trouble_maker`, the `.<digits>` part of each gene ID is stripped and the
  #' IDs are replaced by the `gene_symbol` found in
  #' `<data_folder>/<data_folder>_gene_mapper.csv`; rows left with any `NA`
  #' after that join (e.g. no symbol in the mapper) are dropped.
  #'
  #' NOTE(review): the output is produced by `write.csv()`, so it is
  #' comma-separated and includes a row-name column even though the file is
  #' named `.tsv`. The previous call passed `sep = "\t"`, which `write.csv()`
  #' ignores with a warning; the argument is removed here so the file contents
  #' stay exactly as before. Switch to `write.table(sep = "\t")` only together
  #' with whatever reads this file.
  #'
  #' @param data_folder Single string: folder name, relative to the working
  #'   directory, that is also the key looked up in `trouble_maker`.
  #' @return `NULL`, invisibly; called for its side effect of writing the file.
  prepare_expression <- function(data_folder) {
    stopifnot(is.character(data_folder), length(data_folder) == 1L)

    exp_path <- file.path(data_folder, "exp_array.tsv")
    exp_array <- read.csv(exp_path, sep = "\t")[
      c("icgc_donor_id", "gene_id", "normalized_expression_value")
    ]

    if (data_folder %in% trouble_maker) {
      mapper_path <- file.path(
        data_folder,
        sprintf("%s_gene_mapper.csv", data_folder)
      )
      mapper <- read.csv(mapper_path)[c("gene_id", "gene_symbol")]

      # Drop the ".<digits>" part so the IDs can match the mapper's gene_id.
      exp_array$gene_id <- sub("[.][0-9]*", "", exp_array$gene_id)

      # Left join keeps every expression row; unmapped genes get NA symbols
      # and are removed by na.omit() together with any other incomplete row.
      mapped <- merge(x = exp_array, y = mapper, by = "gene_id", all.x = TRUE)
      mapped <- na.omit(mapped)

      # Keep the same three-column layout, with the symbol standing in as
      # gene_id.
      exp_array <- mapped[
        c("icgc_donor_id", "gene_symbol", "normalized_expression_value")
      ]
      colnames(exp_array)[colnames(exp_array) == "gene_symbol"] <- "gene_id"
    }

    # write.csv() always uses "," as separator; passing `sep` only triggers a
    # warning, so it is intentionally not supplied.
    write.csv(
      x = exp_array,
      file = file.path(data_folder, "expression_data.tsv")
    )
    invisible(NULL)
  }
  # Move into the data directory so each cancer-type folder name resolves as a
  # relative path.
  # NOTE(review): setwd() changes the working directory for the rest of the
  # session and assumes the script is started from a specific location --
  # confirm how this script is launched before relying on it.
  setwd("../../1_PreprocessData/data")

  # Process every cancer-type folder in turn, announcing each one first.
  for (c_type in cancer_types) {
    print(sprintf("Working on %s", c_type))
    prepare_expression(c_type)
  }