The code and documentation for my BSc project in the area of cancer genomics.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

geneIdConverter.r 1.1KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  # Convert the gene identifiers found in an expression table to HGNC gene
  # symbols and write the mapping to ./converted_genes.csv.
  #
  # Input : a tab-separated file with a `gene_id` column (path in `exp_path`).
  # Output: a CSV with the columns `initial_id` (identifier that was looked up)
  #         and `Gene` (mapped symbol).

  #### Install and library ####
  # install.packages("biotools")
  required_pkgs <- c("illuminaHumanv4.db", "clusterProfiler", "org.Hs.eg.db")

  # BiocManager is only the installer; it does not need to be attached.
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }

  # Check every package on its own, so that a machine which already has
  # BiocManager but lacks one of the annotation packages is still set up.
  is_installed <- vapply(
    required_pkgs,
    requireNamespace,
    logical(1),
    quietly = TRUE
  )
  missing_pkgs <- required_pkgs[!is_installed]
  if (length(missing_pkgs) > 0) {
    BiocManager::install(missing_pkgs)
  }

  library("illuminaHumanv4.db")
  library("clusterProfiler")
  library("org.Hs.eg.db")

  #### read genes ####
  exp_path <- "../../1_PreprocessData/data/Blood/exp_array.tsv"
  genes <- read.csv(exp_path, sep = "\t")[["gene_id"]]

  # `[[` returns NULL for a missing column; fail with a clear message instead
  # of a cryptic "argument is of length zero" further down.
  if (is.null(genes) || length(genes) == 0) {
    stop("No 'gene_id' values found in ", exp_path, call. = FALSE)
  }
  genes <- as.character(genes)

  #### Preprocess the genes ####
  src <- "ENSEMBL"
  dst <- "SYMBOL"

  # The identifier type is inferred from the first non-missing id only.
  first_id <- genes[!is.na(genes)][1]
  if (is.na(first_id)) {
    stop("All 'gene_id' values in ", exp_path, " are missing", call. = FALSE)
  }

  if (startsWith(first_id, "ENS")) { # ENSEMBL + version
    # Remove the first ".<digits>" run (the version suffix) from each id.
    genes <- sub("[.][0-9]*", "", genes)
  } else if (startsWith(first_id, "NM_") || startsWith(first_id, "NR_")) { # RefSeq
    # NOTE(review): RefSeq ids are passed on unchanged; if the input carries
    # version suffixes (e.g. ".6") they are not stripped here - confirm.
    src <- "REFSEQ"
  }

  #### Convert probe Id to gene symbol ####
  # bitr() already returns a data frame; drop = TRUE discards ids without a
  # mapping.
  df <- bitr(
    genes,
    fromType = src,
    toType = dst,
    OrgDb = org.Hs.eg.db,
    drop = TRUE
  )
  colnames(df) <- c("initial_id", "Gene")

  # For Illumina (Such as Pancreas)
  # df <- data.frame(Gene=unlist(mget(x = genes, envir = illuminaHumanv4SYMBOL)))

  # Row names are written as well (write.csv default), so the file starts with
  # an unnamed index column.
  write.csv(x = df, file = "./converted_genes.csv")