
Add 2nd stage codes

master
AlirezaT99, 10 months ago
commit 4c528da8eb
2 changed files with 127 additions and 0 deletions
1. 2-IntegrateData/README.md (+8, -0)
2. 2-IntegrateData/integrate_matrices.py (+119, -0)

2-IntegrateData/README.md (+8, -0)

@@ -0,0 +1,8 @@
## Integrate data
The code in this directory takes the mutation and expression data for each cancer type as input and produces matrices that follow [MOGONET](https://github.com/txWang/MOGONET)'s input format.

Run the following to see the available arguments and their defaults:

```bash
python integrate_matrices.py --help
```
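
For reference, a typical invocation with every option spelled out might look like the following (the paths shown are simply the script's defaults; adjust them to your own setup):

```bash
python integrate_matrices.py \
    --data-dir /PROJECTS/Taj/1_PreprocessData/data \
    --matrices-path /PROJECTS/Taj/1_PreprocessData/output \
    --out-dir ICGC \
    --min-samples 10 \
    --train-split 0.70
```

The script writes the MOGONET-style files (`labels_tr.csv`, `labels_te.csv`, `1_featname.csv`, `2_featname.csv`, `1_tr.csv`, `1_te.csv`, `2_tr.csv`, `2_te.csv`) to the chosen output directory.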

2-IntegrateData/integrate_matrices.py (+119, -0)

@@ -0,0 +1,119 @@
__author__ = "Alireza Tajmirriahi"
__version__ = "1.0.0"
__maintainer__ = "Alireza Tajmirriahi"
__email__ = "[email protected]"
__project__ = "CancerGenomics"

import argparse
import os

import numpy as np
import pandas as pd

from tqdm import tqdm

# Default settings; each can be overridden via the command-line arguments parsed below
DATA_DIR = '/PROJECTS/Taj/1_PreprocessData/data'
MATRICES_PATH = '/PROJECTS/Taj/1_PreprocessData/output'
OUT_DIR = 'ICGC'
MIN_SAMPLES = 10  # 50
TRAIN_SPLIT = 0.70


def aggregate_dataframes(data):
    # Align every per-cancer-type matrix on the union of all gene columns,
    # tag each row with its cancer-type label, and stack them into one matrix.
    cols = sorted(set.union(*[set(df.columns) for df in data.values()]))
    to_merge = []
    for label, df in data.items():
        new_df = df.reindex(columns=cols, fill_value=0)
        new_df['label'] = label
        to_merge.append(new_df)
    return pd.concat(to_merge)


def agg_mutation(mat, cancer_type, file_name):
    # Replace each binary mutation indicator with the donor's mutation count for that gene.
    df = pd.read_csv(f'{args.DATA_DIR}/{cancer_type}/{file_name}', sep='\t')
    mutation_count = df.groupby(['icgc_donor_id', 'gene_symbol']).size()

    result = mat.copy()
    for row in tqdm(result.index):
        cols = mat.columns[np.where(mat.loc[row] == 1)]
        for col in cols:
            result.loc[row, col] = mutation_count.loc[(row, col)]
    return result


def agg_expression(mat, cancer_type, file_name):
    # Replace each binary expression indicator with the donor's normalized expression value for that gene.
    df = pd.read_csv(f'{args.DATA_DIR}/{cancer_type}/{file_name}')
    df = df.drop(columns=['Unnamed: 0'])
    df = df.set_index(['icgc_donor_id', 'gene_id'])
    df = df[~df.index.duplicated(keep='first')]

    result = mat.copy().astype(np.float64)
    for row in tqdm(result.index):
        cols = mat.columns[np.where(mat.loc[row] == 1)]
        for col in cols:
            result.loc[row, col] = df.loc[(row, col), 'normalized_expression_value']
    return result


def main():
    # One preprocessed binary matrix per cancer type lives directly under the matrices path.
    file_paths = next(os.walk(args.MATRICES_PATH))[2]
    all_dfs = list(map(lambda p: pd.read_csv(f'{args.MATRICES_PATH}/{p}', index_col=0, delimiter='\t'), file_paths))
    all_labels = list(map(lambda p: p[7:-4], file_paths))  # derive the cancer-type label from the file name

    omic_paths = ['symbol_mutation.tsv', 'expression_data.tsv']
    mut_data, exp_data = dict(), dict()

    for label, df in zip(all_labels, all_dfs):
        if len(df) >= args.MIN_SAMPLES:
            print('Aggregating', label, '...')
            mut_data[label] = agg_mutation(df, label, omic_paths[0])
            exp_data[label] = agg_expression(df, label, omic_paths[1])

    print('Merging dataframes', end=' ')
    merged_mut = aggregate_dataframes(mut_data)
    merged_exp = aggregate_dataframes(exp_data)
    print('Done')

    assert merged_mut.shape == merged_exp.shape
    labels = merged_mut.label

    # Randomly assign each sample to the train or test split.
    train_indices = np.random.choice([True, False], len(merged_mut), p=[args.TRAIN_SPLIT, 1 - args.TRAIN_SPLIT])

    labels_train = labels[train_indices]
    labels_test = labels[~train_indices]
    mut_train = merged_mut[train_indices]
    mut_test = merged_mut[~train_indices]
    exp_train = merged_exp[train_indices]
    exp_test = merged_exp[~train_indices]

    # Write the matrices in MOGONET's expected file layout.
    if not os.path.exists(args.OUT_DIR):
        os.mkdir(args.OUT_DIR)
    labels_train.to_csv(f'{args.OUT_DIR}/labels_tr.csv', index=False, header=False)
    labels_test.to_csv(f'{args.OUT_DIR}/labels_te.csv', index=False, header=False)
    merged_mut.columns.to_frame().to_csv(f'{args.OUT_DIR}/1_featname.csv', index=False, header=False)
    merged_exp.columns.to_frame().to_csv(f'{args.OUT_DIR}/2_featname.csv', index=False, header=False)
    mut_train.to_csv(f'{args.OUT_DIR}/1_tr.csv', index=False, header=False)
    mut_test.to_csv(f'{args.OUT_DIR}/1_te.csv', index=False, header=False)
    exp_train.to_csv(f'{args.OUT_DIR}/2_tr.csv', index=False, header=False)
    exp_test.to_csv(f'{args.OUT_DIR}/2_te.csv', index=False, header=False)

    print('num classes=', len(mut_data.keys()))
    # run_mogonet(num_class=len(mut_data.keys()))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('--data-dir', action='store', dest='DATA_DIR', type=str,
                        default=DATA_DIR, help='Path to the per-cancer-type mutation/expression data')
    parser.add_argument('--matrices-path', action='store', dest='MATRICES_PATH', type=str,
                        default=MATRICES_PATH, help='Path to the preprocessed matrices')
    parser.add_argument('--out-dir', action='store', dest='OUT_DIR', type=str, default=OUT_DIR,
                        help='The output directory')
    parser.add_argument('--min-samples', action='store', dest='MIN_SAMPLES', type=int, default=MIN_SAMPLES,
                        help='Minimum number of samples required to include a cancer type')
    parser.add_argument('--train-split', action='store', dest='TRAIN_SPLIT', type=float, default=TRAIN_SPLIT,
                        help='Fraction of samples used for training (default: 0.70)')

    args = parser.parse_args()
    main()
