Commit 68546ea4 authored by Alessia Marcolini
Browse files

Add script to download tissue slides from TCGA

parent 6d443098
#%%
import os
from pathlib import Path
import pandas as pd
import requests
from tqdm import tqdm
# %%
# Root folder of the TCGA breast dataset and the sub-folder receiving the slides.
DATA_DIR = Path('data/tcga_breast')
SVS_DIR = DATA_DIR.joinpath('svs')

# Table of slides to fetch; the download loop reads its 'uuid' and 'filename' columns.
csv = pd.read_csv(DATA_DIR.joinpath('TCGA-BRCA_slides_filename_uuid.csv'))

# Create the output folder (and any missing parents); a no-op if it already exists.
SVS_DIR.mkdir(parents=True, exist_ok=True)
#%%
# Download every slide listed in `csv` from the GDC data endpoint into SVS_DIR.
# `total` is passed explicitly because iterrows() is a generator, so tqdm
# cannot work out the number of rows on its own.
for _, row in tqdm(csv.iterrows(), total=len(csv)):
    uuid = row['uuid']
    filename = row['filename']
    print(uuid)

    data_endpt = "https://api.gdc.cancer.gov/data/{}".format(uuid)

    # The server also reports a file name in the Content-Disposition header,
    # but the name from the CSV is the one used for the output file.
    output_path = SVS_DIR / filename
    # Write to a sibling ".part" file first so that an interrupted transfer
    # never leaves a truncated slide under its final name.
    partial_path = output_path.with_name(output_path.name + '.part')

    # stream=True avoids holding the whole file in memory (slides can be very
    # large); the context manager releases the connection when done.
    with requests.get(
        data_endpt,
        headers={"Content-Type": "application/json"},
        stream=True,
    ) as response:
        # Fail loudly on HTTP errors instead of saving an error body as a slide.
        response.raise_for_status()
        with open(partial_path, "wb") as output_file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                output_file.write(chunk)

    # Only a fully written download is moved to its final name.
    partial_path.replace(output_path)
# %%
# %%
import os
from pathlib import Path
import numpy as np
import pandas as pd
# %%
# Build the slide table (Sample, filename, uuid) by joining the clinical table
# with the filename/uuid table, and save it as a CSV under DATA_DIR.
DATA_DIR = Path('data/tcga_breast')
# %%
# Both inputs are tab-separated.
clinical = pd.read_csv(DATA_DIR / 'clinical.csv', sep='\t')
filenames_uuids = pd.read_csv(DATA_DIR / 'TCGA_BRCA_filename_uuid.tsv', sep='\t')
# %%
patient_column = 'Sample'
filename_column = 'patient.samples.sample.portions.portion.slides.slide.image_file_name'
# rename() without inplace returns a new frame, which avoids mutating a
# selection of `clinical` (and the SettingWithCopyWarning that comes with it).
patients_filenames = clinical[[patient_column, filename_column]].rename(
    columns={filename_column: 'filename'}
)
# %%
# Lower-cased copy of the file name used as the join key.
# NOTE(review): presumably the clinical table stores file names in lower case - confirm.
filenames_uuids['filename_lower'] = filenames_uuids['filename'].str.lower()
# %%
# Both frames have a 'filename' column, so the merge suffixes them:
# 'filename_x' comes from the clinical table, 'filename_y' from the uuid table.
merged = pd.merge(patients_filenames, filenames_uuids, left_on='filename', right_on='filename_lower')
# %%
# Keep the uuid table's original file name and expose 'id' as 'uuid'.
merged = merged[[patient_column, 'filename_y', 'id']].rename(
    columns={'filename_y': 'filename', 'id': 'uuid'}
)
# %%
merged.to_csv(DATA_DIR / 'TCGA-BRCA_slides_filename_uuid.csv', index=False)
# %%
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment