Commit b4968c5e authored by Diego Fioravanti
Browse files

Created script to represent the data as a graph

parent af7203a4
This diff is collapsed.
This diff is collapsed.
import os
from collections import OrderedDict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
# Lookup from the three-character prefix at the start of a taxon component
# (e.g. 'k__') to the name of the rank it denotes. Insertion order runs from
# the broadest rank (kingdom) to the narrowest (genus).
taxonomy_ranks = OrderedDict([
    ('k__', 'kingdom'),
    ('p__', 'phylum'),
    ('c__', 'class'),
    ('o__', 'order'),
    ('f__', 'family'),
    ('g__', 'genus'),
])
def empty_taxonomy():
    """Return a fresh taxonomy record with every rank unset.

    Returns:
        OrderedDict: the keys 'kingdom', 'phylum', 'class', 'order',
        'family' and 'genus', in that order, each mapped to ``None``.
        A new object is built on every call, so the caller may fill it
        in without affecting other records.
    """
    ranks = ('kingdom', 'phylum', 'class', 'order', 'family', 'genus')
    # fromkeys keeps the tuple order and defaults every value to None.
    return OrderedDict.fromkeys(ranks)
def split_taxonomy(bacteria):
    """Split a dot-separated taxon label into its rank components.

    A fragment starts a new component when its second and third characters
    are ``'__'`` (as in ``'k__Bacteria'``) or when it is exactly ``'Other'``.
    Any other fragment is treated as the continuation of a name that itself
    contained a dot and is re-joined, with the dot, to the previous
    component.

    Args:
        bacteria (str): label such as ``'k__Bacteria.p__Firmicutes'``.

    Returns:
        list of str: the components in their original order. A leading
        fragment that is not a rank marker becomes a component of its own,
        because there is nothing before it to attach it to.
    """
    components = []
    for fragment in bacteria.split('.'):
        starts_component = fragment.startswith('__', 1) or fragment == 'Other'
        # `not components` guards the very first fragment: indexing [-1] on
        # an empty list would raise IndexError for labels such as
        # 'Unassigned.Other' or the empty string.
        if starts_component or not components:
            components.append(fragment)
        else:
            components[-1] = '.'.join([components[-1], fragment])
    return components
def extract_taxonomy(bacteria):
    """Parse a taxon label into a rank-name -> taxon-name record.

    The label is split with ``split_taxonomy``. Each component other than
    ``'Other'`` is read as a three-character rank prefix followed by the
    taxon name; the prefix is translated through ``taxonomy_ranks`` and the
    name stored under that rank.

    Args:
        bacteria (str): label such as ``'k__Bacteria.p__Firmicutes'``.

    Returns:
        OrderedDict: a record from ``empty_taxonomy()`` in which every rank
        that appeared with a non-empty name is filled in. Ranks given as
        ``'Other'``, ranks with an empty name (e.g. ``'g__'``) and ranks
        absent from the label stay ``None``.

    Raises:
        KeyError: if a component's prefix is not a key of
            ``taxonomy_ranks``.
    """
    taxonomy = empty_taxonomy()
    for component in split_taxonomy(bacteria):
        if component == 'Other':
            continue
        prefix, name = component[:3], component[3:]
        # The lookup happens before the emptiness check, so an unknown
        # prefix is reported even when no name follows it.
        rank = taxonomy_ranks[prefix]
        if name:
            taxonomy[rank] = name
    return taxonomy
# Dataset selector: it is substituted into the folder name and into every
# file name built below.
DISEASE = 'CDf'
# Root directory of the input files, relative to the working directory.
DATA_DIR = 'true_data'

# ============================================
# -- DATA loading section
# ============================================
DISEASE_FOLDER = ''.join(['HS_', DISEASE])
TRAINING_DATA_FILEPATH = os.path.join(
    DATA_DIR, DISEASE_FOLDER,
    ''.join(['Sokol_16S_taxa_HS_', DISEASE, '_commsamp_training.txt']))
# Note the lower-cased disease tag in the coordinates file name.
COORDINATES_FILEPATH = os.path.join(
    DATA_DIR, 'coordinates',
    ''.join(['coordinates_', DISEASE.lower(), '.txt']))
TRAINING_LABELS_FILEPATH = os.path.join(
    DATA_DIR, DISEASE_FOLDER,
    ''.join(['Sokol_16S_taxa_HS_', DISEASE, '_commsamp_training_lab.txt']))
TEST_DATA_FILEPATH = os.path.join(
    DATA_DIR, DISEASE_FOLDER,
    ''.join(['Sokol_16S_taxa_HS_', DISEASE, '_commsamp_test.txt']))
TEST_LABELS_FILEPATH = os.path.join(
    DATA_DIR, DISEASE_FOLDER,
    ''.join(['Sokol_16S_taxa_HS_', DISEASE, '_commsamp_test_lab.txt']))

# Tab-separated tables; the first column of each data table becomes the index.
training = pd.read_csv(TRAINING_DATA_FILEPATH, sep='\t', index_col=0)
test = pd.read_csv(TEST_DATA_FILEPATH, sep='\t', index_col=0)

# Label files are read as integers into a single column named ' '. Because
# `names` is given, pandas treats the first line as data, not as a header.
# `int` replaces `np.int` (an alias of the builtin that NumPy has removed),
# and `names` is a list because pandas requires an ordered collection.
training_lab = pd.read_csv(TRAINING_LABELS_FILEPATH, sep='\t', dtype=int,
                           names=[' '])
test_lab = pd.read_csv(TEST_LABELS_FILEPATH, sep='\t', dtype=int,
                       names=[' '])
# ============================================
# -- Process samples to extract taxonomy
# ============================================
# Undirected graph whose nodes are taxon names; swap in nx.DiGraph() to keep
# the broader-rank -> narrower-rank direction of each edge.
G = nx.Graph()
bacterias = training.columns
for bacteria in bacterias:
    taxonomy = extract_taxonomy(bacteria)
    previous_name = None
    # Walk the ranks from broadest to narrowest and connect each named rank
    # to the rank immediately before it.
    for name in taxonomy.values():
        if name is not None:
            G.add_node(name)
            if previous_name is not None:
                G.add_edge(previous_name, name)
        # NOTE(review): the source lost its indentation, so it is unclear
        # whether this update sat inside the `if` above. Here an unnamed rank
        # breaks the chain; confirm against the original script.
        previous_name = name

a = nx.adjacency_matrix(G)
print(a)

# Sparsity pattern of the adjacency matrix.
plt.spy(a, precision=0.01, markersize=1)
figure = plt.gcf()
figure.set_size_inches(4, 3)
# Save before show(): once the window is closed, pyplot no longer has this
# figure as the current one and savefig would write an empty image.
plt.savefig('2.jpg')
plt.show()

# Alternative view that was disabled in the original script: draw the graph
# itself with nx.draw using a Graphviz 'dot' layout.
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment