Step2: Unsupervised soft TCN assignment
import torch
import torch.nn.functional as F
from torch.nn import Linear
from torch_geometric.loader import DenseDataLoader
from torch_geometric.nn import DenseGraphConv, dense_mincut_pool
from torch_geometric.data import InMemoryDataset
import torch_geometric.transforms as T
import os
import numpy
import datetime
import csv
import shutil
# Hyper-parameters
max_nodes = 6000 # This number must be higher than the largest number of cells in each image in the studied dataset.
Num_Run = 20
Num_Epoch = 3000
Num_TCN = 9
Num_Dimension = 128
LearningRate = 0.0001
## Load dataset from constructed Dataset.
class SpatialOmicsImageDataset(InMemoryDataset):
def __init__(self, root, transform=None, pre_transform=None):
super(SpatialOmicsImageDataset, self).__init__(root, transform, pre_transform)
self.data, self.slices = torch.load(self.processed_paths[0])
@property
def raw_file_names(self):
return []
@property
def processed_file_names(self):
return ['SpatialOmicsImageDataset.pt']
def download(self):
pass
def process(self):
# Read data_list into huge `Data` list.
data, slices = self.collate(data_list)
torch.save((data, slices), self.processed_paths[0])
dataset = SpatialOmicsImageDataset('./', transform=T.ToDense(max_nodes))
class Net(torch.nn.Module):
def __init__(self, in_channels, out_channels, hidden_channels=Num_Dimension):
super(Net, self).__init__()
self.conv1 = DenseGraphConv(in_channels, hidden_channels)
num_cluster1 = Num_TCN #This is a hyperparameter.
self.pool1 = Linear(hidden_channels, num_cluster1)
def forward(self, x, adj, mask=None):
x = F.relu(self.conv1(x, adj, mask))
s = self.pool1(x) #Here "s" is a non-softmax tensor.
x, adj, mc1, o1 = dense_mincut_pool(x, adj, s, mask)
#Save important clustering results_1.
ClusterAssignTensor_1 = s
ClusterAdjTensor_1 = adj
return F.log_softmax(x, dim=-1), mc1, o1, ClusterAssignTensor_1, ClusterAdjTensor_1
def train(epoch):
model.train()
loss_all = 0
for data in train_loader:
data = data.to(device)
optimizer.zero_grad()
out, mc_loss, o_loss, _, _ = model(data.x, data.adj, data.mask)
loss = mc_loss + o_loss
loss.backward()
loss_all += loss.item()
optimizer.step()
return loss_all
normal_index = [44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55] #Image index for AnimalID=1. (Example female in the original data paper.)
train_index = normal_index[2:3] #Extract a single graph.
train_dataset = dataset[train_index]
train_loader = DenseDataLoader(train_dataset, batch_size=1)
all_sample_loader = DenseDataLoader(train_dataset, batch_size=1)
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
run_number = 1
while run_number <= Num_Run: #Generate multiple independent runs for soft consensus clustering.
print(f"This is Run{run_number:02d}")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net(dataset.num_features, dataset.num_classes).to(device) #Initializing the model.
optimizer = torch.optim.Adam(model.parameters(), lr=LearningRate)
RunFolderName = "Run" + str(run_number)
os.makedirs(RunFolderName) #Creating the Run folder.
filename_0 = RunFolderName + "/Epoch_TrainLoss.csv"
headers_0 = ["Epoch", "TrainLoss"]
with open(filename_0, "w", newline='') as f0:
f0_csv = csv.writer(f0)
f0_csv.writerow(headers_0)
previous_loss = float("inf") #Initialization.
for epoch in range(1, Num_Epoch+1): #Specify the number of epoch in each independent run.
train_loss = train(epoch)
#print(f"Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}")
with open(filename_0, "a", newline='') as f0:
f0_csv = csv.writer(f0)
f0_csv.writerow([epoch, train_loss])
if train_loss == 0 and train_loss == previous_loss: #If two consecutive losses are both zeros, training gets stuck.
break #stop the training.
else:
previous_loss = train_loss
print(f"Final train loss is {train_loss:.4f}")
if train_loss >= -0.6: #This is an empirical cutoff for final train loss.
shutil.rmtree(RunFolderName) #Remove the specific folder and all files below it for re-creating the Run folder.
continue #restart this run.
#Extract the soft clustering matrix using the trained model.
for EachData in all_sample_loader:
EachData = EachData.to(device)
TestModelResult = model(EachData.x, EachData.adj, EachData.mask)
ClusterAssignMatrix1 = TestModelResult[3][0, :, :]
ClusterAssignMatrix1 = torch.softmax(ClusterAssignMatrix1, dim=-1) #Checked, consistent with function built in "dense_mincut_pool".
ClusterAssignMatrix1 = ClusterAssignMatrix1.detach().numpy()
filename1 = RunFolderName + "/ClusterAssignMatrix1.csv"
numpy.savetxt(filename1, ClusterAssignMatrix1, delimiter=',')
ClusterAdjMatrix1 = TestModelResult[4][0, :, :]
ClusterAdjMatrix1 = ClusterAdjMatrix1.detach().numpy()
filename2 = RunFolderName + "/ClusterAdjMatrix1.csv"
numpy.savetxt(filename2, ClusterAdjMatrix1, delimiter=',')
NodeMask = EachData.mask
NodeMask = numpy.array(NodeMask)
filename3 = RunFolderName + "/NodeMask.csv"
numpy.savetxt(filename3, NodeMask.T, delimiter=',', fmt='%i') #save as integers.
GraphIdxArray = numpy.array(EachData.graph_idx.view(-1))
filename4 = RunFolderName + "/GraphIdx.csv"
numpy.savetxt(filename4, GraphIdxArray, delimiter=',', fmt='%i') #save as integer.
run_number = run_number + 1
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
Hyperparameters
max_nodes: The largest number of nodes in each KNN gragh, higher than the largest number of cells in each image in the studied dataset.
Num_Run: The number of runs.
Num_Epoch: The number of training epochs.
Num_TCN: The number of TCNs.
Num_Dimension: The dimension of the representation vector.
LearningRate: A tuning parameter in an optimization algorithm that determines the step size at each iteration while moving toward a minimum of a loss function.
Output
This step generates a folder for each run that contains a cluster adjacent matrix, a cluster assignment matrix, a node mask, a gragh index file and a loss recording file.
── Run1
├─ ClusterAdjMatrix1.csv
├─ ClusterAssignMatrix1.csv
├─ Epoch_TrainLoss.csv
├─ GraphIdx.csv
└─ NodeMask.csv
── Run2
── ⋮
── Run20