Step2: Supervised soft TCN assignment
import torch
import torch.nn.functional as F
from torch.nn import Linear
from torch_geometric.loader import DenseDataLoader
from torch_geometric.nn import GCNConv, DenseGraphConv, dense_mincut_pool
from torch_geometric.data import InMemoryDataset
import torch_geometric.transforms as T
import os
import numpy
import datetime
import csv
import shutil
import random
# Hyperparameters
max_nodes = 8300 #This number must be higher than the largest number of cells in each image in the studied dataset.
beta = 0.9
Num_TCN = 2
Num_Epoch = 100
Num_Dimension = 512
LearningRate = 0.0001
MiniBatchSize = 16
## Load dataset from constructed Dataset.
class SpatialOmicsImageDataset(InMemoryDataset):
def __init__(self, root, transform=None, pre_transform=None):
super(SpatialOmicsImageDataset, self).__init__(root, transform, pre_transform)
self.data, self.slices = torch.load(self.processed_paths[0])
@property
def raw_file_names(self):
return []
@property
def processed_file_names(self):
return ['SpatialOmicsImageDataset.pt']
def download(self):
pass
def process(self):
# Read data_list into huge `Data` list.
data, slices = self.collate(data_list)
torch.save((data, slices), self.processed_paths[0])
dataset = SpatialOmicsImageDataset('./', transform=T.ToDense(max_nodes))
class Net(torch.nn.Module):
def __init__(self, in_channels, out_channels, hidden_channels=Num_Dimension):
super(Net, self).__init__()
self.conv1 = DenseGraphConv(in_channels, hidden_channels)
num_cluster1 = Num_TCN #This is a hyperparameter.
self.pool1 = Linear(hidden_channels, num_cluster1)
self.conv3 = DenseGraphConv(hidden_channels, hidden_channels)
self.lin1 = Linear(hidden_channels, hidden_channels)
self.lin2 = Linear(hidden_channels, out_channels)
def forward(self, x, adj, mask=None):
x = F.relu(self.conv1(x, adj, mask))
s = self.pool1(x) #here s is a non-softmax tensor.
x, adj, mc1, o1 = dense_mincut_pool(x, adj, s, mask)
#Save important clustering results_1.
ClusterAssignTensor_1 = s
ClusterAdjTensor_1 = adj
x = self.conv3(x, adj)
x = x.mean(dim=1)
x = F.relu(self.lin1(x))
x = self.lin2(x)
return F.log_softmax(x, dim=-1), mc1, o1, ClusterAssignTensor_1, ClusterAdjTensor_1
def train(epoch):
model.train()
loss_all = 0
loss_CE_all = 0
loss_MinCut_all = 0
for data in train_loader:
data = data.to(device)
optimizer.zero_grad()
out, mc_loss, o_loss, _, _ = model(data.x, data.adj, data.mask)
loss_CE = F.nll_loss(out, data.y.view(-1))
loss_MinCut = mc_loss + o_loss
#loss = F.nll_loss(out, data.y.view(-1)) * (1 - beta) + (mc_loss + o_loss) * beta
loss = loss_CE * (1 - beta) + loss_MinCut * beta
loss.backward()
loss_all += data.y.size(0) * loss.item() #total running loss for a mini-batch.
loss_CE_all += data.y.size(0) * loss_CE.item()
loss_MinCut_all += data.y.size(0) * loss_MinCut.item()
optimizer.step()
return loss_all / len(train_dataset), loss_CE_all / len(train_dataset), loss_MinCut_all / len(train_dataset) #average sample loss for this particular epoch.
@torch.no_grad()
def test(loader):
model.eval()
correct = 0
pr_Table = numpy.zeros([1,4]) #initializing an array.
for data in loader:
data = data.to(device)
#pred = model(data.x, data.adj, data.mask)[0].max(dim=1)[1]
ModelResultPr = model(data.x, data.adj, data.mask)[0]
pred = ModelResultPr.max(dim=1)[1]
correct += pred.eq(data.y.view(-1)).sum().item()
pred_info = numpy.column_stack((numpy.array(torch.exp(ModelResultPr)), numpy.array(pred), numpy.array(data.y.view(-1)))) #cat by columns. And convert log_softmax back to probability.
pr_Table = numpy.row_stack((pr_Table, pred_info)) #cat by rows.
return correct / len(loader.dataset), pr_Table
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
for num_time in range(1, 11): #10 times of k-fold cross-validation.
print(f'This is time: {num_time:02d}')
TimeFolderName = "Time" + str(num_time)
os.makedirs(TimeFolderName) #Creating the Time folder.
#This is for repeated k-fold cross-validation.
#dataset = dataset.shuffle() #checked. It works.
###Below is for 10-fold cross-validation to evaluate model performance.
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
n_partition = len(dataset) // 10 #10-fold cross-validation---This is specific to TNBC 34 images.
#start_point = 0
test_pool = set(numpy.arange(0, len(dataset))) #test_pool initialization for each Time.
for num_fold in range(1, 11): #10-fold cross-validation.
if num_fold == 10:
#end_point = len(dataset)
n_test = len(dataset) - (n_partition*9)
else:
#end_point = n * num_fold
n_test = n_partition
test_list = random.sample(test_pool, n_test)
test_pool = test_pool.difference(set(test_list)) #update test pool by removing used test samples.
train_list = list(set(numpy.arange(0, len(dataset))).difference(set(test_list)))
#print(f'This is fold: {num_fold:02d}, TestStartSample: {start_point:03d}, TestEndSample: {end_point-1:03d}')
print(f'This is fold: {num_fold:02d}, TestSamples: {test_list}')
#test_dataset = dataset[start_point:end_point]
test_dataset = dataset[test_list]
#train_dataset = list(set(dataset).difference(set(test_dataset)))
train_dataset = dataset[train_list]
train_loader = DenseDataLoader(train_dataset, batch_size=MiniBatchSize, shuffle=True) #batch_size=16 is specific to TNBC 34 images.
test_loader = DenseDataLoader(test_dataset, batch_size=1)
#start_point = n * num_fold
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net(dataset.num_features, dataset.num_classes).to(device) #Initialize model for each fold.
optimizer = torch.optim.Adam(model.parameters(), lr=LearningRate)
FoldFolderName = TimeFolderName + "/Fold" + str(num_fold)
os.makedirs(FoldFolderName) #Creating the Fold folder.
filename_0 = FoldFolderName + "/Epoch_TrainLoss.csv"
headers_0 = ["Epoch", "TrainLoss", "TestAccuracy", "TrainLoss_CE", "TrainLoss_MinCut"]
with open(filename_0, "w", newline='') as f0:
f0_csv = csv.writer(f0)
f0_csv.writerow(headers_0)
for epoch in range(1, Num_Epoch+1): #Specify the number of epoch for training in each fold.
train_loss, train_loss_CE, train_loss_MinCut = train(epoch)
test_acc, test_pr = test(test_loader)
#print(f'Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}, Test Acc: {test_acc:.4f}')
with open(filename_0, "a", newline='') as f0:
f0_csv = csv.writer(f0)
f0_csv.writerow([epoch, train_loss, test_acc, train_loss_CE, train_loss_MinCut])
print(f"Final train loss is {train_loss:.4f} with loss_CE of {train_loss_CE:.4f} and loss_MinCut of {train_loss_MinCut:.4f}, and final test accuracy is {test_acc:.4f}")
#print(test_pr)
filename6 = FoldFolderName + "/TestSet_Pr_Pred_Truth.csv"
numpy.savetxt(filename6, test_pr, delimiter=',')
#Extract the soft clustering matrix using the trained model of each fold.
all_sample_loader = DenseDataLoader(dataset, batch_size=1)
EachSample_num = 0
filename_5 = FoldFolderName + "/ModelPrediction.csv"
headers_5 = ["SampleNum", "PredictionCorrectFlag", "TrueLabel", "PredictedLabel"]
with open(filename_5, "w", newline='') as f5:
f5_csv = csv.writer(f5)
f5_csv.writerow(headers_5)
for EachData in all_sample_loader:
EachData = EachData.to(device)
TestModelResult = model(EachData.x, EachData.adj, EachData.mask)
PredLabel = TestModelResult[0].max(dim=1)[1]
CorrectFlag = PredLabel.eq(EachData.y.view(-1)).sum().item()
TrueLableArray = numpy.array(EachData.y.view(-1))
PredLabelArray = numpy.array(PredLabel)
#print(f'Prediction correct flag: {CorrectFlag:01d}, True label: {TrueLableArray}, Predicted label: {PredLabelArray}')
with open(filename_5, "a", newline='') as f5:
f5_csv = csv.writer(f5)
f5_csv.writerow([EachSample_num, CorrectFlag, TrueLableArray, PredLabelArray])
ClusterAssignMatrix1 = TestModelResult[3][0, :, :]
ClusterAssignMatrix1 = torch.softmax(ClusterAssignMatrix1, dim=-1) #checked, consistent with function built in "dense_mincut_pool".
ClusterAssignMatrix1 = ClusterAssignMatrix1.detach().numpy()
filename1 = FoldFolderName + "/ClusterAssignMatrix1_" + str(EachSample_num) + ".csv"
numpy.savetxt(filename1, ClusterAssignMatrix1, delimiter=',')
ClusterAdjMatrix1 = TestModelResult[4][0, :, :]
ClusterAdjMatrix1 = ClusterAdjMatrix1.detach().numpy()
filename2 = FoldFolderName + "/ClusterAdjMatrix1_" + str(EachSample_num) + ".csv"
numpy.savetxt(filename2, ClusterAdjMatrix1, delimiter=',')
NodeMask = EachData.mask
NodeMask = numpy.array(NodeMask)
filename3 = FoldFolderName + "/NodeMask_" + str(EachSample_num) + ".csv"
numpy.savetxt(filename3, NodeMask.T, delimiter=',', fmt='%i') #save as integers.
GraphIdxArray = numpy.array(EachData.graph_idx.view(-1))
filename4 = FoldFolderName + "/GraphIdx_" + str(EachSample_num) + ".csv"
numpy.savetxt(filename4, GraphIdxArray, delimiter=',', fmt='%i') #save as integer.
EachSample_num = EachSample_num + 1
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
Hyperparameters
max_nodes: The largest number of nodes in each KNN gragh, higher than the largest number of cells in each image in the studied dataset.
beta: A weight parameter to balance the minimum cut loss of gragh partitioning and the cross-entropy loss of gragh classification.
Num_TCN: The number of TCNs.
Num_Epoch: The number of training epochs.
Num_Dimension: The dimension of the representation vector.
LearningRate: A tuning parameter in an optimization algorithm that determines the step size at each iteration while moving toward a minimum of a loss function.
MinBatchSize: The minimum number of data samples captured in a training process.
Output
For each fold in each round of the training process, this step generates a folder that contains cluster adjacent matrix, cluster assignment matrix, gragh index, and node mask files and a training loss file.
── Time1
└─ Fold1
| ├─ ClusterAdjMatrix1_0.csv
| ├─ ClusterAssignMatrix1_0.csv
| ├─ GraphIdx_0.csv
| ├─ NodeMask_0.csv
| ├─ Epoch_TrainLoss.csv
| ├─ ModelPrediction.csv
| └─ TestSet_Pr_Pred_Truth.csv
└─ Fold2
└─ ⋮
└─ Fold10
── Time2
── ⋮
── Time10