Source code for dig.sslgraph.dataset.TUDataset

import os.path as osp
from itertools import repeat
import os, shutil, torch
import numpy as np

from torch_geometric.data import InMemoryDataset, download_url, extract_zip
from torch_geometric.io import read_tu_data


[docs]class TUDatasetExt(InMemoryDataset): r"""An extended TUDataset from `Pytorch Geometric <https://pytorch-geometric.readthedocs.io/en/latest/index.html>`_, including a variety of graph kernel benchmark datasets, *e.g.* "IMDB-BINARY", "REDDIT-BINARY" or "PROTEINS". .. note:: Some datasets may not come with any node labels. You can then either make use of the argument :obj:`use_node_attr` to load additional continuous node attributes (if present) or provide synthetic node features using transforms such as like :class:`torch_geometric.transforms.Constant` or :class:`torch_geometric.transforms.OneHotDegree`. Args: root (string): Root directory where the dataset should be saved. name (string): The `name <https://chrsmrrs.github.io/datasets/docs/datasets/>`_ of the dataset. task (string): The evaluation task. Either 'semisupervised' or 'unsupervised'. transform (callable, optional): A function/transform that takes in an :obj:`torch_geometric.data.Data` object and returns a transformed version. The data object will be transformed before every access. (default: :obj:`None`) pre_transform (callable, optional): A function/transform that takes in an :obj:`torch_geometric.data.Data` object and returns a transformed version. The data object will be transformed before being saved to disk. (default: :obj:`None`) pre_filter (callable, optional): A function that takes in an :obj:`torch_geometric.data.Data` object and returns a boolean value, indicating whether the data object should be included in the final dataset. (default: :obj:`None`) use_node_attr (bool, optional): If :obj:`True`, the dataset will contain additional continuous node attributes (if present). (default: :obj:`False`) use_edge_attr (bool, optional): If :obj:`True`, the dataset will contain additional continuous edge attributes (if present). (default: :obj:`False`) cleaned (bool, optional): If :obj:`True`, the dataset will contain only non-isomorphic graphs. (default: :obj:`False`) processed_filename (string, optional): The name of the processed data file. (default: obj: `data.pt`) """ url = 'https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets' cleaned_url = ('https://raw.githubusercontent.com/nd7141/graph_datasets/master/datasets') def __init__(self, root, name, task, transform=None, pre_transform=None, pre_filter=None, use_node_attr=False, use_edge_attr=False, cleaned=False, processed_filename='data.pt' ): self.processed_filename = processed_filename self.name = name self.cleaned = cleaned self.task = task super(TUDatasetExt, self).__init__(root, transform, pre_transform, pre_filter) if self.task == "semisupervised": self.data, self.slices = torch.load(self.processed_paths[0]) if self.data.x is not None and not use_node_attr: num_node_attributes = self.num_node_attributes self.data.x = self.data.x[:, num_node_attributes:] if self.data.edge_attr is not None and not use_edge_attr: num_edge_attributes = self.num_edge_attributes self.data.edge_attr = self.data.edge_attr[:, num_edge_attributes:] elif self.task == "unsupervised": self.data, self.slices = torch.load(self.processed_paths[0]) if self.data.x is not None and not use_node_attr: num_node_attributes = self.num_node_attributes self.data.x = self.data.x[:, num_node_attributes:] if self.data.edge_attr is not None and not use_edge_attr: num_edge_attributes = self.num_edge_attributes self.data.edge_attr = self.data.edge_attr[:, num_edge_attributes:] if self.data.x is None: edge_index = self.data.edge_index[0, :].numpy() _, num_edge = self.data.edge_index.size() nlist = [edge_index[n] + 1 for n in range(num_edge - 1) if edge_index[n] > edge_index[n + 1]] nlist.append(edge_index[-1] + 1) num_node = np.array(nlist).sum() self.data.x = torch.ones((num_node, 1)) edge_slice = [0] k = 0 for n in nlist: k = k + n edge_slice.append(k) self.slices['x'] = torch.tensor(edge_slice) else: ValueError("Wrong task name") @property def raw_dir(self): name = 'raw{}'.format('_cleaned' if self.cleaned else '') return osp.join(self.root, self.name, name) @property def processed_dir(self): name = 'processed{}'.format('_cleaned' if self.cleaned else '') return osp.join(self.root, self.name, name) @property def num_node_labels(self): if self.data.x is None: return 0 for i in range(self.data.x.size(1)): x = self.data.x[:, i:] if ((x == 0) | (x == 1)).all() and (x.sum(dim=1) == 1).all(): return self.data.x.size(1) - i return 0 @property def num_node_attributes(self): if self.data.x is None: return 0 return self.data.x.size(1) - self.num_node_labels @property def num_edge_labels(self): if self.data.edge_attr is None: return 0 for i in range(self.data.edge_attr.size(1)): if self.data.edge_attr[:, i:].sum() == self.data.edge_attr.size(0): return self.data.edge_attr.size(1) - i return 0 @property def num_edge_attributes(self): if self.data.edge_attr is None: return 0 return self.data.edge_attr.size(1) - self.num_edge_labels @property def raw_file_names(self): names = ['A', 'graph_indicator'] return ['{}_{}.txt'.format(self.name, name) for name in names] @property def processed_file_names(self): return self.processed_filename
[docs] def download(self): url = self.cleaned_url if self.cleaned else self.url folder = osp.join(self.root, self.name) path = download_url('{}/{}.zip'.format(url, self.name), folder) extract_zip(path, folder) os.unlink(path) shutil.rmtree(self.raw_dir) os.rename(osp.join(folder, self.name), self.raw_dir)
[docs] def process(self): self.data, self.slices = read_tu_data(self.raw_dir, self.name) if self.pre_filter is not None: data_list = [self.get(idx) for idx in range(len(self))] data_list = [data for data in data_list if self.pre_filter(data)] self.data, self.slices = self.collate(data_list) if self.pre_transform is not None: data_list = [self.get(idx) for idx in range(len(self))] data_list = [self.pre_transform(data) for data in data_list] self.data, self.slices = self.collate(data_list) torch.save((self.data, self.slices), self.processed_paths[0])
def __repr__(self): return '{}({})'.format(self.name, len(self)) def get_num_feature(self): data = self.data.__class__() for key in self.data.keys: item, slices = self.data[key], self.slices[key] if torch.is_tensor(item): s = list(repeat(slice(None), item.dim())) s[self.data.__cat_dim__(key, item)] = slice(slices[0], slices[0 + 1]) else: s = slice(slices[idx], slices[idx + 1]) data[key] = item[s] _, num_feature = data.x.size() return num_feature
[docs] def get(self, idx): data = self.data.__class__() for key in self.data.keys: if key == 'num_nodes': continue item, slices = self.data[key], self.slices[key] if torch.is_tensor(item): s = list(repeat(slice(None), item.dim())) s[self.data.__cat_dim__(key, item)] = slice(slices[idx], slices[idx + 1]) else: s = slice(slices[idx], slices[idx + 1]) data[key] = item[s] if self.task == "unsupervised": node_num = data.edge_index.max() sl = torch.tensor([[n, n] for n in range(node_num)]).t() data.edge_index = torch.cat((data.edge_index, sl), dim=1) return data