Use trainer.predict to run inference

snap-stanford · Nov 29, 2022 · 5e5d0d7 · 5e5d0d7
1 parent a842429
commit 5e5d0d7
Showing 1 changed file with 35 additions and 7 deletions.
diff --git a/examples/lsc/mag240m/gnn.py b/examples/lsc/mag240m/gnn.py
@@ -31,6 +31,8 @@
 from typing import Dict, Tuple
 from torch_geometric.data import Batch
 from torch_geometric.data import LightningNodeData
+import pathlib
+from torch.profiler import ProfilerActivity, profile
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
@@ -69,7 +71,6 @@ def __init__(self, model: str, in_channels: int, out_channels: int,
         self.conv1 = SAGEConv(in_channels, hidden_channels)
         self.conv2 = SAGEConv(hidden_channels, hidden_channels)
         self.lin = Linear(hidden_channels, out_channels)
-        # self.relu = ReLU(inplace=True)
 
     def forward(self, x: Tensor, edge_index: Adj) -> Tensor:
         x = x.to(torch.float)
@@ -104,31 +105,45 @@ def common_step(self, batch: Batch) -> Tuple[Tensor, Tensor]:
         y = batch['paper'].y[:batch_size].to(torch.long)
         return y_hat, y
 
-    def training_step(self, batch: Batch, batch_idx: int) -> Tensor:
+    def training_step(self, batch: Batch) -> Tensor:
         y_hat, y = self.common_step(batch)
         train_loss = F.cross_entropy(y_hat, y)
         self.train_acc(y_hat.softmax(dim=-1), y)
         self.log('train_acc', self.train_acc, prog_bar=True, on_step=False,
                  on_epoch=True)
         return train_loss
 
-    def validation_step(self, batch: Batch, batch_idx: int):
+    def validation_step(self, batch: Batch):
         y_hat, y = self.common_step(batch)
         self.val_acc(y_hat.softmax(dim=-1), y)
         self.log('val_acc', self.val_acc, on_step=False, on_epoch=True,
                  prog_bar=True, sync_dist=True)
 
-    def test_step(self, batch: Batch, batch_idx: int):
+    def test_step(self, batch: Batch):
         y_hat, y = self.common_step(batch)
         self.test_acc(y_hat.softmax(dim=-1), y)
         self.log('test_acc', self.test_acc, on_step=False, on_epoch=True,
                  prog_bar=True, sync_dist=True)
 
+    def predict_step(self, batch: Batch):
+        y_hat, y = self.common_step(batch)
+        return y_hat
+
     def configure_optimizers(self):
         optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
         scheduler = StepLR(optimizer, step_size=25, gamma=0.25)
         return [optimizer], [scheduler]
 
+def trace_handler(p):
+    if torch.cuda.is_available():
+        profile_sort = 'self_cuda_time_total'
+    else:
+        profile_sort = 'self_cpu_time_total'
+    output = p.key_averages().table(sort_by=profile_sort)
+    print(output)
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    timeline_file = profile_dir + 'timeline' + '.json'
+    p.export_chrome_trace(timeline_file)
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -142,14 +157,20 @@ def configure_optimizers(self):
     parser.add_argument('--in-memory', action='store_true')
     parser.add_argument('--device', type=str, default='0')
     parser.add_argument('--evaluate', action='store_true')
+    parser.add_argument('--profile', action='store_true')
     args = parser.parse_args()
     args.sizes = [int(i) for i in args.sizes.split('-')]
     print(args)
 
     seed_everything(42)
     dataset = MAG240MDataset(ROOT)
     data = dataset.to_pyg_hetero_data()
-    datamodule = MAG240M(data, loader='neighbor', num_neighbors=args.sizes, batch_size=args.batch_size, num_workers=2)
+    datamodule = MAG240M(data, ('paper', data['paper'].train_mask),
+                        ('paper', data['paper'].val_mask),
+                        ('paper', data['paper'].test_mask),
+                        ('paper', data['paper'].test_mask),
+                        loader='neighbor', num_neighbors=args.sizes,
+                        batch_size=args.batch_size, num_workers=2)
     print(datamodule)
 
     if not args.evaluate:
@@ -160,7 +181,9 @@ def configure_optimizers(self):
         checkpoint_callback = ModelCheckpoint(monitor='val_acc', mode = 'max', save_top_k=1)
         trainer = Trainer(accelerator="cpu", max_epochs=args.epochs,
                           callbacks=[checkpoint_callback],
-                          default_root_dir=f'logs/{args.model}')
+                          default_root_dir=f'logs/{args.model}',
+                          limit_train_batches=10, limit_test_batches=10,
+                          limit_val_batches=10, limit_predict_batches=10)
         trainer.fit(model, datamodule=datamodule)
 
     if args.evaluate:
@@ -177,7 +200,12 @@ def configure_optimizers(self):
         datamodule.batch_size = 16
         datamodule.sizes = [160] * len(args.sizes)  # (Almost) no sampling...
 
-        trainer.test(model=model, datamodule=datamodule)
+        trainer.predict(model=model, datamodule=datamodule)
+        if args.profile:
+            with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                            on_trace_ready=trace_handler) as p:
+                trainer.predict(model=model, datamodule=datamodule)
+                p.step()
 
         # evaluator = MAG240MEvaluator()
         # loader = datamodule.hidden_test_dataloader()