From b10c39acfbaf4fbfc0563746c5dd9ceef3b7a17c Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 20 Jun 2023 15:09:32 +0800 Subject: [PATCH 01/25] support training dsvt --- ...ecfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py | 97 ++++- projects/DSVT/dsvt/dsvt.py | 6 +- projects/DSVT/dsvt/dsvt_head.py | 367 +++++++++++++++++- projects/DSVT/dsvt/utils.py | 141 ++++++- 4 files changed, 594 insertions(+), 17 deletions(-) diff --git a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py index ac7a38b9bc..f9288f09e3 100644 --- a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py +++ b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py @@ -88,25 +88,28 @@ loss_cls=dict( type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), loss_bbox=dict(type='mmdet.L1Loss', reduction='mean', loss_weight=2.0), + loss_iou=dict(type='mmdet.L1Loss', reduction='none', loss_weight=1.0), + loss_reg_iou=dict( + type='mmdet3d.DIoU3DLoss', reduction='mean', loss_weight=2.0), norm_bbox=True), # model training and testing settings train_cfg=dict( - pts=dict( - grid_size=grid_size, - voxel_size=voxel_size, - out_size_factor=4, - dense_reg=1, - gaussian_overlap=0.1, - max_objs=500, - min_radius=2, - code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])), + grid_size=grid_size, + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=1, + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]), test_cfg=dict( max_per_img=500, max_pool_nms=False, min_radius=[4, 12, 10, 1, 0.85, 0.175], iou_rectifier=[[0.68, 0.71, 0.65]], pc_range=[-80, -80], - out_size_factor=4, + out_size_factor=1, voxel_size=voxel_size[:2], nms_type='rotate', multi_class_nms=True, @@ -149,11 +152,16 @@ # remove_close=True), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05], - translation_std=[0.5, 0.5, 0]), + translation_std=[0.5, 0.5, 0.5]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), @@ -191,6 +199,26 @@ ] dataset_type = 'WaymoDataset' +train_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='waymo_infos_train.pkl', + data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'), + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + metainfo=metainfo, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5, + backend_args=backend_args)) val_dataloader = dict( batch_size=4, num_workers=4, @@ -223,6 +251,52 @@ vis_backends = [dict(type='LocalVisBackend')] visualizer = dict( type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') +lr = 1e-5 +# This schedule is mainly used by models on nuScenes dataset +# max_norm=10 is better for SECOND +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='Adam', lr=lr, weight_decay=0.05, betas=(0.9, 0.999)), + clip_grad=dict(max_norm=10, norm_type=2)) +# learning rate +param_scheduler = [ + dict( + type='CosineAnnealingLR', + T_max=1.2, + eta_min=lr * 100, + begin=0, + end=1.2, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=10.8, + eta_min=lr * 1e-4, + begin=1.2, + end=12, + by_epoch=True, + convert_to_iter_based=True), + # momentum scheduler + dict( + type='CosineAnnealingMomentum', + T_max=1.2, + eta_min=0.85, + begin=0, + end=1.2, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingMomentum', + T_max=10.8, + eta_min=0.95, + begin=1.2, + end=12, + by_epoch=True, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=12, val_interval=12) # runtime settings val_cfg = dict() @@ -237,3 +311,4 @@ default_hooks = dict( logger=dict(type='LoggerHook', interval=50), checkpoint=dict(type='CheckpointHook', interval=5)) +custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=11)] diff --git a/projects/DSVT/dsvt/dsvt.py b/projects/DSVT/dsvt/dsvt.py index 3c5405679f..e6f6ceffbe 100644 --- a/projects/DSVT/dsvt/dsvt.py +++ b/projects/DSVT/dsvt/dsvt.py @@ -103,7 +103,11 @@ def loss(self, batch_inputs_dict: Dict[List, torch.Tensor], Returns: dict[str, Tensor]: A dictionary of loss components. """ - pass + pts_feats = self.extract_feat(batch_inputs_dict) + losses = dict() + loss = self.bbox_head.loss(pts_feats, batch_data_samples) + losses.update(loss) + return losses def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]], batch_data_samples: List[Det3DDataSample], diff --git a/projects/DSVT/dsvt/dsvt_head.py b/projects/DSVT/dsvt/dsvt_head.py index 35167919f0..2d7d8d03db 100644 --- a/projects/DSVT/dsvt/dsvt_head.py +++ b/projects/DSVT/dsvt/dsvt_head.py @@ -1,12 +1,15 @@ from typing import Dict, List, Tuple import torch +from mmcv.ops import boxes_iou3d from mmdet.models.utils import multi_apply from mmengine.structures import InstanceData from torch import Tensor from mmdet3d.models import CenterHead from mmdet3d.models.layers import circle_nms, nms_bev +from mmdet3d.models.utils import (clip_sigmoid, draw_heatmap_gaussian, + gaussian_radius) from mmdet3d.registry import MODELS from mmdet3d.structures import Det3DDataSample, xywhr2xyxyr @@ -18,8 +21,16 @@ class DSVTCenterHead(CenterHead): This head adds IoU prediction branch based on the original CenterHead. """ - def __init__(self, *args, **kwargs): + def __init__(self, + *args, + loss_iou=dict( + type='mmdet.L1Loss', reduction='mean', loss_weight=1), + loss_reg_iou=None, + **kwargs): super(DSVTCenterHead, self).__init__(*args, **kwargs) + self.loss_iou = MODELS.build(loss_iou) + self.loss_iou_reg = MODELS.build( + loss_reg_iou) if loss_reg_iou is not None else None def forward_single(self, x: Tensor) -> dict: """Forward function for CenterPoint. @@ -66,7 +77,296 @@ def loss(self, pts_feats: List[Tensor], Returns: dict: Losses of each branch. 
""" - pass + outs = self(pts_feats) + batch_gt_instance_3d = [] + for data_sample in batch_data_samples: + batch_gt_instance_3d.append(data_sample.gt_instances_3d) + losses = self.loss_by_feat(outs, batch_gt_instance_3d) + return losses + + def _decode_all_preds(self, + pred_dict, + point_cloud_range=None, + voxel_size=None): + batch_size, _, H, W = pred_dict['reg'].shape + + batch_center = pred_dict['reg'].permute(0, 2, 3, 1).contiguous().view( + batch_size, H * W, 2) # (B, H, W, 2) + batch_center_z = pred_dict['height'].permute( + 0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1) + batch_dim = pred_dict['dim'].exp().permute( + 0, 2, 3, 1).contiguous().view(batch_size, H * W, 3) # (B, H, W, 3) + batch_rot_cos = pred_dict['rot'][:, 0].unsqueeze(dim=1).permute( + 0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1) + batch_rot_sin = pred_dict['rot'][:, 1].unsqueeze(dim=1).permute( + 0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1) + batch_vel = pred_dict['vel'].permute(0, 2, 3, 1).contiguous().view( + batch_size, H * W, 2) if 'vel' in pred_dict.keys() else None + + angle = torch.atan2(batch_rot_sin, batch_rot_cos) # (B, H*W, 1) + + ys, xs = torch.meshgrid([ + torch.arange( + 0, H, device=batch_center.device, dtype=batch_center.dtype), + torch.arange( + 0, W, device=batch_center.device, dtype=batch_center.dtype) + ]) + ys = ys.view(1, H, W).repeat(batch_size, 1, 1) + xs = xs.view(1, H, W).repeat(batch_size, 1, 1) + xs = xs.view(batch_size, -1, 1) + batch_center[:, :, 0:1] + ys = ys.view(batch_size, -1, 1) + batch_center[:, :, 1:2] + + xs = xs * voxel_size[0] + point_cloud_range[0] + ys = ys * voxel_size[1] + point_cloud_range[1] + + box_part_list = [xs, ys, batch_center_z, batch_dim, angle] + if batch_vel is not None: + box_part_list.append(batch_vel) + + box_preds = torch.cat((box_part_list), + dim=-1).view(batch_size, H, W, -1) + + return box_preds + + def _transpose_and_gather_feat(self, feat, ind): + feat = feat.permute(0, 2, 3, 1).contiguous() + feat = feat.view(feat.size(0), -1, feat.size(3)) + feat = self._gather_feat(feat, ind) + return feat + + def calc_iou_loss(self, iou_preds, batch_box_preds, mask, ind, gt_boxes): + """ + Args: + iou_preds: (batch x 1 x h x w) + batch_box_preds: (batch x (7 or 9) x h x w) + mask: (batch x max_objects) + ind: (batch x max_objects) + gt_boxes: List of batch groundtruth boxes. 
+ + Returns: + """ + if mask.sum() == 0: + return iou_preds.new_zeros((1)) + + mask = mask.bool() + selected_iou_preds = self._transpose_and_gather_feat(iou_preds, + ind)[mask] + + selected_box_preds = self._transpose_and_gather_feat( + batch_box_preds, ind)[mask] + gt_boxes = torch.cat(gt_boxes) + assert gt_boxes.size(0) == selected_box_preds.size(0) + iou_target = boxes_iou3d(selected_box_preds[:, 0:7], gt_boxes[:, 0:7]) + iou_target = torch.diag(iou_target).view(-1) + iou_target = iou_target * 2 - 1 # [0, 1] ==> [-1, 1] + + loss = self.loss_iou(selected_iou_preds.view(-1), iou_target) + return loss + + def calc_iou_reg_loss(self, batch_box_preds, mask, ind, gt_boxes): + if mask.sum() == 0: + return batch_box_preds.new_zeros((1)) + + mask = mask.bool() + + selected_box_preds = self._transpose_and_gather_feat( + batch_box_preds, ind)[mask] + gt_boxes = torch.cat(gt_boxes) + assert gt_boxes.size(0) == selected_box_preds.size(0) + loss = self.loss_iou_reg(selected_box_preds[:, 0:7], gt_boxes[:, 0:7]) + + return loss + + def get_targets( + self, + batch_gt_instances_3d: List[InstanceData], + ) -> Tuple[List[Tensor]]: + """Generate targets. + + How each output is transformed: + + Each nested list is transposed so that all same-index elements in + each sub-list (1, ..., N) become the new sub-lists. + [ [a0, a1, a2, ... ], [b0, b1, b2, ... ], ... ] + ==> [ [a0, b0, ... ], [a1, b1, ... ], [a2, b2, ... ] ] + + The new transposed nested list is converted into a list of N + tensors generated by concatenating tensors in the new sub-lists. + [ tensor0, tensor1, tensor2, ... ] + + Args: + batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of + gt_instances. It usually includes ``bboxes_3d`` and\ + ``labels_3d`` attributes. + + Returns: + Returns: + tuple[list[torch.Tensor]]: Tuple of target including + the following results in order. + + - list[torch.Tensor]: Heatmap scores. + - list[torch.Tensor]: Ground truth boxes. + - list[torch.Tensor]: Indexes indicating the + position of the valid boxes. + - list[torch.Tensor]: Masks indicating which + boxes are valid. + """ + heatmaps, anno_boxes, inds, masks, task_gt_bboxes = multi_apply( + self.get_targets_single, batch_gt_instances_3d) + # Transpose heatmaps + heatmaps = list(map(list, zip(*heatmaps))) + heatmaps = [torch.stack(hms_) for hms_ in heatmaps] + # Transpose anno_boxes + anno_boxes = list(map(list, zip(*anno_boxes))) + anno_boxes = [torch.stack(anno_boxes_) for anno_boxes_ in anno_boxes] + # Transpose inds + inds = list(map(list, zip(*inds))) + inds = [torch.stack(inds_) for inds_ in inds] + # Transpose inds + masks = list(map(list, zip(*masks))) + masks = [torch.stack(masks_) for masks_ in masks] + # Transpose inds + task_gt_bboxes = list(map(list, zip(*task_gt_bboxes))) + return heatmaps, anno_boxes, inds, masks, task_gt_bboxes + + def get_targets_single(self, + gt_instances_3d: InstanceData) -> Tuple[Tensor]: + """Generate training targets for a single sample. + + Args: + gt_instances_3d (:obj:`InstanceData`): Gt_instances of + single data sample. It usually includes + ``bboxes_3d`` and ``labels_3d`` attributes. + + Returns: + tuple[list[torch.Tensor]]: Tuple of target including + the following results in order. + + - list[torch.Tensor]: Heatmap scores. + - list[torch.Tensor]: Ground truth boxes. + - list[torch.Tensor]: Indexes indicating the position + of the valid boxes. + - list[torch.Tensor]: Masks indicating which boxes + are valid. 
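+        Note:
+            Predictions are gathered at the ``ind`` positions and filtered
+            by ``mask`` so that they align one-to-one with the
+            concatenated ``gt_boxes`` before the IoU target is computed.
+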
+ """ + gt_labels_3d = gt_instances_3d.labels_3d + gt_bboxes_3d = gt_instances_3d.bboxes_3d + device = gt_labels_3d.device + gt_bboxes_3d = torch.cat( + (gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]), + dim=1).to(device) + max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg'] + grid_size = torch.tensor(self.train_cfg['grid_size']).to(device) + pc_range = torch.tensor(self.train_cfg['point_cloud_range']) + voxel_size = torch.tensor(self.train_cfg['voxel_size']) + + feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] + + # reorganize the gt_dict by tasks + task_masks = [] + flag = 0 + for class_name in self.class_names: + task_masks.append([ + torch.where(gt_labels_3d == class_name.index(i) + flag) + for i in class_name + ]) + flag += len(class_name) + + task_boxes = [] + task_classes = [] + flag2 = 0 + for idx, mask in enumerate(task_masks): + task_box = [] + task_class = [] + for m in mask: + task_box.append(gt_bboxes_3d[m]) + # 0 is background for each task, so we need to add 1 here. + task_class.append(gt_labels_3d[m] + 1 - flag2) + task_boxes.append(torch.cat(task_box, axis=0).to(device)) + task_classes.append(torch.cat(task_class).long().to(device)) + flag2 += len(mask) + draw_gaussian = draw_heatmap_gaussian + heatmaps, anno_boxes, inds, masks = [], [], [], [] + + for idx, task_head in enumerate(self.task_heads): + heatmap = gt_bboxes_3d.new_zeros( + (len(self.class_names[idx]), feature_map_size[1], + feature_map_size[0])) + + anno_box = gt_bboxes_3d.new_zeros((max_objs, 8), + dtype=torch.float32) + + ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64) + mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8) + + num_objs = min(task_boxes[idx].shape[0], max_objs) + + for k in range(num_objs): + cls_id = task_classes[idx][k] - 1 + + length = task_boxes[idx][k][3] + width = task_boxes[idx][k][4] + length = length / voxel_size[0] / self.train_cfg[ + 'out_size_factor'] + width = width / voxel_size[1] / self.train_cfg[ + 'out_size_factor'] + + if width > 0 and length > 0: + radius = gaussian_radius( + (width, length), + min_overlap=self.train_cfg['gaussian_overlap']) + radius = max(self.train_cfg['min_radius'], int(radius)) + + # be really careful for the coordinate system of + # your box annotation. 
+ x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][ + 1], task_boxes[idx][k][2] + + coor_x = ( + x - pc_range[0] + ) / voxel_size[0] / self.train_cfg['out_size_factor'] + coor_y = ( + y - pc_range[1] + ) / voxel_size[1] / self.train_cfg['out_size_factor'] + + center = torch.tensor([coor_x, coor_y], + dtype=torch.float32, + device=device) + center_int = center.to(torch.int32) + + # throw out not in range objects to avoid out of array + # area when creating the heatmap + if not (0 <= center_int[0] < feature_map_size[0] + and 0 <= center_int[1] < feature_map_size[1]): + continue + + draw_gaussian(heatmap[cls_id], center_int, radius) + + new_idx = k + x, y = center_int[0], center_int[1] + + assert (y * feature_map_size[0] + x < + feature_map_size[0] * feature_map_size[1]) + + ind[new_idx] = y * feature_map_size[0] + x + mask[new_idx] = 1 + # TODO: support other outdoor dataset + rot = task_boxes[idx][k][6] + box_dim = task_boxes[idx][k][3:6] + if self.norm_bbox: + box_dim = box_dim.log() + anno_box[new_idx] = torch.cat([ + center - torch.tensor([x, y], device=device), + z.unsqueeze(0), box_dim, + torch.sin(rot).unsqueeze(0), + torch.cos(rot).unsqueeze(0) + ]) + + heatmaps.append(heatmap) + anno_boxes.append(anno_box) + masks.append(mask) + inds.append(ind) + return heatmaps, anno_boxes, inds, masks, task_boxes def loss_by_feat(self, preds_dicts: Tuple[List[dict]], batch_gt_instances_3d: List[InstanceData], *args, @@ -79,13 +379,72 @@ def loss_by_feat(self, preds_dicts: Tuple[List[dict]], tasks head, and the internal list indicate different FPN level. batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of - gt_instances. It usually includes ``bboxes_3d`` and\ + gt_instances. It usually includes ``bboxes_3d`` and ``labels_3d`` attributes. Returns: dict[str,torch.Tensor]: Loss of heatmap and bbox of each task. 
""" - pass + heatmaps, anno_boxes, inds, masks, task_gt_bboxes = self.get_targets( + batch_gt_instances_3d) + loss_dict = dict() + for task_id, preds_dict in enumerate(preds_dicts): + # heatmap focal loss + preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap']) + num_pos = heatmaps[task_id].eq(1).float().sum().item() + loss_heatmap = self.loss_cls( + preds_dict[0]['heatmap'], + heatmaps[task_id], + avg_factor=max(num_pos, 1)) + target_box = anno_boxes[task_id] + # reconstruct the anno_box from multiple reg heads + preds_dict[0]['anno_box'] = torch.cat( + (preds_dict[0]['reg'], preds_dict[0]['height'], + preds_dict[0]['dim'], preds_dict[0]['rot']), + dim=1) + + # Regression loss for dimension, offset, height, rotation + ind = inds[task_id] + num = masks[task_id].float().sum() + pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous() + pred = pred.view(pred.size(0), -1, pred.size(3)) + pred = self._gather_feat(pred, ind) + mask = masks[task_id].unsqueeze(2).expand_as(target_box).float() + isnotnan = (~torch.isnan(target_box)).float() + mask *= isnotnan + + code_weights = self.train_cfg.get('code_weights', None) + bbox_weights = mask * mask.new_tensor(code_weights) + loss_bbox = self.loss_bbox( + pred, target_box, bbox_weights, avg_factor=(num + 1e-4)) + loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap + loss_dict[f'task{task_id}.loss_bbox'] = loss_bbox + + if 'iou' in preds_dict[0]: + batch_box_preds = self._decode_all_preds( + pred_dict=preds_dict[0], + point_cloud_range=self.train_cfg['point_cloud_range'], + voxel_size=self.train_cfg['voxel_size'] + ) # (B, H, W, 7 or 9) + + batch_box_preds_for_iou = batch_box_preds.permute( + 0, 3, 1, 2) # (B, 7 or 9, H, W) + loss_dict[f'task{task_id}.loss_iou'] = self.calc_iou_loss( + iou_preds=preds_dict[0]['iou'], + batch_box_preds=batch_box_preds_for_iou.clone().detach(), + mask=mask.all(dim=-1), + ind=ind, + gt_boxes=task_gt_bboxes[task_id]) + + if self.loss_iou_reg is not None: + loss_dict[f'task{task_id}.loss_reg_iou'] = \ + self.calc_iou_reg_loss( + batch_box_preds=batch_box_preds_for_iou, + mask=mask.all(dim=-1), + ind=ind, + gt_boxes=task_gt_bboxes[task_id]) + + return loss_dict def predict(self, pts_feats: Tuple[torch.Tensor], diff --git a/projects/DSVT/dsvt/utils.py b/projects/DSVT/dsvt/utils.py index 7c40383ce7..4d697d2dbb 100644 --- a/projects/DSVT/dsvt/utils.py +++ b/projects/DSVT/dsvt/utils.py @@ -3,10 +3,11 @@ import numpy as np import torch import torch.nn as nn +from mmdet.models.losses.utils import weighted_loss from torch import Tensor from mmdet3d.models.task_modules import CenterPointBBoxCoder -from mmdet3d.registry import TASK_UTILS +from mmdet3d.registry import MODELS, TASK_UTILS from .ops.ingroup_inds.ingroup_inds_op import ingroup_inds get_inner_win_inds_cuda = ingroup_inds @@ -298,3 +299,141 @@ def decode(self, 'support post_center_range is not None for now!') return predictions_dicts + + +def center_to_corner2d(center, dim): + corners_norm = torch.tensor( + [[-0.5, -0.5], [-0.5, 0.5], [0.5, 0.5], [0.5, -0.5]], + device=dim.device).type_as(center) # (4, 2) + corners = dim.view([-1, 1, 2]) * corners_norm.view([1, 4, 2]) # (N, 4, 2) + corners = corners + center.view(-1, 1, 2) + return corners + + +@weighted_loss +def diou3d_loss(pred_boxes, gt_boxes, eps: float = 1e-7): + """ + https://github.com/agent-sgs/PillarNet/blob/master/det3d/core/utils/center_utils.py # noqa + Args: + pred_boxes (N, 7): + gt_boxes (N, 7): + + Returns: + _type_: _description_ + """ + assert pred_boxes.shape[0] == 
gt_boxes.shape[0] + + qcorners = center_to_corner2d(pred_boxes[:, :2], + pred_boxes[:, 3:5]) # (N, 4, 2) + gcorners = center_to_corner2d(gt_boxes[:, :2], gt_boxes[:, + 3:5]) # (N, 4, 2) + + inter_max_xy = torch.minimum(qcorners[:, 2], gcorners[:, 2]) + inter_min_xy = torch.maximum(qcorners[:, 0], gcorners[:, 0]) + out_max_xy = torch.maximum(qcorners[:, 2], gcorners[:, 2]) + out_min_xy = torch.minimum(qcorners[:, 0], gcorners[:, 0]) + + # calculate area + volume_pred_boxes = pred_boxes[:, 3] * pred_boxes[:, 4] * pred_boxes[:, 5] + volume_gt_boxes = gt_boxes[:, 3] * gt_boxes[:, 4] * gt_boxes[:, 5] + + inter_h = torch.minimum( + pred_boxes[:, 2] + 0.5 * pred_boxes[:, 5], + gt_boxes[:, 2] + 0.5 * gt_boxes[:, 5]) - torch.maximum( + pred_boxes[:, 2] - 0.5 * pred_boxes[:, 5], + gt_boxes[:, 2] - 0.5 * gt_boxes[:, 5]) + inter_h = torch.clamp(inter_h, min=0) + + inter = torch.clamp((inter_max_xy - inter_min_xy), min=0) + volume_inter = inter[:, 0] * inter[:, 1] * inter_h + volume_union = volume_gt_boxes + volume_pred_boxes - volume_inter + eps + + # boxes_iou3d_gpu(pred_boxes, gt_boxes) + inter_diag = torch.pow(gt_boxes[:, 0:3] - pred_boxes[:, 0:3], 2).sum(-1) + + outer_h = torch.maximum( + gt_boxes[:, 2] + 0.5 * gt_boxes[:, 5], + pred_boxes[:, 2] + 0.5 * pred_boxes[:, 5]) - torch.minimum( + gt_boxes[:, 2] - 0.5 * gt_boxes[:, 5], + pred_boxes[:, 2] - 0.5 * pred_boxes[:, 5]) + outer_h = torch.clamp(outer_h, min=0) + outer = torch.clamp((out_max_xy - out_min_xy), min=0) + outer_diag = outer[:, 0]**2 + outer[:, 1]**2 + outer_h**2 + eps + + dious = volume_inter / volume_union - inter_diag / outer_diag + dious = torch.clamp(dious, min=-1.0, max=1.0) + + loss = 1 - dious + + return loss + + +@MODELS.register_module() +class DIoU3DLoss(nn.Module): + r"""3D bboxes Implementation of `Distance-IoU Loss: Faster and Better + Learning for Bounding Box Regression https://arxiv.org/abs/1911.08287`_. + + Code is modified from https://github.com/Zzh-tju/DIoU. + + Args: + eps (float): Epsilon to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + """ + + def __init__(self, + eps: float = 1e-6, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + reduction_override (Optional[str], optional): The reduction method + used to override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Returns: + Tensor: Loss tensor. 
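+    # DIoU = IoU - d^2 / c^2, where d is the distance between the two box
+    # centers and c the diagonal of the smallest enclosing box. The BEV
+    # corners below carry no heading, so the planar overlap is computed
+    # axis-aligned, following the PillarNet reference implementation.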
+ """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * diou3d_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss From 7fecc0f2e5e98507a4c6486cff652ea2fe01541b Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Wed, 9 Aug 2023 13:33:27 +0800 Subject: [PATCH 02/25] fix batch_size --- convert_ckpt.py | 34 +++++++++++++++++++ mmdet3d/engine/hooks/visualization_hook.py | 4 +-- mmdet3d/structures/points/base_points.py | 6 ++-- ...ecfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py | 20 ++--------- 4 files changed, 42 insertions(+), 22 deletions(-) create mode 100644 convert_ckpt.py diff --git a/convert_ckpt.py b/convert_ckpt.py new file mode 100644 index 0000000000..f5c3c9317e --- /dev/null +++ b/convert_ckpt.py @@ -0,0 +1,34 @@ +# from mmengine.config import Config +# cfg = Config.fromfile(path) +# print(cfg) +import torch + +mm3d_model = torch.load('checkpoints/dsvt_convert.pth') +dsvt_model = dict() +dsvt_model['model_state'] = dict() +for k, v in mm3d_model.items(): + if 'voxel_encoder' in k: + k = k.replace('voxel_encoder', 'vfe') + if 'middle_encoder' in k: + k = k.replace('middle_encoder', 'backbone_3d') + if 'backbone.' in k: + k = k.replace('backbone', 'backbone_2d') + if 'neck' in k: + k = k.replace('neck', 'backbone_2d') + if 'bbox_head.shared_conv' in k: + k = k.replace('bbox_head.shared_conv.conv', 'dense_head.shared_conv.0') + k = k.replace('bbox_head.shared_conv.bn', 'dense_head.shared_conv.1') + if 'bbox_head.task_heads' in k: + k = k.replace('bbox_head.task_heads', 'dense_head.heads_list') + if 'reg' in k: + k = k.replace('reg', 'center') + if 'height' in k: + k = k.replace('height', 'center_z') + if 'heatmap' in k: + k = k.replace('heatmap', 'hm') + if '0.conv' in k: + k = k.replace('0.conv', '0.0') + if '0.bn' in k: + k = k.replace('0.bn', '0.1') + dsvt_model['model_state'][k] = v +torch.save(dsvt_model, 'dsvt_ckpt.pth') diff --git a/mmdet3d/engine/hooks/visualization_hook.py b/mmdet3d/engine/hooks/visualization_hook.py index ffec1addc3..9de46d9692 100644 --- a/mmdet3d/engine/hooks/visualization_hook.py +++ b/mmdet3d/engine/hooks/visualization_hook.py @@ -78,11 +78,11 @@ def __init__(self, 'needs to be excluded.') self.vis_task = vis_task - if wait_time == -1: + if show and wait_time == -1: print_log( 'Manual control mode, press [Right] to next sample.', logger='current') - else: + elif show: print_log( 'Autoplay mode, press [SPACE] to pause.', logger='current') self.wait_time = wait_time diff --git a/mmdet3d/structures/points/base_points.py b/mmdet3d/structures/points/base_points.py index 4cb54ce895..188d20d270 100644 --- a/mmdet3d/structures/points/base_points.py +++ b/mmdet3d/structures/points/base_points.py @@ -247,10 +247,10 @@ def in_range_3d( """ in_range_flags = ((self.tensor[:, 0] > point_range[0]) & (self.tensor[:, 1] > point_range[1]) - & (self.tensor[:, 2] > point_range[2]) + # & (self.tensor[:, 2] > point_range[2]) & (self.tensor[:, 0] < point_range[3]) - & (self.tensor[:, 1] < 
point_range[4]) - & (self.tensor[:, 2] < point_range[5])) + & (self.tensor[:, 1] < point_range[4])) + # & (self.tensor[:, 2] < point_range[5])) return in_range_flags @property diff --git a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py index f9288f09e3..0ac5c66c48 100644 --- a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py +++ b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py @@ -180,21 +180,7 @@ norm_intensity=True, norm_elongation=True, backend_args=backend_args), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1333, 800), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict( - type='GlobalRotScaleTrans', - rot_range=[0, 0], - scale_ratio_range=[1., 1.], - translation_std=[0, 0, 0]), - dict(type='RandomFlip3D'), - dict( - type='PointsRangeFilter', point_cloud_range=point_cloud_range) - ]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='Pack3DDetInputs', keys=['points']) ] @@ -220,8 +206,8 @@ load_interval=5, backend_args=backend_args)) val_dataloader = dict( - batch_size=4, - num_workers=4, + batch_size=1, + num_workers=1, persistent_workers=True, drop_last=False, sampler=dict(type='DefaultSampler', shuffle=False), From 232c3b89132327eddb72f9d09a834771593b7031 Mon Sep 17 00:00:00 2001 From: sjh Date: Thu, 10 Aug 2023 14:57:59 +0800 Subject: [PATCH 03/25] chage cam to lidar --- .gitignore | 4 ++ mmdet3d/datasets/waymo_dataset.py | 14 +++-- ...ecfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py | 9 +++ tools/create_data.py | 40 ++++++------- tools/dataset_converters/kitti_converter.py | 57 ++++++++++--------- tools/dataset_converters/kitti_data_utils.py | 7 ++- .../dataset_converters/update_infos_to_v2.py | 24 ++++---- tools/dataset_converters/waymo_converter.py | 53 +++++++++-------- 8 files changed, 119 insertions(+), 89 deletions(-) diff --git a/.gitignore b/.gitignore index 27cb9c7cb4..36817b21a6 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,7 @@ data/sunrgbd/OFFICIAL_SUNRGBD/ # Waymo evaluation mmdet3d/evaluation/functional/waymo_utils/compute_detection_metrics_main mmdet3d/evaluation/functional/waymo_utils/compute_detection_let_metrics_main +mmdet3d/evaluation/functional/waymo_utils/compute_segmentation_metrics_main + +check_waymo.py +remove_infos.py diff --git a/mmdet3d/datasets/waymo_dataset.py b/mmdet3d/datasets/waymo_dataset.py index 5b3a83824e..243630d006 100644 --- a/mmdet3d/datasets/waymo_dataset.py +++ b/mmdet3d/datasets/waymo_dataset.py @@ -5,7 +5,7 @@ import numpy as np from mmdet3d.registry import DATASETS -from mmdet3d.structures import CameraInstance3DBoxes +from mmdet3d.structures import LiDARInstance3DBoxes from .det3d_dataset import Det3DDataset from .kitti_dataset import KittiDataset @@ -165,11 +165,13 @@ def parse_ann_info(self, info: dict) -> dict: # in waymo, lidar2cam = R0_rect @ Tr_velo_to_cam # convert gt_bboxes_3d to velodyne coordinates with `lidar2cam` - lidar2cam = np.array(info['images'][self.default_cam_key]['lidar2cam']) - gt_bboxes_3d = CameraInstance3DBoxes( - ann_info['gt_bboxes_3d']).convert_to(self.box_mode_3d, - np.linalg.inv(lidar2cam)) - ann_info['gt_bboxes_3d'] = gt_bboxes_3d + # lidar2cam = np.array(info['images'][self.default_cam_key] + # ['lidar2cam']) + # gt_bboxes_3d = CameraInstance3DBoxes( + # ann_info['gt_bboxes_3d']).convert_to(self.box_mode_3d, + # np.linalg.inv(lidar2cam)) 
+ # ann_info['gt_bboxes_3d'] = gt_bboxes_3d + gt_bboxes_3d = LiDARInstance3DBoxes(ann_info['gt_bboxes_3d']) anns_results = dict( gt_bboxes_3d=gt_bboxes_3d, diff --git a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py index ac7a38b9bc..9132b45cb6 100644 --- a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py +++ b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py @@ -5,6 +5,7 @@ voxel_size = [0.32, 0.32, 6] grid_size = [468, 468, 1] point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4.0] +# data_root = 'data/waymo_mini/kitti_format/' data_root = 'data/waymo/kitti_format/' class_names = ['Car', 'Pedestrian', 'Cyclist'] metainfo = dict(classes=class_names) @@ -210,6 +211,14 @@ backend_args=backend_args)) test_dataloader = val_dataloader +# val_evaluator = dict( +# type='WaymoMetric', +# ann_file='./data/waymo_mini/kitti_format/waymo_infos_val.pkl', +# waymo_bin_file='./data/waymo_mini/waymo_format/gt_mini.bin', +# data_root='./data/waymo_mini/waymo_format', +# backend_args=backend_args, +# convert_kitti_format=False, +# idx2metainfo='./data/waymo_mini/waymo_format/idx2metainfo.pkl') val_evaluator = dict( type='WaymoMetric', ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', diff --git a/tools/create_data.py b/tools/create_data.py index 34356c2a8f..eff511ef4a 100644 --- a/tools/create_data.py +++ b/tools/create_data.py @@ -7,8 +7,10 @@ from tools.dataset_converters import lyft_converter as lyft_converter from tools.dataset_converters import nuscenes_converter as nuscenes_converter from tools.dataset_converters import semantickitti_converter -from tools.dataset_converters.create_gt_database import ( - GTDatabaseCreater, create_groundtruth_database) +# from tools.dataset_converters.create_gt_database import ( +# GTDatabaseCreater, create_groundtruth_database) +from tools.dataset_converters.create_gt_database import \ + create_groundtruth_database from tools.dataset_converters.update_infos_to_v2 import update_pkl_infos @@ -185,9 +187,7 @@ def waymo_data_prep(root_path, """ from tools.dataset_converters import waymo_converter as waymo - splits = [ - 'training', 'validation', 'testing', 'testing_3d_camera_only_detection' - ] + splits = ['training', 'validation'] for i, split in enumerate(splits): load_dir = osp.join(root_path, 'waymo_format', split) if split == 'validation': @@ -203,29 +203,29 @@ def waymo_data_prep(root_path, in ['testing', 'testing_3d_camera_only_detection'])) converter.convert() - from tools.dataset_converters.waymo_converter import \ - create_ImageSets_img_ids - create_ImageSets_img_ids(osp.join(out_dir, 'kitti_format'), splits) + # from tools.dataset_converters.waymo_converter import \ + # create_ImageSets_img_ids + # create_ImageSets_img_ids(osp.join(out_dir, 'kitti_format'), splits) # Generate waymo infos out_dir = osp.join(out_dir, 'kitti_format') - kitti.create_waymo_info_file( - out_dir, info_prefix, max_sweeps=max_sweeps, workers=workers) + # kitti.create_waymo_info_file( + # out_dir, info_prefix, max_sweeps=max_sweeps, workers=workers) info_train_path = osp.join(out_dir, f'{info_prefix}_infos_train.pkl') info_val_path = osp.join(out_dir, f'{info_prefix}_infos_val.pkl') info_trainval_path = osp.join(out_dir, f'{info_prefix}_infos_trainval.pkl') - info_test_path = osp.join(out_dir, f'{info_prefix}_infos_test.pkl') + # info_test_path = 
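+        # BC-breaking: the updated info files already store Waymo boxes in
+        # LiDAR coordinates, so they are wrapped directly instead of being
+        # converted from the front camera frame as above.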
osp.join(out_dir, f'{info_prefix}_infos_test.pkl') update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_train_path) update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_val_path) update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_trainval_path) - update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_test_path) - GTDatabaseCreater( - 'WaymoDataset', - out_dir, - info_prefix, - f'{info_prefix}_infos_train.pkl', - relative_path=False, - with_mask=False, - num_worker=workers).create() + # update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_test_path) + # GTDatabaseCreater( + # 'WaymoDataset', + # out_dir, + # info_prefix, + # f'{info_prefix}_infos_train.pkl', + # relative_path=False, + # with_mask=False, + # num_worker=workers).create() def semantickitti_data_prep(info_prefix, out_dir): diff --git a/tools/dataset_converters/kitti_converter.py b/tools/dataset_converters/kitti_converter.py index 367cfd7ba9..68a84329ab 100644 --- a/tools/dataset_converters/kitti_converter.py +++ b/tools/dataset_converters/kitti_converter.py @@ -109,8 +109,9 @@ def calculate_single(self, info): return info def calculate(self, infos): - ret_infos = mmengine.track_parallel_progress(self.calculate_single, - infos, self.num_worker) + # ret_infos = mmengine.track_parallel_progress(self.calculate_single, + # infos, self.num_worker) + ret_infos = [self.calculate_single(info) for info in infos] for i, ret_info in enumerate(ret_infos): infos[i] = ret_info @@ -252,7 +253,7 @@ def create_waymo_info_file(data_path, imageset_folder = Path(data_path) / 'ImageSets' train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt')) val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt')) - test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt')) + # test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt')) print('Generate info. 
this may take several minutes.') if save_path is None: @@ -268,40 +269,40 @@ def create_waymo_info_file(data_path, relative_path=relative_path, max_sweeps=max_sweeps, num_worker=workers) - waymo_infos_gatherer_test = WaymoInfoGatherer( - data_path, - training=False, - label_info=False, - velodyne=True, - calib=True, - pose=True, - relative_path=relative_path, - max_sweeps=max_sweeps, - num_worker=workers) - num_points_in_gt_calculater = _NumPointsInGTCalculater( - data_path, - relative_path, - num_features=6, - remove_outside=False, - num_worker=workers) + # waymo_infos_gatherer_test = WaymoInfoGatherer( + # data_path, + # training=False, + # label_info=False, + # velodyne=True, + # calib=True, + # pose=True, + # relative_path=relative_path, + # max_sweeps=max_sweeps, + # num_worker=workers) + # num_points_in_gt_calculater = _NumPointsInGTCalculater( + # data_path, + # relative_path, + # num_features=6, + # remove_outside=False, + # num_worker=workers) waymo_infos_train = waymo_infos_gatherer_trainval.gather(train_img_ids) - num_points_in_gt_calculater.calculate(waymo_infos_train) + # num_points_in_gt_calculater.calculate(waymo_infos_train) filename = save_path / f'{pkl_prefix}_infos_train.pkl' print(f'Waymo info train file is saved to {filename}') mmengine.dump(waymo_infos_train, filename) waymo_infos_val = waymo_infos_gatherer_trainval.gather(val_img_ids) - num_points_in_gt_calculater.calculate(waymo_infos_val) + # num_points_in_gt_calculater.calculate(waymo_infos_val) filename = save_path / f'{pkl_prefix}_infos_val.pkl' print(f'Waymo info val file is saved to {filename}') mmengine.dump(waymo_infos_val, filename) - filename = save_path / f'{pkl_prefix}_infos_trainval.pkl' - print(f'Waymo info trainval file is saved to {filename}') - mmengine.dump(waymo_infos_train + waymo_infos_val, filename) - waymo_infos_test = waymo_infos_gatherer_test.gather(test_img_ids) - filename = save_path / f'{pkl_prefix}_infos_test.pkl' - print(f'Waymo info test file is saved to {filename}') - mmengine.dump(waymo_infos_test, filename) + # filename = save_path / f'{pkl_prefix}_infos_trainval.pkl' + # print(f'Waymo info trainval file is saved to {filename}') + # mmengine.dump(waymo_infos_train + waymo_infos_val, filename) + # waymo_infos_test = waymo_infos_gatherer_test.gather(test_img_ids) + # filename = save_path / f'{pkl_prefix}_infos_test.pkl' + # print(f'Waymo info test file is saved to {filename}') + # mmengine.dump(waymo_infos_test, filename) def _create_reduced_point_cloud(data_path, diff --git a/tools/dataset_converters/kitti_data_utils.py b/tools/dataset_converters/kitti_data_utils.py index 64c3bc415b..b0e90bd8d3 100644 --- a/tools/dataset_converters/kitti_data_utils.py +++ b/tools/dataset_converters/kitti_data_utils.py @@ -148,10 +148,14 @@ def get_label_anno(label_path): for x in content]).reshape(-1, 3) annotations['rotation_y'] = np.array([float(x[14]) for x in content]).reshape(-1) - if len(content) != 0 and len(content[0]) == 16: # have score + if len(content) != 0 and len(content[0]) >= 16: # have score annotations['score'] = np.array([float(x[15]) for x in content]) else: annotations['score'] = np.zeros((annotations['bbox'].shape[0], )) + # have num_lidar_points_in_box, given in waymo + if len(content) != 0 and len(content[0]) == 17: + annotations['num_points_in_gt'] = np.array( + [int(x[16]) for x in content]) index = list(range(num_objects)) + [-1] * (num_gt - num_objects) annotations['index'] = np.array(index, dtype=np.int32) annotations['group_ids'] = np.arange(num_gt, dtype=np.int32) @@ 
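+    # NOTE: only the train/val splits are gathered in this setup; the
+    # test-split gathering above is commented out.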
-552,6 +556,7 @@ def gather(self, image_ids): image_infos = mmengine.track_parallel_progress(self.gather_single, image_ids, self.num_worker) + # image_infos = mmengine.track_progress(self.gather_single, image_ids) return list(image_infos) diff --git a/tools/dataset_converters/update_infos_to_v2.py b/tools/dataset_converters/update_infos_to_v2.py index a2ddbd0688..42d0f5e8ba 100644 --- a/tools/dataset_converters/update_infos_to_v2.py +++ b/tools/dataset_converters/update_infos_to_v2.py @@ -923,8 +923,8 @@ def update_waymo_infos(pkl_path, out_dir): base_img_path = Path(ori_info_dict['image']['image_path']).name for cam_idx, cam_key in enumerate(camera_types): - temp_data_info['images'][cam_key]['timestamp'] = ori_info_dict[ - 'timestamp'] + # temp_data_info['images'][cam_key]['timestamp'] = ori_info_dict[ + # 'timestamp'] temp_data_info['images'][cam_key]['img_path'] = base_img_path h, w = ori_info_dict['image']['image_shape'] @@ -934,16 +934,16 @@ def update_waymo_infos(pkl_path, out_dir): temp_data_info['images'][camera_types[0]]['width'] = w temp_data_info['lidar_points']['num_pts_feats'] = ori_info_dict[ 'point_cloud']['num_features'] - temp_data_info['lidar_points']['timestamp'] = ori_info_dict[ - 'timestamp'] + # temp_data_info['lidar_points']['timestamp'] = ori_info_dict[ + # 'timestamp'] velo_path = ori_info_dict['point_cloud'].get('velodyne_path') if velo_path is not None: temp_data_info['lidar_points']['lidar_path'] = Path(velo_path).name # TODO discuss the usage of Tr_velo_to_cam in lidar - Trv2c = ori_info_dict['calib']['Tr_velo_to_cam'].astype(np.float32) + # Trv2c = ori_info_dict['calib']['Tr_velo_to_cam'].astype(np.float32) - temp_data_info['lidar_points']['Tr_velo_to_cam'] = Trv2c.tolist() + # temp_data_info['lidar_points']['Tr_velo_to_cam'] = Trv2c.tolist() # for potential usage # temp_data_info['images']['R0_rect'] = ori_info_dict['calib'][ @@ -951,7 +951,8 @@ def update_waymo_infos(pkl_path, out_dir): # for the sweeps part: temp_data_info['timestamp'] = ori_info_dict['timestamp'] - temp_data_info['ego2global'] = ori_info_dict['pose'] + temp_data_info['ego2global'] = ori_info_dict['pose'].astype( + np.float32).tolist() for ori_sweep in ori_info_dict['sweeps']: # lidar sweeps @@ -961,6 +962,8 @@ def update_waymo_infos(pkl_path, out_dir): lidar_sweep['lidar_points']['lidar_path'] = Path( ori_sweep['velodyne_path']).name # image sweeps + # image_sweep = dict() + # image_sweep['sample_idx']= ori_sweep['image']['image_idx'] image_sweep = get_single_image_sweep(camera_types) image_sweep['ego2global'] = ori_sweep['pose'] image_sweep['timestamp'] = ori_sweep['timestamp'] @@ -1059,12 +1062,13 @@ def update_waymo_infos(pkl_path, out_dir): instance_list.append(empty_instance) temp_data_info['cam_sync_instances'] = instance_list - cam_instances = generate_waymo_camera_instances( - ori_info_dict, camera_types) - temp_data_info['cam_instances'] = cam_instances + # cam_instances = generate_waymo_camera_instances( + # ori_info_dict, camera_types) + # temp_data_info['cam_instances'] = cam_instances temp_data_info, _ = clear_data_info_unused_keys(temp_data_info) converted_list.append(temp_data_info) + del data_list pkl_name = Path(pkl_path).name out_path = osp.join(out_dir, pkl_name) print(f'Writing to output file: {out_path}.') diff --git a/tools/dataset_converters/waymo_converter.py b/tools/dataset_converters/waymo_converter.py index 87f9c54b54..edfff10ca3 100644 --- a/tools/dataset_converters/waymo_converter.py +++ b/tools/dataset_converters/waymo_converter.py @@ -109,6 +109,7 @@ def 
__init__(self, def convert(self): """Convert action.""" print('Start converting ...') + # mmengine.track_progress(self.convert_one, range(len(self))) mmengine.track_parallel_progress(self.convert_one, range(len(self)), self.workers) print('\nFinished ...') @@ -131,11 +132,11 @@ def convert_one(self, file_idx): not in self.selected_waymo_locations): continue - self.save_image(frame, file_idx, frame_idx) - self.save_calib(frame, file_idx, frame_idx) - self.save_lidar(frame, file_idx, frame_idx) - self.save_pose(frame, file_idx, frame_idx) - self.save_timestamp(frame, file_idx, frame_idx) + # self.save_image(frame, file_idx, frame_idx) + # self.save_calib(frame, file_idx, frame_idx) + # self.save_lidar(frame, file_idx, frame_idx) + # self.save_pose(frame, file_idx, frame_idx) + # self.save_timestamp(frame, file_idx, frame_idx) if not self.test_mode: # TODO save the depth image for waymo challenge solution. @@ -354,16 +355,17 @@ def save_label(self, frame, file_idx, frame_idx, cam_sync=False): width = box3d.width length = box3d.length + # TODO: Discuss if we should keep kitti_format x = box3d.center_x y = box3d.center_y z = box3d.center_z - height / 2 - # project bounding box to the virtual reference frame - pt_ref = self.T_velo_to_front_cam @ \ - np.array([x, y, z, 1]).reshape((4, 1)) - x, y, z, _ = pt_ref.flatten().tolist() + # # project bounding box to the virtual reference frame + # pt_ref = self.T_velo_to_front_cam @ \ + # np.array([x, y, z, 1]).reshape((4, 1)) + # x, y, z, _ = pt_ref.flatten().tolist() - rotation_y = -box3d.heading - np.pi / 2 + rotation_y = box3d.heading track_id = obj.id # not available @@ -373,25 +375,28 @@ def save_label(self, frame, file_idx, frame_idx, cam_sync=False): line = my_type + \ ' {} {} {} {} {} {} {} {} {} {} {} {} {} {}\n'.format( - round(truncated, 2), occluded, round(alpha, 2), - round(bounding_box[0], 2), round(bounding_box[1], 2), - round(bounding_box[2], 2), round(bounding_box[3], 2), - round(height, 2), round(width, 2), round(length, 2), - round(x, 2), round(y, 2), round(z, 2), - round(rotation_y, 2)) + truncated, occluded, alpha, + bounding_box[0], bounding_box[1], + bounding_box[2], bounding_box[3], + width, height, length, + x, y, z, rotation_y) if self.save_track_id: line_all = line[:-1] + ' ' + name + ' ' + track_id + '\n' else: line_all = line[:-1] + ' ' + name + '\n' - label_path = f'{self.label_save_dir}{name}/{self.prefix}' + \ - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt' - if cam_sync: - label_path = label_path.replace('label_', 'cam_sync_label_') - fp_label = open(label_path, 'a') - fp_label.write(line) - fp_label.close() + # Save num_lidar_points_in_box + line_all = line_all[:-1] + ' ' + str( + obj.num_lidar_points_in_box) + '\n' + + # label_path = f'{self.label_save_dir}{name}/{self.prefix}' + \ + # f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt' + # if cam_sync: + # label_path = label_path.replace('label_', 'cam_sync_label_') + # fp_label = open(label_path, 'a') + # fp_label.write(line) + # fp_label.close() fp_label_all.write(line_all) @@ -627,6 +632,6 @@ def create_ImageSets_img_ids(root_dir, splits): open(save_dir + 'train.txt', 'w').writelines(idx_all[0]) open(save_dir + 'val.txt', 'w').writelines(idx_all[1]) open(save_dir + 'trainval.txt', 'w').writelines(idx_all[0] + idx_all[1]) - open(save_dir + 'test.txt', 'w').writelines(idx_all[2]) + # open(save_dir + 'test.txt', 'w').writelines(idx_all[2]) # open(save_dir+'test_cam_only.txt','w').writelines(idx_all[3]) print('created txt files indicating what to 
collect in ', splits) From a81529e5ac1a74e831d67c4293f84493ac1be7a4 Mon Sep 17 00:00:00 2001 From: sjh Date: Mon, 14 Aug 2023 14:34:12 +0800 Subject: [PATCH 04/25] add cam instances --- mmdet3d/datasets/convert_utils.py | 17 +++++++++++------ tools/create_data.py | 4 ++-- tools/create_data.sh | 2 ++ tools/dataset_converters/kitti_converter.py | 6 +++--- tools/dataset_converters/update_infos_to_v2.py | 6 +++--- 5 files changed, 21 insertions(+), 14 deletions(-) diff --git a/mmdet3d/datasets/convert_utils.py b/mmdet3d/datasets/convert_utils.py index cb4d97e137..84a5c34fa9 100644 --- a/mmdet3d/datasets/convert_utils.py +++ b/mmdet3d/datasets/convert_utils.py @@ -10,7 +10,7 @@ from shapely.geometry import MultiPoint, box from shapely.geometry.polygon import Polygon -from mmdet3d.structures import Box3DMode, CameraInstance3DBoxes, points_cam2img +from mmdet3d.structures import Box3DMode, LiDARInstance3DBoxes, points_cam2img from mmdet3d.structures.ops import box_np_ops kitti_categories = ('Pedestrian', 'Cyclist', 'Car', 'Van', 'Truck', @@ -318,7 +318,7 @@ def get_kitti_style_2d_boxes(info: dict, def convert_annos(info: dict, cam_idx: int) -> dict: """Convert front-cam anns to i-th camera (KITTI-style info).""" rect = info['calib']['R0_rect'].astype(np.float32) - lidar2cam0 = info['calib']['Tr_velo_to_cam'].astype(np.float32) + # lidar2cam0 = info['calib']['Tr_velo_to_cam'].astype(np.float32) lidar2cami = info['calib'][f'Tr_velo_to_cam{cam_idx}'].astype(np.float32) annos = info['annos'] converted_annos = copy.deepcopy(annos) @@ -328,11 +328,16 @@ def convert_annos(info: dict, cam_idx: int) -> dict: gt_bboxes_3d = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1).astype(np.float32) # convert gt_bboxes_3d to velodyne coordinates - gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ lidar2cam0), correct_yaw=True) + # BC-breaking: gt_bboxes_3d is already in lidar coordinates + # TODO: Discuss correcet_yaw if necessary + # gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d).convert_to( + # Box3DMode.LIDAR, np.linalg.inv(rect @ lidar2cam0), correct_yaw=True) # convert gt_bboxes_3d to cam coordinates - gt_bboxes_3d = gt_bboxes_3d.convert_to( - Box3DMode.CAM, rect @ lidar2cami, correct_yaw=True).numpy() + # gt_bboxes_3d = gt_bboxes_3d.convert_to( + # Box3DMode.CAM, rect @ lidar2cami, correct_yaw=True).numpy() + gt_bboxes_3d = LiDARInstance3DBoxes(gt_bboxes_3d).convert_to( + Box3DMode.CAM, rect @ lidar2cami, correct_yaw=True) + converted_annos['location'] = gt_bboxes_3d[:, :3] converted_annos['dimensions'] = gt_bboxes_3d[:, 3:6] converted_annos['rotation_y'] = gt_bboxes_3d[:, 6] diff --git a/tools/create_data.py b/tools/create_data.py index eff511ef4a..4543756193 100644 --- a/tools/create_data.py +++ b/tools/create_data.py @@ -208,8 +208,8 @@ def waymo_data_prep(root_path, # create_ImageSets_img_ids(osp.join(out_dir, 'kitti_format'), splits) # Generate waymo infos out_dir = osp.join(out_dir, 'kitti_format') - # kitti.create_waymo_info_file( - # out_dir, info_prefix, max_sweeps=max_sweeps, workers=workers) + kitti.create_waymo_info_file( + out_dir, info_prefix, max_sweeps=max_sweeps, workers=workers) info_train_path = osp.join(out_dir, f'{info_prefix}_infos_train.pkl') info_val_path = osp.join(out_dir, f'{info_prefix}_infos_val.pkl') info_trainval_path = osp.join(out_dir, f'{info_prefix}_infos_trainval.pkl') diff --git a/tools/create_data.sh b/tools/create_data.sh index 9a57852f71..9e1fe9c0ce 100755 --- a/tools/create_data.sh +++ 
b/tools/create_data.sh @@ -6,6 +6,7 @@ export PYTHONPATH=`pwd`:$PYTHONPATH PARTITION=$1 JOB_NAME=$2 DATASET=$3 +WORKERS=$4 GPUS=${GPUS:-1} GPUS_PER_NODE=${GPUS_PER_NODE:-1} SRUN_ARGS=${SRUN_ARGS:-""} @@ -21,4 +22,5 @@ srun -p ${PARTITION} \ python -u tools/create_data.py ${DATASET} \ --root-path ./data/${DATASET} \ --out-dir ./data/${DATASET} \ + --workers ${WORKERS} \ --extra-tag ${DATASET} diff --git a/tools/dataset_converters/kitti_converter.py b/tools/dataset_converters/kitti_converter.py index 68a84329ab..57453c5ddc 100644 --- a/tools/dataset_converters/kitti_converter.py +++ b/tools/dataset_converters/kitti_converter.py @@ -296,9 +296,9 @@ def create_waymo_info_file(data_path, filename = save_path / f'{pkl_prefix}_infos_val.pkl' print(f'Waymo info val file is saved to {filename}') mmengine.dump(waymo_infos_val, filename) - # filename = save_path / f'{pkl_prefix}_infos_trainval.pkl' - # print(f'Waymo info trainval file is saved to {filename}') - # mmengine.dump(waymo_infos_train + waymo_infos_val, filename) + filename = save_path / f'{pkl_prefix}_infos_trainval.pkl' + print(f'Waymo info trainval file is saved to {filename}') + mmengine.dump(waymo_infos_train + waymo_infos_val, filename) # waymo_infos_test = waymo_infos_gatherer_test.gather(test_img_ids) # filename = save_path / f'{pkl_prefix}_infos_test.pkl' # print(f'Waymo info test file is saved to {filename}') diff --git a/tools/dataset_converters/update_infos_to_v2.py b/tools/dataset_converters/update_infos_to_v2.py index 42d0f5e8ba..ec4831d80e 100644 --- a/tools/dataset_converters/update_infos_to_v2.py +++ b/tools/dataset_converters/update_infos_to_v2.py @@ -1062,9 +1062,9 @@ def update_waymo_infos(pkl_path, out_dir): instance_list.append(empty_instance) temp_data_info['cam_sync_instances'] = instance_list - # cam_instances = generate_waymo_camera_instances( - # ori_info_dict, camera_types) - # temp_data_info['cam_instances'] = cam_instances + cam_instances = generate_waymo_camera_instances( + ori_info_dict, camera_types) + temp_data_info['cam_instances'] = cam_instances temp_data_info, _ = clear_data_info_unused_keys(temp_data_info) converted_list.append(temp_data_info) From 84d9fc787ec35bba9e2bb6b77d1540569bdd74f6 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Wed, 16 Aug 2023 16:57:01 +0800 Subject: [PATCH 05/25] add cam_instances --- check_coor.py | 24 +++++++++++++ get_gt_mini.py | 11 ++++++ mmdet3d/datasets/convert_utils.py | 8 +++-- ...ecfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py | 30 ++++++++-------- tools/create_data.py | 35 +++++++++++++------ .../dataset_converters/update_infos_to_v2.py | 22 +++++++----- tools/dataset_converters/waymo_converter.py | 8 ++--- 7 files changed, 97 insertions(+), 41 deletions(-) create mode 100644 check_coor.py create mode 100644 get_gt_mini.py diff --git a/check_coor.py b/check_coor.py new file mode 100644 index 0000000000..ed1df35a33 --- /dev/null +++ b/check_coor.py @@ -0,0 +1,24 @@ +import mmengine +import numpy as np + +from mmdet3d.structures import (Box3DMode, CameraInstance3DBoxes, + LiDARInstance3DBoxes) + +val_infos = mmengine.load('data/waymo_mini/kitti_format/waymo_infos_val.pkl') +old_val_infos = mmengine.load( + 'data/waymo_mini/kitti_format/old_pkl/waymo_infos_val.pkl') +instance = np.array( + val_infos['data_list'][0]['cam_sync_instances'][0]['bbox_3d'])[np.newaxis, + ...] 
+instance = LiDARInstance3DBoxes(instance) +lidar2cam = np.array( + val_infos['data_list'][0]['images']['CAM_FRONT']['lidar2cam']) +cam_instance = instance.convert_to(Box3DMode.CAM, lidar2cam) + +old_instance = np.array(old_val_infos['data_list'][0]['cam_sync_instances'][0] + ['bbox_3d'])[np.newaxis, ...] + +old_lidar2cam = np.array( + old_val_infos['data_list'][0]['images']['CAM_FRONT']['lidar2cam']) +old_instance = CameraInstance3DBoxes(old_instance) +pass diff --git a/get_gt_mini.py b/get_gt_mini.py new file mode 100644 index 0000000000..e300324ed4 --- /dev/null +++ b/get_gt_mini.py @@ -0,0 +1,11 @@ +from waymo_open_dataset.protos.metrics_pb2 import Objects + +objects = Objects() +with open('./data/waymo_mini/waymo_format/gt.bin', 'rb') as f: + objects.ParseFromString(bytearray(f.read())) +new = Objects() +for obj in objects.objects: + if obj.context_name == '10203656353524179475_7625_000_7645_000': + new.objects.append(obj) +with open('./data/waymo_mini/waymo_format/gt_mini.bin', 'wb') as f: + f.write(new.SerializeToString()) diff --git a/mmdet3d/datasets/convert_utils.py b/mmdet3d/datasets/convert_utils.py index 84a5c34fa9..9d65369593 100644 --- a/mmdet3d/datasets/convert_utils.py +++ b/mmdet3d/datasets/convert_utils.py @@ -319,7 +319,11 @@ def convert_annos(info: dict, cam_idx: int) -> dict: """Convert front-cam anns to i-th camera (KITTI-style info).""" rect = info['calib']['R0_rect'].astype(np.float32) # lidar2cam0 = info['calib']['Tr_velo_to_cam'].astype(np.float32) - lidar2cami = info['calib'][f'Tr_velo_to_cam{cam_idx}'].astype(np.float32) + if cam_idx == 0: + lidar2cami = info['calib']['Tr_velo_to_cam'].astype(np.float32) + else: + lidar2cami = info['calib'][f'Tr_velo_to_cam{cam_idx}'].astype( + np.float32) annos = info['annos'] converted_annos = copy.deepcopy(annos) loc = annos['location'] @@ -336,7 +340,7 @@ def convert_annos(info: dict, cam_idx: int) -> dict: # gt_bboxes_3d = gt_bboxes_3d.convert_to( # Box3DMode.CAM, rect @ lidar2cami, correct_yaw=True).numpy() gt_bboxes_3d = LiDARInstance3DBoxes(gt_bboxes_3d).convert_to( - Box3DMode.CAM, rect @ lidar2cami, correct_yaw=True) + Box3DMode.CAM, rect @ lidar2cami, correct_yaw=True).numpy() converted_annos['location'] = gt_bboxes_3d[:, :3] converted_annos['dimensions'] = gt_bboxes_3d[:, 3:6] diff --git a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py index 9132b45cb6..1027942d9b 100644 --- a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py +++ b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py @@ -5,8 +5,8 @@ voxel_size = [0.32, 0.32, 6] grid_size = [468, 468, 1] point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4.0] -# data_root = 'data/waymo_mini/kitti_format/' -data_root = 'data/waymo/kitti_format/' +data_root = 'data/waymo_mini/kitti_format/' +# data_root = 'data/waymo/kitti_format/' class_names = ['Car', 'Pedestrian', 'Cyclist'] metainfo = dict(classes=class_names) input_modality = dict(use_lidar=True, use_camera=False) @@ -193,7 +193,7 @@ dataset_type = 'WaymoDataset' val_dataloader = dict( - batch_size=4, + batch_size=1, num_workers=4, persistent_workers=True, drop_last=False, @@ -211,22 +211,22 @@ backend_args=backend_args)) test_dataloader = val_dataloader -# val_evaluator = dict( -# type='WaymoMetric', -# ann_file='./data/waymo_mini/kitti_format/waymo_infos_val.pkl', -# 
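+# Ad-hoc sanity check: wrap the new LiDAR-frame annotation, project it into
+# the camera frame via lidar2cam, and compare against the old camera-frame
+# pkl loaded above.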
waymo_bin_file='./data/waymo_mini/waymo_format/gt_mini.bin', -# data_root='./data/waymo_mini/waymo_format', -# backend_args=backend_args, -# convert_kitti_format=False, -# idx2metainfo='./data/waymo_mini/waymo_format/idx2metainfo.pkl') val_evaluator = dict( type='WaymoMetric', - ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', - waymo_bin_file='./data/waymo/waymo_format/gt.bin', - data_root='./data/waymo/waymo_format', + ann_file='./data/waymo_mini/kitti_format/waymo_infos_val.pkl', + waymo_bin_file='./data/waymo_mini/waymo_format/gt_mini.bin', + data_root='./data/waymo_mini/waymo_format', backend_args=backend_args, convert_kitti_format=False, - idx2metainfo='./data/waymo/waymo_format/idx2metainfo.pkl') + idx2metainfo='./data/waymo_mini/waymo_format/idx2metainfo.pkl') +# val_evaluator = dict( +# type='WaymoMetric', +# ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', +# waymo_bin_file='./data/waymo/waymo_format/gt.bin', +# data_root='./data/waymo/waymo_format', +# backend_args=backend_args, +# convert_kitti_format=False, +# idx2metainfo='./data/waymo/waymo_format/idx2metainfo.pkl') test_evaluator = val_evaluator vis_backends = [dict(type='LocalVisBackend')] diff --git a/tools/create_data.py b/tools/create_data.py index 4543756193..5495d9700a 100644 --- a/tools/create_data.py +++ b/tools/create_data.py @@ -173,7 +173,7 @@ def waymo_data_prep(root_path, version, out_dir, workers, - max_sweeps=5): + max_sweeps=10): """Prepare the info file for waymo dataset. Args: @@ -275,11 +275,16 @@ def semantickitti_data_prep(info_prefix, out_dir): '--only-gt-database', action='store_true', help='Whether to only generate ground truth database.') +parser.add_argument( + '--skip-extract-kitti-format', + action='store_true', + help='''Whether to skip extracting kitti format image and point cloud. 
+ Only used when dataset is waymo!''') args = parser.parse_args() if __name__ == '__main__': - from mmdet3d.utils import register_all_modules - register_all_modules() + from mmengine.registry import init_default_scope + init_default_scope('mmdet3d') if args.dataset == 'kitti': if args.only_gt_database: @@ -334,6 +339,22 @@ def semantickitti_data_prep(info_prefix, out_dir): dataset_name='NuScenesDataset', out_dir=args.out_dir, max_sweeps=args.max_sweeps) + elif args.dataset == 'waymo' and args.version == 'v1.4': + waymo_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=args.version, + out_dir=args.out_dir, + workers=args.workers, + max_sweeps=args.max_sweeps) + elif args.dataset == 'waymo' and args.version == 'v1.4-mini': + waymo_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=args.version, + out_dir=args.out_dir, + workers=args.workers, + max_sweeps=args.max_sweeps) elif args.dataset == 'lyft': train_version = f'{args.version}-train' lyft_data_prep( @@ -347,14 +368,6 @@ def semantickitti_data_prep(info_prefix, out_dir): info_prefix=args.extra_tag, version=test_version, max_sweeps=args.max_sweeps) - elif args.dataset == 'waymo': - waymo_data_prep( - root_path=args.root_path, - info_prefix=args.extra_tag, - version=args.version, - out_dir=args.out_dir, - workers=args.workers, - max_sweeps=args.max_sweeps) elif args.dataset == 'scannet': scannet_data_prep( root_path=args.root_path, diff --git a/tools/dataset_converters/update_infos_to_v2.py b/tools/dataset_converters/update_infos_to_v2.py index ec4831d80e..a779d6b6c7 100644 --- a/tools/dataset_converters/update_infos_to_v2.py +++ b/tools/dataset_converters/update_infos_to_v2.py @@ -961,18 +961,22 @@ def update_waymo_infos(pkl_path, out_dir): lidar_sweep['timestamp'] = ori_sweep['timestamp'] lidar_sweep['lidar_points']['lidar_path'] = Path( ori_sweep['velodyne_path']).name + # delete the num_pts_feats and lidar2ego + del lidar_sweep['lidar_points']['num_pts_feats'] + del lidar_sweep['lidar_points']['lidar2ego'] + # TODO: refactor image_sweeps info when we really need it. 
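# With num_pts_feats and lidar2ego deleted above, each kept sweep entry
# reduces to roughly this shape (placeholder values):
#
#     lidar_sweep = {
#         'timestamp': 1553640000000000,                  # microseconds
#         'ego2global': [[...], [...], [...], [...]],     # 4 x 4 pose
#         'lidar_points': {'lidar_path': '1000000.bin'},
#     }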
# image sweeps # image_sweep = dict() # image_sweep['sample_idx']= ori_sweep['image']['image_idx'] - image_sweep = get_single_image_sweep(camera_types) - image_sweep['ego2global'] = ori_sweep['pose'] - image_sweep['timestamp'] = ori_sweep['timestamp'] - img_path = Path(ori_sweep['image_path']).name - for cam_idx, cam_key in enumerate(camera_types): - image_sweep['images'][cam_key]['img_path'] = img_path + # image_sweep = get_single_image_sweep(camera_types) + # image_sweep['ego2global'] = ori_sweep['pose'] + # image_sweep['timestamp'] = ori_sweep['timestamp'] + # img_path = Path(ori_sweep['image_path']).name + # for cam_idx, cam_key in enumerate(camera_types): + # image_sweep['images'][cam_key]['img_path'] = img_path temp_data_info['lidar_sweeps'].append(lidar_sweep) - temp_data_info['image_sweeps'].append(image_sweep) + # temp_data_info['image_sweeps'].append(image_sweep) anns = ori_info_dict.get('annos', None) ignore_class_name = set() @@ -1107,8 +1111,8 @@ def generate_waymo_camera_instances(ori_info_dict, cam_keys): for cam_idx, cam_key in enumerate(cam_keys): annos = copy.deepcopy(ori_info_dict['cam_sync_annos']) - if cam_idx != 0: - annos = convert_annos(ori_info_dict, cam_idx) + + annos = convert_annos(ori_info_dict, cam_idx) ann_infos = get_kitti_style_2d_boxes( ori_info_dict, cam_idx, occluded=[0], annos=annos, dataset='waymo') diff --git a/tools/dataset_converters/waymo_converter.py b/tools/dataset_converters/waymo_converter.py index edfff10ca3..c20db23685 100644 --- a/tools/dataset_converters/waymo_converter.py +++ b/tools/dataset_converters/waymo_converter.py @@ -92,7 +92,7 @@ def __init__(self, self.tfrecord_pathnames = sorted( glob(join(self.load_dir, '*.tfrecord'))) - self.label_save_dir = f'{self.save_dir}/label_' + # self.label_save_dir = f'{self.save_dir}/label_' self.label_all_save_dir = f'{self.save_dir}/label_all' self.image_save_dir = f'{self.save_dir}/image_' self.calib_save_dir = f'{self.save_dir}/calib' @@ -100,7 +100,7 @@ def __init__(self, self.pose_save_dir = f'{self.save_dir}/pose' self.timestamp_save_dir = f'{self.save_dir}/timestamp' if self.save_cam_sync_labels: - self.cam_sync_label_save_dir = f'{self.save_dir}/cam_sync_label_' + # self.cam_sync_label_save_dir = f'{self.save_dir}/cam_sync_label_' self.cam_sync_label_all_save_dir = \ f'{self.save_dir}/cam_sync_label_all' @@ -450,10 +450,10 @@ def create_folder(self): self.pose_save_dir, self.timestamp_save_dir, ] - dir_list2 = [self.label_save_dir, self.image_save_dir] + dir_list2 = [self.image_save_dir] if self.save_cam_sync_labels: dir_list1.append(self.cam_sync_label_all_save_dir) - dir_list2.append(self.cam_sync_label_save_dir) + # dir_list2.append(self.cam_sync_label_save_dir) else: dir_list1 = [ self.calib_save_dir, self.pose_save_dir, From b7b3b62a47b835e893d5c2226607ea17ce8ddd7f Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Wed, 16 Aug 2023 16:59:38 +0800 Subject: [PATCH 06/25] add description for skip --- tools/create_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/create_data.py b/tools/create_data.py index 5495d9700a..3d07cb6169 100644 --- a/tools/create_data.py +++ b/tools/create_data.py @@ -279,7 +279,7 @@ def semantickitti_data_prep(info_prefix, out_dir): '--skip-extract-kitti-format', action='store_true', help='''Whether to skip extracting kitti format image and point cloud. 
- Only used when dataset is waymo!''') + Only used when dataset is waymo!''') args = parser.parse_args() if __name__ == '__main__': From 1995b5a9c736d5bf02f800c0f29d68e861bfc131 Mon Sep 17 00:00:00 2001 From: sjh Date: Fri, 18 Aug 2023 16:35:32 +0800 Subject: [PATCH 07/25] fix num_ins_per_cat --- mmdet3d/datasets/det3d_dataset.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mmdet3d/datasets/det3d_dataset.py b/mmdet3d/datasets/det3d_dataset.py index 11caae4729..8e1570e794 100644 --- a/mmdet3d/datasets/det3d_dataset.py +++ b/mmdet3d/datasets/det3d_dataset.py @@ -113,7 +113,7 @@ def __init__(self, ori_label = self.METAINFO['classes'].index(name) self.label_mapping[ori_label] = label_idx - self.num_ins_per_cat = {name: 0 for name in metainfo['classes']} + self.num_ins_per_cat = [0] * len(metainfo['classes']) else: self.label_mapping = { i: i @@ -121,10 +121,7 @@ def __init__(self, } self.label_mapping[-1] = -1 - self.num_ins_per_cat = { - name: 0 - for name in self.METAINFO['classes'] - } + self.num_ins_per_cat = [0] * len(self.METAINFO['classes']) super().__init__( ann_file=ann_file, @@ -148,7 +145,8 @@ def __init__(self, print_log('-' * 30, 'current') print_log(f'The length of the dataset: {len(self)}', 'current') content_show = [['category', 'number']] - for cat_name, num in self.num_ins_per_cat.items(): + for label, num in enumerate(self.num_ins_per_cat): + cat_name = self.metainfo['classes'][label] content_show.append([cat_name, num]) table = AsciiTable(content_show) print_log( @@ -256,8 +254,7 @@ def parse_ann_info(self, info: dict) -> Union[dict, None]: for label in ann_info['gt_labels_3d']: if label != -1: - cat_name = self.metainfo['classes'][label] - self.num_ins_per_cat[cat_name] += 1 + self.num_ins_per_cat[label] += 1 return ann_info From a9c7cd3cd85b68d3448380ecdeb13ebe0fb85bb5 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Mon, 21 Aug 2023 23:13:15 +0800 Subject: [PATCH 08/25] refactor waymo create --- mp.py | 43 +++ tools/create_data.py | 63 ++-- tools/dataset_converters/waymo_converter.py | 388 ++++++++++++++++---- 3 files changed, 394 insertions(+), 100 deletions(-) create mode 100644 mp.py diff --git a/mp.py b/mp.py new file mode 100644 index 0000000000..3262dd5eac --- /dev/null +++ b/mp.py @@ -0,0 +1,43 @@ +import multiprocessing +import time + + +class A(object): + + def __init__(self): + self.a = [] + self.b = None + + def get_num_a(self, num): + time.sleep(3) + self.a = [0] + self.get_num_b(num) + print(self.a) + return self.a + + def get_num_b(self, num): + self.a.append(num) + + def sum(self): + print(self.a) + + def run(self): + # p1 = multiprocessing.Process(target=self.get_num_a) + # p2 = multiprocessing.Process(target=self.get_num_b) + # p1.start() + # p2.start() + # p1.join() + # p2.join() + p1 = multiprocessing.Pool(2) + list_a = p1.map(self.get_num_a, [1, 2]) + self.sum() + print(list_a) + + +if __name__ == '__main__': + + t1 = time.time() + a = A() + a.run() + t2 = time.time() + print('cost time :{}'.format(t2 - t1)) diff --git a/tools/create_data.py b/tools/create_data.py index 3d07cb6169..aa0288ae93 100644 --- a/tools/create_data.py +++ b/tools/create_data.py @@ -7,10 +7,8 @@ from tools.dataset_converters import lyft_converter as lyft_converter from tools.dataset_converters import nuscenes_converter as nuscenes_converter from tools.dataset_converters import semantickitti_converter -# from tools.dataset_converters.create_gt_database import ( -# GTDatabaseCreater, create_groundtruth_database) -from 
tools.dataset_converters.create_gt_database import \ - create_groundtruth_database +from tools.dataset_converters.create_gt_database import ( + GTDatabaseCreater, create_groundtruth_database) from tools.dataset_converters.update_infos_to_v2 import update_pkl_infos @@ -174,7 +172,13 @@ def waymo_data_prep(root_path, out_dir, workers, max_sweeps=10): - """Prepare the info file for waymo dataset. + """Prepare waymo dataset. There are 3 steps as follows: Step 1. Extract + camera images and lidar point clouds from waymo raw data in '*.tfreord' and + save as kitti format. Step 2. Generate waymo train/val/test infos and save + as pickle file. Step 3. Generate waymo ground truth database (point clouds + within each 3D bounding box) for data augmentation in training. Steps 1 and + 2 will be done in Waymo2KITTI, and step 3 will be done in + GTDatabaseCreater. Args: root_path (str): Path of dataset root. @@ -184,10 +188,12 @@ def waymo_data_prep(root_path, max_sweeps (int, optional): Number of input consecutive frames. Default: 5. Here we store pose information of these frames for later use. - """ + """ # noqa: E501 from tools.dataset_converters import waymo_converter as waymo - splits = ['training', 'validation'] + splits = [ + 'training', 'validation', 'testing', 'testing_3d_camera_only_detection' + ] for i, split in enumerate(splits): load_dir = osp.join(root_path, 'waymo_format', split) if split == 'validation': @@ -200,32 +206,33 @@ def waymo_data_prep(root_path, prefix=str(i), workers=workers, test_mode=(split - in ['testing', 'testing_3d_camera_only_detection'])) + in ['testing', 'testing_3d_camera_only_detection']), + info_prefix=info_prefix) converter.convert() - # from tools.dataset_converters.waymo_converter import \ - # create_ImageSets_img_ids - # create_ImageSets_img_ids(osp.join(out_dir, 'kitti_format'), splits) + from tools.dataset_converters.waymo_converter import \ + create_ImageSets_img_ids + create_ImageSets_img_ids(osp.join(out_dir, 'kitti_format'), splits) # Generate waymo infos - out_dir = osp.join(out_dir, 'kitti_format') - kitti.create_waymo_info_file( - out_dir, info_prefix, max_sweeps=max_sweeps, workers=workers) - info_train_path = osp.join(out_dir, f'{info_prefix}_infos_train.pkl') - info_val_path = osp.join(out_dir, f'{info_prefix}_infos_val.pkl') - info_trainval_path = osp.join(out_dir, f'{info_prefix}_infos_trainval.pkl') + # out_dir = osp.join(out_dir, 'kitti_format') + # kitti.create_waymo_info_file( + # out_dir, info_prefix, max_sweeps=max_sweeps, workers=workers) + # info_train_path = osp.join(out_dir, f'{info_prefix}_infos_train.pkl') + # info_val_path = osp.join(out_dir, f'{info_prefix}_infos_val.pkl') + # info_trainval_path = osp.join(out_d, f'{info_prefix}_infos_trainval.pkl') # info_test_path = osp.join(out_dir, f'{info_prefix}_infos_test.pkl') - update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_train_path) - update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_val_path) - update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_trainval_path) + # update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_train_path) + # update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_val_path) + # update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_trainval_path) # update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_test_path) - # GTDatabaseCreater( - # 'WaymoDataset', - # out_dir, - # info_prefix, - # f'{info_prefix}_infos_train.pkl', - # relative_path=False, - # with_mask=False, - # num_worker=workers).create() + GTDatabaseCreater( + 
'WaymoDataset', + out_dir, + info_prefix, + f'{info_prefix}_infos_train.pkl', + relative_path=False, + with_mask=False, + num_worker=workers).create() def semantickitti_data_prep(info_prefix, out_dir): diff --git a/tools/dataset_converters/waymo_converter.py b/tools/dataset_converters/waymo_converter.py index c20db23685..e269274540 100644 --- a/tools/dataset_converters/waymo_converter.py +++ b/tools/dataset_converters/waymo_converter.py @@ -9,23 +9,30 @@ raise ImportError('Please run "pip install waymo-open-dataset-tf-2-6-0" ' '>1.4.5 to install the official devkit first.') +import copy import os +import os.path as osp from glob import glob from os.path import exists, join import mmengine import numpy as np import tensorflow as tf +from nuscenes.utils.geometry_utils import view_points +from PIL import Image from waymo_open_dataset.utils import range_image_utils, transform_utils from waymo_open_dataset.utils.frame_utils import \ parse_range_image_and_camera_projection +from mmdet3d.datasets.convert_utils import post_process_coords +from mmdet3d.structures import Box3DMode, LiDARInstance3DBoxes, points_cam2img -class Waymo2KITTI(object): - """Waymo to KITTI converter. - This class serves as the converter to change the waymo raw data to KITTI - format. +class Waymo2KITTI(object): + """Waymo to KITTI converter.There are 2 steps as follows: Step 1. Extract + camera images and lidar point clouds from waymo raw data in '*.tfreord' and + save as kitti format. Step 2. Generate waymo train/val/test infos and save + as pickle file. Args: load_dir (str): Directory to load waymo raw data. @@ -36,8 +43,10 @@ class Waymo2KITTI(object): Defaults to 64. test_mode (bool, optional): Whether in the test_mode. Defaults to False. - save_cam_sync_labels (bool, optional): Whether to save cam sync labels. - Defaults to True. + save_cam_sync_instances (bool, optional): Whether to save cam sync + instances. Defaults to True. + save_cam_instances (bool, optional): Whether to save cam instances. + Defaults to False. 
""" def __init__(self, @@ -46,18 +55,9 @@ def __init__(self, prefix, workers=64, test_mode=False, - save_cam_sync_labels=True): - self.filter_empty_3dboxes = True - self.filter_no_label_zone_points = True - - self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST'] - - # Only data collected in specific locations will be converted - # If set None, this filter is disabled - # Available options: location_sf (main dataset) - self.selected_waymo_locations = None - self.save_track_id = False - + save_cam_sync_instances=True, + save_cam_instances=False, + info_prefix='waymo'): # turn on eager execution for older tensorflow versions if int(tf.__version__.split('.')[0]) < 2: tf.enable_eager_execution() @@ -74,55 +74,82 @@ def __init__(self, self.type_list = [ 'UNKNOWN', 'VEHICLE', 'PEDESTRIAN', 'SIGN', 'CYCLIST' ] - self.waymo_to_kitti_class_map = { - 'UNKNOWN': 'DontCare', - 'PEDESTRIAN': 'Pedestrian', - 'VEHICLE': 'Car', - 'CYCLIST': 'Cyclist', - 'SIGN': 'Sign' # not in kitti - } + + # MMDetection3D unified camera keys & class names + self.camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_LEFT', + 'CAM_FRONT_RIGHT', + 'CAM_SIDE_LEFT', + 'CAM_SIDE_RIGHT', + ] + self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST'] self.load_dir = load_dir self.save_dir = save_dir self.prefix = prefix self.workers = int(workers) self.test_mode = test_mode - self.save_cam_sync_labels = save_cam_sync_labels + self.save_cam_sync_instances = save_cam_sync_instances + self.save_cam_instances = save_cam_instances + + # TODO: Discuss filter_empty_3dboxes and filter_no_label_zone_points + self.filter_empty_3dboxes = True + self.filter_no_label_zone_points = True + self.save_track_id = False self.tfrecord_pathnames = sorted( glob(join(self.load_dir, '*.tfrecord'))) - # self.label_save_dir = f'{self.save_dir}/label_' - self.label_all_save_dir = f'{self.save_dir}/label_all' self.image_save_dir = f'{self.save_dir}/image_' - self.calib_save_dir = f'{self.save_dir}/calib' self.point_cloud_save_dir = f'{self.save_dir}/velodyne' - self.pose_save_dir = f'{self.save_dir}/pose' - self.timestamp_save_dir = f'{self.save_dir}/timestamp' - if self.save_cam_sync_labels: - # self.cam_sync_label_save_dir = f'{self.save_dir}/cam_sync_label_' - self.cam_sync_label_all_save_dir = \ - f'{self.save_dir}/cam_sync_label_all' - self.create_folder() + # Create folder for saving KITTI format camera images and + # lidar point clouds. + if 'testing_3d_camera_only_detection' not in self.load_dir: + mmengine.mkdir_or_exist(self.point_cloud_save_dir) + for i in range(5): + mmengine.mkdir_or_exist(f'{self.image_save_dir}{str(i)}') def convert(self): """Convert action.""" print('Start converting ...') # mmengine.track_progress(self.convert_one, range(len(self))) - mmengine.track_parallel_progress(self.convert_one, range(len(self)), - self.workers) + data_infos = mmengine.track_parallel_progress(self.convert_one, + range(len(self)), + self.workers) + data_list = [] + for data_info in data_infos: + data_list.extend(data_info) + metainfo = dict() + metainfo['dataset'] = 'waymo' + metainfo['version'] = '1.4' + metainfo['info_version'] = '1.1' + waymo_infos = dict(data_infos=data_list, metainfo=metainfo) + filenames = osp.join( + osp.dirname(self.save_dir), + f'{self.prefix}_infos_f{osp.basename(self.save_dir)}.pkl') + mmengine.dump(waymo_infos, filenames) print('\nFinished ...') def convert_one(self, file_idx): - """Convert action for single file. + """Convert one '*.tfrecord' file to kitti format. 
Each file stores all + the frames (about 200 frames) in current scene. We treat each frame as + a sample, save their images and point clouds in kitti format, and then + create info for all frames. Args: file_idx (int): Index of the file to be converted. + + Returns: + file_infos (list): Waymo infos for all frames in current file. """ pathname = self.tfrecord_pathnames[file_idx] dataset = tf.data.TFRecordDataset(pathname, compression_type='') + # NOTE: file_infos is not shared between processes, only stores frame + # infos within the current file. + self.file_infos = [] for frame_idx, data in enumerate(dataset): frame = dataset_pb2.Frame() @@ -132,14 +159,15 @@ def convert_one(self, file_idx): not in self.selected_waymo_locations): continue - # self.save_image(frame, file_idx, frame_idx) - # self.save_calib(frame, file_idx, frame_idx) - # self.save_lidar(frame, file_idx, frame_idx) - # self.save_pose(frame, file_idx, frame_idx) - # self.save_timestamp(frame, file_idx, frame_idx) + # Step 1. + self.save_image(frame, file_idx, frame_idx) + self.save_lidar(frame, file_idx, frame_idx) + # Step 2. + # TODO save the depth image for waymo challenge solution. + self.create_waymo_info_file(frame, file_idx, frame_idx) if not self.test_mode: - # TODO save the depth image for waymo challenge solution. + self.save_label(frame, file_idx, frame_idx) if self.save_cam_sync_labels: self.save_label(frame, file_idx, frame_idx, cam_sync=True) @@ -441,33 +469,6 @@ def save_timestamp(self, frame, file_idx, frame_idx): 'w') as f: f.write(str(frame.timestamp_micros)) - def create_folder(self): - """Create folder for data preprocessing.""" - if not self.test_mode: - dir_list1 = [ - self.label_all_save_dir, - self.calib_save_dir, - self.pose_save_dir, - self.timestamp_save_dir, - ] - dir_list2 = [self.image_save_dir] - if self.save_cam_sync_labels: - dir_list1.append(self.cam_sync_label_all_save_dir) - # dir_list2.append(self.cam_sync_label_save_dir) - else: - dir_list1 = [ - self.calib_save_dir, self.pose_save_dir, - self.timestamp_save_dir - ] - dir_list2 = [self.image_save_dir] - if 'testing_3d_camera_only_detection' not in self.load_dir: - dir_list1.append(self.point_cloud_save_dir) - for d in dir_list1: - mmengine.mkdir_or_exist(d) - for d in dir_list2: - for i in range(5): - mmengine.mkdir_or_exist(f'{d}{str(i)}') - def convert_range_image_to_point_cloud(self, frame, range_images, @@ -609,6 +610,249 @@ def cart_to_homo(self, mat): raise ValueError(mat.shape) return ret + def create_waymo_info_file(self, frame, file_idx, frame_idx): + frame_infos = dict() + + # Gather frame infos + sample_idx = int( + f'{self.prefix}{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}') + frame_infos['sample_idx'] = sample_idx + frame_infos['timestamp'] = frame.timestamp_micros + frame_infos['ego2global'] = np.array(frame.pose.transform).reshape( + 4, 4).astype(np.float32).tolist() + frame_infos['context_name'] = frame.context_name + + # Gather camera infos + frame_infos['images'] = dict() + # waymo front camera to kitti reference camera + T_front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0], + [1.0, 0.0, 0.0]]) + camera_calibs = [] + Tr_velo_to_cams = [] + for camera in frame.context.camera_calibrations: + # extrinsic parameters + T_cam_to_vehicle = np.array(camera.extrinsic.transform).reshape( + 4, 4) + T_vehicle_to_cam = np.linalg.inv(T_cam_to_vehicle) + Tr_velo_to_cam = \ + self.cart_to_homo(T_front_cam_to_ref) @ T_vehicle_to_cam + Tr_velo_to_cams.append(Tr_velo_to_cam) + + # intrinsic parameters + camera_calib = 
np.zeros((3, 4)) + camera_calib[0, 0] = camera.intrinsic[0] + camera_calib[1, 1] = camera.intrinsic[1] + camera_calib[0, 2] = camera.intrinsic[2] + camera_calib[1, 2] = camera.intrinsic[3] + camera_calib[2, 2] = 1 + camera_calib = list(camera_calib.reshape(12)) + camera_calib = [f'{i:e}' for i in camera_calib] + camera_calibs.append(camera_calib) + + for cam_key, camera_calib, Tr_velo_to_cam, img in zip( + self.camera_types, camera_calibs, Tr_velo_to_cams, + frame.images): + cam_infos = dict() + cam_infos['img_path'] = str(sample_idx) + '.jpg' + width, height = Image.open(img).size + cam_infos['height'] = height + cam_infos['width'] = width + cam_infos['lidar2cam'] = Tr_velo_to_cam.astype(np.float32).tolist() + cam_infos['cam2img'] = camera_calib.astype(np.float32).tolist() + cam_infos['lidar2img'] = (camera_calib @ Tr_velo_to_cam).astype( + np.float32).tolist() + frame_infos['images'][cam_key] = cam_infos + + # Gather lidar infos + lidar_infos = dict() + lidar_infos['lidar_path'] = str(sample_idx) + '.bin' + lidar_infos['num_pts_feats'] = 6 + frame_infos['lidar_points'] = lidar_infos + + # Gather lidar sweeps and camera sweeps infos + # TODO: Add lidar2img in image sweeps infos when we need it. + # TODO: Consider merging lidar sweeps infos and image sweeps infos. + lidar_sweeps_infos, image_sweeps_infos = [], [] + while len(lidar_sweeps_infos) < self.max_sweeps: + prev_lidar_infos = dict() + prev_image_infos = dict() + if frame_idx - len(lidar_sweeps_infos) - 1 >= 0: + prev_frame_infos = self.file_infos[-len(lidar_sweeps_infos) - + 1] + prev_lidar_infos['timestamp'] = prev_frame_infos['timestamp'] + prev_lidar_infos['ego2global'] = prev_frame_infos['ego2global'] + lidar_path = prev_frame_infos['lidar_points']['lidar_path'] + prev_lidar_infos['lidar_path'] = lidar_path + lidar_sweeps_infos.append(prev_lidar_infos) + + prev_image_infos['timestamp'] = prev_frame_infos['timestamp'] + prev_image_infos['ego2global'] = prev_frame_infos['ego2global'] + prev_image_infos['images'] = dict() + for cam_key in self.camera_types: + prev_image_infos['images'][cam_key] = dict() + img_path = prev_frame_infos['images'][cam_key]['img_path'] + prev_image_infos['images'][cam_key]['img_path'] = img_path + image_sweeps_infos.append(prev_image_infos[cam_key]) + frame_infos['lidar_sweeps'] = lidar_sweeps_infos + frame_infos['images_sweeps'] = image_sweeps_infos + + if not self.test_mode: + # Gather instances infos which is used for lidar-based 3D detection + frame_infos['instances'] = self.gather_instance_info(frame) + # Gather cam_sync_instances infos which is used for image-based + # (multi-view) 3D detection. + if self.save_cam_sync_instances: + frame_infos['cam_sync_instances'] = self.gather_instance_info( + frame, cam_sync=True) + # Gather cam_instances infos which is used for image-based + # (monocular) 3D detection (optional). 
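# Roughly, the entry assembled by create_waymo_info_file ends up shaped as
# below (abbreviated; all values are placeholders):
#
#     frame_infos = {
#         'sample_idx': 1000023,            # prefix + file idx + frame idx
#         'timestamp': 1553640000000000,
#         'ego2global': [[...], ...],       # 4 x 4 pose
#         'context_name': '...',
#         'images': {'CAM_FRONT': {'img_path': '1000023.jpg', 'height': 1280,
#                                  'width': 1920, 'lidar2cam': [[...], ...],
#                                  'cam2img': [[...], ...],
#                                  'lidar2img': [[...], ...]}, ...},
#         'lidar_points': {'lidar_path': '1000023.bin', 'num_pts_feats': 6},
#         'lidar_sweeps': [...],            # only when previous frames exist
#         'instances': [...],               # train/val splits only
#     }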
+ if self.save_cam_instances: + frame_infos['cam_instances'] = self.gather_cam_instance_info( + copy.deepcopy(frame_infos['cam_sync_instances']), + frame_infos['images']) + + def gather_instance_info(self, frame, cam_sync=False): + id_to_bbox = dict() + id_to_name = dict() + for labels in frame.projected_lidar_labels: + name = labels.name + for label in labels.labels: + # TODO: need a workaround as bbox may not belong to front cam + bbox = [ + label.box.center_x - label.box.length / 2, + label.box.center_y - label.box.width / 2, + label.box.center_x + label.box.length / 2, + label.box.center_y + label.box.width / 2 + ] + id_to_bbox[label.id] = bbox + id_to_name[label.id] = name - 1 + + group_id = 0 + instance_infos = [] + for obj in frame.laser_labels: + instance_info = dict() + bounding_box = None + name = None + id = obj.id + for proj_cam in self.cam_list: + if id + proj_cam in id_to_bbox: + bounding_box = id_to_bbox.get(id + proj_cam) + name = id_to_name.get(id + proj_cam) + break + + # NOTE: the 2D labels do not have strict correspondence with + # the projected 2D lidar labels + # e.g.: the projected 2D labels can be in camera 2 + # while the most_visible_camera can have id 4 + if cam_sync: + if obj.most_visible_camera_name: + name = self.cam_list.index( + f'_{obj.most_visible_camera_name}') + box3d = obj.camera_synced_box + else: + continue + else: + box3d = obj.box + + if bounding_box is None or name is None: + name = 0 + bounding_box = [0.0, 0.0, 0.0, 0.0] + + my_type = self.type_list[obj.type] + + if my_type not in self.selected_waymo_classes: + continue + + if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1: + continue + + group_id += 1 + instance_info['group_id'] = group_id + instance_info['camera_id'] = name + instance_info['bbox'] = bounding_box + instance_info['bbox_label'] = obj.type - 1 + + height = box3d.height + width = box3d.width + length = box3d.length + + # NOTE: We save the bottom center of 3D bboxes. + x = box3d.center_x + y = box3d.center_y + z = box3d.center_z - height / 2 + + rotation_y = box3d.heading + + instance_info['bbox3d'] = np.array( + [x, y, z, length, width, height, + rotation_y]).astype(np.float32).tolist() + instance_info['bbox_label_3d'] = obj.type - 1 + instance_info['num_lidar_pts'] = obj.num_lidar_points_in_box + + if self.save_track_id: + instance_info['track_id'] = obj.id + instance_infos.append(instance_info) + return instance_infos + + def gather_cam_instance_info(self, cam_sync_instances: dict, images: dict): + + cam_instances = dict() + for cam_type in self.camera_types: + lidar2cam = images[cam_type]['lidar2cam'] + cam2img = images[cam_type]['cam2img'] + cam_instances[cam_type] = [] + for instance in cam_sync_instances: + cam_instance = dict() + gt_bboxes_3d = instance['bbox3d'] + # Convert lidar coordinates to camera coordinates + gt_bboxes_3d = LiDARInstance3DBoxes(gt_bboxes_3d).convert_to( + Box3DMode.CAM, lidar2cam, correct_yaw=True).numpy() + corners_3d = gt_bboxes_3d.corners + corners_3d = corners_3d[0].T # (1, 8, 3) -> (3, 8) + in_camera = np.argwhere(corners_3d[2, :] > 0).flatten() + corners_3d = corners_3d[:, in_camera] + # Project 3d box to 2d. + corner_coords = view_points(corners_3d, cam2img, + True).T[:, :2].tolist() + + # Keep only corners that fall within the image. + final_coords = post_process_coords( + corner_coords, + imsize=(images[cam_type]['weight'], + images[cam_type]['height'])) + + # Skip if the convex hull of the re-projected corners + # does not intersect the image canvas. 
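# The projection/clipping pair used above can be exercised in isolation; the
# corner coordinates and the intrinsic matrix below are made up for
# illustration:
#
#     import numpy as np
#     from nuscenes.utils.geometry_utils import view_points
#     from mmdet3d.datasets.convert_utils import post_process_coords
#
#     corners_3d = np.array([[-1., 1., 1., -1., -1., 1., 1., -1.],
#                            [-1., -1., 1., 1., -1., -1., 1., 1.],
#                            [10., 10., 10., 10., 12., 12., 12., 12.]])
#     cam2img = np.array([[2000., 0., 960.], [0., 2000., 640.], [0., 0., 1.]])
#     coords = view_points(corners_3d, cam2img, normalize=True).T[:, :2]
#     print(post_process_coords(coords.tolist(), imsize=(1920, 1280)))
#     # (min_x, min_y, max_x, max_y), or None when fully off-screen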
+ if final_coords is None: + continue + else: + min_x, min_y, max_x, max_y = final_coords + + cam_instance['bbox'] = [min_x, min_y, max_x, max_y] + cam_instance['bbox_label'] = instance['bbox_label'] + cam_instance['bbox3d'] = gt_bboxes_3d.astype( + np.float32).tolist() + cam_instance['bbox_label_3d'] = instance['bbox_label_3d'] + + center_3d = gt_bboxes_3d.gravity_center.numpy() + center_2d_with_depth = points_cam2img( + center_3d, lidar2cam, with_depth=True) + center_2d_with_depth = center_2d_with_depth.squeeze().tolist() + + # normalized center2D + depth + # if samples with depth < 0 will be removed + if center_2d_with_depth[2] <= 0: + continue + cam_instance['center_2d'] = center_2d_with_depth[:2] + cam_instance['depth'] = center_2d_with_depth[2] + + # TODO: Discuss whether following info is necessary + cam_instance['bbox_3d_isvalid'] = True + cam_instance['velocity'] = -1 + cam_instances[cam_type].append(cam_instance) + + return cam_instances + def create_ImageSets_img_ids(root_dir, splits): save_dir = join(root_dir, 'ImageSets/') From 257bcb3c346ece67126acf754cc7e3d5b8ccc68a Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Wed, 23 Aug 2023 15:58:22 +0800 Subject: [PATCH 09/25] fix waymo create --- tools/create_data.py | 51 +++++++------ tools/dataset_converters/waymo_converter.py | 85 +++++++++++---------- 2 files changed, 73 insertions(+), 63 deletions(-) diff --git a/tools/create_data.py b/tools/create_data.py index aa0288ae93..afb75ef484 100644 --- a/tools/create_data.py +++ b/tools/create_data.py @@ -171,13 +171,14 @@ def waymo_data_prep(root_path, version, out_dir, workers, - max_sweeps=10): + max_sweeps=10, + skip_gather_cam_instances_info=False): """Prepare waymo dataset. There are 3 steps as follows: Step 1. Extract camera images and lidar point clouds from waymo raw data in '*.tfreord' and save as kitti format. Step 2. Generate waymo train/val/test infos and save - as pickle file. Step 3. Generate waymo ground truth database (point clouds - within each 3D bounding box) for data augmentation in training. Steps 1 and - 2 will be done in Waymo2KITTI, and step 3 will be done in + as pickle file. Step 3. Generate waymo ground truth database (point + cloudswithin each 3D bounding box) for data augmentation in training. Steps + 1 and 2 will be done in Waymo2KITTI, and step 3 will be done in GTDatabaseCreater. Args: @@ -186,14 +187,22 @@ def waymo_data_prep(root_path, out_dir (str): Output directory of the generated info file. workers (int): Number of threads to be used. max_sweeps (int, optional): Number of input consecutive frames. - Default: 5. Here we store pose information of these frames - for later use. - """ # noqa: E501 + Default to 10. Here we store ego2global information of these + frames for later use. + skip_gather_cam_instances_info (bool, optional): Whether to skip + gather cam_instances infos in Step 2. Default to False. 
+ """ # noqa from tools.dataset_converters import waymo_converter as waymo - splits = [ - 'training', 'validation', 'testing', 'testing_3d_camera_only_detection' - ] + if version == 'v1.4': + splits = [ + 'training', 'validation', 'testing', + 'testing_3d_camera_only_detection' + ] + elif version == 'v1.4-mini': + splits = ['training', 'validation'] + else: + raise NotImplementedError(f'Unsupported Waymo version {version}!') for i, split in enumerate(splits): load_dir = osp.join(root_path, 'waymo_format', split) if split == 'validation': @@ -207,7 +216,10 @@ def waymo_data_prep(root_path, workers=workers, test_mode=(split in ['testing', 'testing_3d_camera_only_detection']), - info_prefix=info_prefix) + info_prefix=info_prefix, + max_sweeps=max_sweeps, + split=split, + save_cam_instances=not skip_gather_cam_instances_info) converter.convert() from tools.dataset_converters.waymo_converter import \ @@ -283,9 +295,9 @@ def semantickitti_data_prep(info_prefix, out_dir): action='store_true', help='Whether to only generate ground truth database.') parser.add_argument( - '--skip-extract-kitti-format', + '--skip-gather-cam_instances-info', action='store_true', - help='''Whether to skip extracting kitti format image and point cloud. + help='''Whether to skip gathering cam_instances info. Only used when dataset is waymo!''') args = parser.parse_args() @@ -346,22 +358,15 @@ def semantickitti_data_prep(info_prefix, out_dir): dataset_name='NuScenesDataset', out_dir=args.out_dir, max_sweeps=args.max_sweeps) - elif args.dataset == 'waymo' and args.version == 'v1.4': + elif args.dataset == 'waymo': waymo_data_prep( root_path=args.root_path, info_prefix=args.extra_tag, version=args.version, out_dir=args.out_dir, workers=args.workers, - max_sweeps=args.max_sweeps) - elif args.dataset == 'waymo' and args.version == 'v1.4-mini': - waymo_data_prep( - root_path=args.root_path, - info_prefix=args.extra_tag, - version=args.version, - out_dir=args.out_dir, - workers=args.workers, - max_sweeps=args.max_sweeps) + max_sweeps=args.max_sweeps, + skip_gather_cam_instances_info=args.skip_gather_cam_instances_info) elif args.dataset == 'lyft': train_version = f'{args.version}-train' lyft_data_prep( diff --git a/tools/dataset_converters/waymo_converter.py b/tools/dataset_converters/waymo_converter.py index e269274540..a8c63815de 100644 --- a/tools/dataset_converters/waymo_converter.py +++ b/tools/dataset_converters/waymo_converter.py @@ -13,6 +13,7 @@ import os import os.path as osp from glob import glob +from io import BytesIO from os.path import exists, join import mmengine @@ -47,7 +48,11 @@ class Waymo2KITTI(object): instances. Defaults to True. save_cam_instances (bool, optional): Whether to save cam instances. Defaults to False. - """ + info_prefix (str, optional): Prefix of info filename. + Defaults to 'waymo'. + max_sweeps (int, optional): Max length of sweeps. Defaults to 10. + split (str, optional): Split of the data. Defaults to 'training'. 
+ """ # noqa def __init__(self, load_dir, @@ -56,8 +61,10 @@ def __init__(self, workers=64, test_mode=False, save_cam_sync_instances=True, - save_cam_instances=False, - info_prefix='waymo'): + save_cam_instances=True, + info_prefix='waymo', + max_sweeps=10, + split='training'): # turn on eager execution for older tensorflow versions if int(tf.__version__.split('.')[0]) < 2: tf.enable_eager_execution() @@ -92,6 +99,9 @@ def __init__(self, self.test_mode = test_mode self.save_cam_sync_instances = save_cam_sync_instances self.save_cam_instances = save_cam_instances + self.info_prefix = info_prefix + self.max_sweeps = max_sweeps + self.split = split # TODO: Discuss filter_empty_3dboxes and filter_no_label_zone_points self.filter_empty_3dboxes = True @@ -128,7 +138,7 @@ def convert(self): waymo_infos = dict(data_infos=data_list, metainfo=metainfo) filenames = osp.join( osp.dirname(self.save_dir), - f'{self.prefix}_infos_f{osp.basename(self.save_dir)}.pkl') + f'{self.info_prefix}_infos_{self.split}.pkl') mmengine.dump(waymo_infos, filenames) print('\nFinished ...') @@ -149,15 +159,11 @@ def convert_one(self, file_idx): # NOTE: file_infos is not shared between processes, only stores frame # infos within the current file. - self.file_infos = [] + file_infos = [] for frame_idx, data in enumerate(dataset): frame = dataset_pb2.Frame() frame.ParseFromString(bytearray(data.numpy())) - if (self.selected_waymo_locations is not None - and frame.context.stats.location - not in self.selected_waymo_locations): - continue # Step 1. self.save_image(frame, file_idx, frame_idx) @@ -165,12 +171,8 @@ def convert_one(self, file_idx): # Step 2. # TODO save the depth image for waymo challenge solution. - self.create_waymo_info_file(frame, file_idx, frame_idx) - if not self.test_mode: - - self.save_label(frame, file_idx, frame_idx) - if self.save_cam_sync_labels: - self.save_label(frame, file_idx, frame_idx, cam_sync=True) + self.create_waymo_info_file(frame, file_idx, frame_idx, file_infos) + return file_infos def __len__(self): """Length of the filename list.""" @@ -610,17 +612,17 @@ def cart_to_homo(self, mat): raise ValueError(mat.shape) return ret - def create_waymo_info_file(self, frame, file_idx, frame_idx): + def create_waymo_info_file(self, frame, file_idx, frame_idx, file_infos): frame_infos = dict() # Gather frame infos - sample_idx = int( - f'{self.prefix}{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}') - frame_infos['sample_idx'] = sample_idx + sample_idx = \ + f'{self.prefix}{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}' + frame_infos['sample_idx'] = int(sample_idx) frame_infos['timestamp'] = frame.timestamp_micros frame_infos['ego2global'] = np.array(frame.pose.transform).reshape( 4, 4).astype(np.float32).tolist() - frame_infos['context_name'] = frame.context_name + frame_infos['context_name'] = frame.context.name # Gather camera infos frame_infos['images'] = dict() @@ -645,8 +647,6 @@ def create_waymo_info_file(self, frame, file_idx, frame_idx): camera_calib[0, 2] = camera.intrinsic[2] camera_calib[1, 2] = camera.intrinsic[3] camera_calib[2, 2] = 1 - camera_calib = list(camera_calib.reshape(12)) - camera_calib = [f'{i:e}' for i in camera_calib] camera_calibs.append(camera_calib) for cam_key, camera_calib, Tr_velo_to_cam, img in zip( @@ -654,7 +654,7 @@ def create_waymo_info_file(self, frame, file_idx, frame_idx): frame.images): cam_infos = dict() cam_infos['img_path'] = str(sample_idx) + '.jpg' - width, height = Image.open(img).size + width, height = Image.open(BytesIO(img.image)).size 
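# The BytesIO decode above reads the JPEG dimensions without touching disk;
# the same pattern with a synthetic image (PIL only, for illustration):
#
#     from io import BytesIO
#     from PIL import Image
#
#     buf = BytesIO()
#     Image.new('RGB', (1920, 1280)).save(buf, format='JPEG')
#     width, height = Image.open(BytesIO(buf.getvalue())).size  # 1920, 1280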
cam_infos['height'] = height cam_infos['width'] = width cam_infos['lidar2cam'] = Tr_velo_to_cam.astype(np.float32).tolist() @@ -673,12 +673,11 @@ def create_waymo_info_file(self, frame, file_idx, frame_idx): # TODO: Add lidar2img in image sweeps infos when we need it. # TODO: Consider merging lidar sweeps infos and image sweeps infos. lidar_sweeps_infos, image_sweeps_infos = [], [] - while len(lidar_sweeps_infos) < self.max_sweeps: + for prev_offset in range(-1, -self.max_sweeps - 1, -1): prev_lidar_infos = dict() prev_image_infos = dict() - if frame_idx - len(lidar_sweeps_infos) - 1 >= 0: - prev_frame_infos = self.file_infos[-len(lidar_sweeps_infos) - - 1] + if frame_idx + prev_offset >= 0: + prev_frame_infos = file_infos[prev_offset] prev_lidar_infos['timestamp'] = prev_frame_infos['timestamp'] prev_lidar_infos['ego2global'] = prev_frame_infos['ego2global'] lidar_path = prev_frame_infos['lidar_points']['lidar_path'] @@ -692,9 +691,11 @@ def create_waymo_info_file(self, frame, file_idx, frame_idx): prev_image_infos['images'][cam_key] = dict() img_path = prev_frame_infos['images'][cam_key]['img_path'] prev_image_infos['images'][cam_key]['img_path'] = img_path - image_sweeps_infos.append(prev_image_infos[cam_key]) - frame_infos['lidar_sweeps'] = lidar_sweeps_infos - frame_infos['images_sweeps'] = image_sweeps_infos + image_sweeps_infos.append(prev_image_infos) + if lidar_sweeps_infos: + frame_infos['lidar_sweeps'] = lidar_sweeps_infos + if image_sweeps_infos: + frame_infos['images_sweeps'] = image_sweeps_infos if not self.test_mode: # Gather instances infos which is used for lidar-based 3D detection @@ -710,6 +711,7 @@ def create_waymo_info_file(self, frame, file_idx, frame_idx): frame_infos['cam_instances'] = self.gather_cam_instance_info( copy.deepcopy(frame_infos['cam_sync_instances']), frame_infos['images']) + file_infos.append(frame_infos) def gather_instance_info(self, frame, cam_sync=False): id_to_bbox = dict() @@ -762,6 +764,8 @@ def gather_instance_info(self, frame, cam_sync=False): if my_type not in self.selected_waymo_classes: continue + else: + label = self.selected_waymo_classes.index(my_type) if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1: continue @@ -770,7 +774,7 @@ def gather_instance_info(self, frame, cam_sync=False): instance_info['group_id'] = group_id instance_info['camera_id'] = name instance_info['bbox'] = bounding_box - instance_info['bbox_label'] = obj.type - 1 + instance_info['bbox_label'] = label height = box3d.height width = box3d.width @@ -786,7 +790,7 @@ def gather_instance_info(self, frame, cam_sync=False): instance_info['bbox3d'] = np.array( [x, y, z, length, width, height, rotation_y]).astype(np.float32).tolist() - instance_info['bbox_label_3d'] = obj.type - 1 + instance_info['bbox_label_3d'] = label instance_info['num_lidar_pts'] = obj.num_lidar_points_in_box if self.save_track_id: @@ -798,16 +802,17 @@ def gather_cam_instance_info(self, cam_sync_instances: dict, images: dict): cam_instances = dict() for cam_type in self.camera_types: - lidar2cam = images[cam_type]['lidar2cam'] - cam2img = images[cam_type]['cam2img'] + lidar2cam = np.array(images[cam_type]['lidar2cam']) + cam2img = np.array(images[cam_type]['cam2img']) cam_instances[cam_type] = [] for instance in cam_sync_instances: cam_instance = dict() - gt_bboxes_3d = instance['bbox3d'] + gt_bboxes_3d = np.array(instance['bbox3d']) # Convert lidar coordinates to camera coordinates - gt_bboxes_3d = LiDARInstance3DBoxes(gt_bboxes_3d).convert_to( - Box3DMode.CAM, lidar2cam, 
correct_yaw=True).numpy() - corners_3d = gt_bboxes_3d.corners + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d[None, :]).convert_to( + Box3DMode.CAM, lidar2cam, correct_yaw=True) + corners_3d = gt_bboxes_3d.corners.numpy() corners_3d = corners_3d[0].T # (1, 8, 3) -> (3, 8) in_camera = np.argwhere(corners_3d[2, :] > 0).flatten() corners_3d = corners_3d[:, in_camera] @@ -818,7 +823,7 @@ def gather_cam_instance_info(self, cam_sync_instances: dict, images: dict): # Keep only corners that fall within the image. final_coords = post_process_coords( corner_coords, - imsize=(images[cam_type]['weight'], + imsize=(images[cam_type]['width'], images[cam_type]['height'])) # Skip if the convex hull of the re-projected corners @@ -830,7 +835,7 @@ def gather_cam_instance_info(self, cam_sync_instances: dict, images: dict): cam_instance['bbox'] = [min_x, min_y, max_x, max_y] cam_instance['bbox_label'] = instance['bbox_label'] - cam_instance['bbox3d'] = gt_bboxes_3d.astype( + cam_instance['bbox3d'] = gt_bboxes_3d.numpy().astype( np.float32).tolist() cam_instance['bbox_label_3d'] = instance['bbox_label_3d'] From 53620aab56f8da7a8e8a159391f4702db6698c26 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Thu, 24 Aug 2023 14:53:03 +0800 Subject: [PATCH 10/25] remove some function not use --- tools/create_data.py | 31 +-- tools/dataset_converters/waymo_converter.py | 274 +++----------------- 2 files changed, 52 insertions(+), 253 deletions(-) diff --git a/tools/create_data.py b/tools/create_data.py index afb75ef484..409aa5cc0b 100644 --- a/tools/create_data.py +++ b/tools/create_data.py @@ -173,12 +173,14 @@ def waymo_data_prep(root_path, workers, max_sweeps=10, skip_gather_cam_instances_info=False): - """Prepare waymo dataset. There are 3 steps as follows: Step 1. Extract - camera images and lidar point clouds from waymo raw data in '*.tfreord' and - save as kitti format. Step 2. Generate waymo train/val/test infos and save - as pickle file. Step 3. Generate waymo ground truth database (point - cloudswithin each 3D bounding box) for data augmentation in training. Steps - 1 and 2 will be done in Waymo2KITTI, and step 3 will be done in + """Prepare waymo dataset. There are 3 steps as follows: + + Step 1. Extract camera images and lidar point clouds from waymo raw + data in '*.tfreord' and save as kitti format. + Step 2. Generate waymo train/val/test infos and save as pickle file. + Step 3. Generate waymo ground truth database (point clouds within + each 3D bounding box) for data augmentation in training. + Steps 1 and 2 will be done in Waymo2KITTI, and step 3 will be done in GTDatabaseCreater. Args: @@ -191,7 +193,7 @@ def waymo_data_prep(root_path, frames for later use. skip_gather_cam_instances_info (bool, optional): Whether to skip gather cam_instances infos in Step 2. Default to False. 
- """ # noqa + """ from tools.dataset_converters import waymo_converter as waymo if version == 'v1.4': @@ -221,22 +223,13 @@ def waymo_data_prep(root_path, split=split, save_cam_instances=not skip_gather_cam_instances_info) converter.convert() + if split == 'validation': + converter.merge_trainval_infos() from tools.dataset_converters.waymo_converter import \ create_ImageSets_img_ids create_ImageSets_img_ids(osp.join(out_dir, 'kitti_format'), splits) - # Generate waymo infos - # out_dir = osp.join(out_dir, 'kitti_format') - # kitti.create_waymo_info_file( - # out_dir, info_prefix, max_sweeps=max_sweeps, workers=workers) - # info_train_path = osp.join(out_dir, f'{info_prefix}_infos_train.pkl') - # info_val_path = osp.join(out_dir, f'{info_prefix}_infos_val.pkl') - # info_trainval_path = osp.join(out_d, f'{info_prefix}_infos_trainval.pkl') - # info_test_path = osp.join(out_dir, f'{info_prefix}_infos_test.pkl') - # update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_train_path) - # update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_val_path) - # update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_trainval_path) - # update_pkl_infos('waymo', out_dir=out_dir, pkl_path=info_test_path) + GTDatabaseCreater( 'WaymoDataset', out_dir, diff --git a/tools/dataset_converters/waymo_converter.py b/tools/dataset_converters/waymo_converter.py index a8c63815de..a91780d78e 100644 --- a/tools/dataset_converters/waymo_converter.py +++ b/tools/dataset_converters/waymo_converter.py @@ -30,10 +30,11 @@ class Waymo2KITTI(object): - """Waymo to KITTI converter.There are 2 steps as follows: Step 1. Extract - camera images and lidar point clouds from waymo raw data in '*.tfreord' and - save as kitti format. Step 2. Generate waymo train/val/test infos and save - as pickle file. + """Waymo to KITTI converter. There are 2 steps as follows: + + Step 1. Extract camera images and lidar point clouds from waymo raw data in + '*.tfreord' and save as kitti format. + Step 2. Generate waymo train/val/test infos and save as pickle file. Args: load_dir (str): Directory to load waymo raw data. @@ -52,7 +53,7 @@ class Waymo2KITTI(object): Defaults to 'waymo'. max_sweeps (int, optional): Max length of sweeps. Defaults to 10. split (str, optional): Split of the data. Defaults to 'training'. 
- """ # noqa + """ def __init__(self, load_dir, @@ -91,6 +92,12 @@ def __init__(self, 'CAM_SIDE_RIGHT', ] self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST'] + self.info_map = { + 'training': '_infos_train.pkl', + 'validation': '_infos_val.pkl', + 'testing': '_infos_test.pkl', + 'testing_3d_camera_only_detection': '_infos_test_cam_only.pkl' + } self.load_dir = load_dir self.save_dir = save_dir @@ -124,10 +131,12 @@ def __init__(self, def convert(self): """Convert action.""" print('Start converting ...') - # mmengine.track_progress(self.convert_one, range(len(self))) - data_infos = mmengine.track_parallel_progress(self.convert_one, - range(len(self)), - self.workers) + if self.workers == 0: + data_infos = mmengine.track_progress(self.convert_one, + range(len(self))) + else: + data_infos = mmengine.track_parallel_progress( + self.convert_one, range(len(self)), self.workers) data_list = [] for data_info in data_infos: data_list.extend(data_info) @@ -138,7 +147,7 @@ def convert(self): waymo_infos = dict(data_infos=data_list, metainfo=metainfo) filenames = osp.join( osp.dirname(self.save_dir), - f'{self.info_prefix}_infos_{self.split}.pkl') + f'{self.info_prefix + self.info_map[self.split]}') mmengine.dump(waymo_infos, filenames) print('\nFinished ...') @@ -193,62 +202,6 @@ def save_image(self, frame, file_idx, frame_idx): with open(img_path, 'wb') as fp: fp.write(img.image) - def save_calib(self, frame, file_idx, frame_idx): - """Parse and save the calibration data. - - Args: - frame (:obj:`Frame`): Open dataset frame proto. - file_idx (int): Current file index. - frame_idx (int): Current frame index. - """ - # waymo front camera to kitti reference camera - T_front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0], - [1.0, 0.0, 0.0]]) - camera_calibs = [] - R0_rect = [f'{i:e}' for i in np.eye(3).flatten()] - Tr_velo_to_cams = [] - calib_context = '' - - for camera in frame.context.camera_calibrations: - # extrinsic parameters - T_cam_to_vehicle = np.array(camera.extrinsic.transform).reshape( - 4, 4) - T_vehicle_to_cam = np.linalg.inv(T_cam_to_vehicle) - Tr_velo_to_cam = \ - self.cart_to_homo(T_front_cam_to_ref) @ T_vehicle_to_cam - if camera.name == 1: # FRONT = 1, see dataset.proto for details - self.T_velo_to_front_cam = Tr_velo_to_cam.copy() - Tr_velo_to_cam = Tr_velo_to_cam[:3, :].reshape((12, )) - Tr_velo_to_cams.append([f'{i:e}' for i in Tr_velo_to_cam]) - - # intrinsic parameters - camera_calib = np.zeros((3, 4)) - camera_calib[0, 0] = camera.intrinsic[0] - camera_calib[1, 1] = camera.intrinsic[1] - camera_calib[0, 2] = camera.intrinsic[2] - camera_calib[1, 2] = camera.intrinsic[3] - camera_calib[2, 2] = 1 - camera_calib = list(camera_calib.reshape(12)) - camera_calib = [f'{i:e}' for i in camera_calib] - camera_calibs.append(camera_calib) - - # all camera ids are saved as id-1 in the result because - # camera 0 is unknown in the proto - for i in range(5): - calib_context += 'P' + str(i) + ': ' + \ - ' '.join(camera_calibs[i]) + '\n' - calib_context += 'R0_rect' + ': ' + ' '.join(R0_rect) + '\n' - for i in range(5): - calib_context += 'Tr_velo_to_cam_' + str(i) + ': ' + \ - ' '.join(Tr_velo_to_cams[i]) + '\n' - - with open( - f'{self.calib_save_dir}/{self.prefix}' + - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', - 'w+') as fp_calib: - fp_calib.write(calib_context) - fp_calib.close() - def save_lidar(self, frame, file_idx, frame_idx): """Parse and save the lidar data in psd format. 
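The workers == 0 branch added to convert() above keeps everything in the main
process, so breakpoints and tracebacks behave normally. A minimal sketch of the
same dispatch pattern, with a toy task in place of convert_one:

import mmengine


def square(i):
    return i * i


workers = 0  # 0 = run sequentially in the main process, handy for debugging
if workers == 0:
    results = mmengine.track_progress(square, range(8))
else:
    results = mmengine.track_parallel_progress(square, range(8), workers)
print(results)  # [0, 1, 4, 9, 16, 25, 36, 49]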
@@ -306,171 +259,6 @@ def save_lidar(self, frame, file_idx, frame_idx): f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.bin' point_cloud.astype(np.float32).tofile(pc_path) - def save_label(self, frame, file_idx, frame_idx, cam_sync=False): - """Parse and save the label data in txt format. - The relation between waymo and kitti coordinates is noteworthy: - 1. x, y, z correspond to l, w, h (waymo) -> l, h, w (kitti) - 2. x-y-z: front-left-up (waymo) -> right-down-front(kitti) - 3. bbox origin at volumetric center (waymo) -> bottom center (kitti) - 4. rotation: +x around y-axis (kitti) -> +x around z-axis (waymo) - - Args: - frame (:obj:`Frame`): Open dataset frame proto. - file_idx (int): Current file index. - frame_idx (int): Current frame index. - cam_sync (bool, optional): Whether to save the cam sync labels. - Defaults to False. - """ - label_all_path = f'{self.label_all_save_dir}/{self.prefix}' + \ - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt' - if cam_sync: - label_all_path = label_all_path.replace('label_', - 'cam_sync_label_') - fp_label_all = open(label_all_path, 'w+') - id_to_bbox = dict() - id_to_name = dict() - for labels in frame.projected_lidar_labels: - name = labels.name - for label in labels.labels: - # TODO: need a workaround as bbox may not belong to front cam - bbox = [ - label.box.center_x - label.box.length / 2, - label.box.center_y - label.box.width / 2, - label.box.center_x + label.box.length / 2, - label.box.center_y + label.box.width / 2 - ] - id_to_bbox[label.id] = bbox - id_to_name[label.id] = name - 1 - - for obj in frame.laser_labels: - bounding_box = None - name = None - id = obj.id - for proj_cam in self.cam_list: - if id + proj_cam in id_to_bbox: - bounding_box = id_to_bbox.get(id + proj_cam) - name = str(id_to_name.get(id + proj_cam)) - break - - # NOTE: the 2D labels do not have strict correspondence with - # the projected 2D lidar labels - # e.g.: the projected 2D labels can be in camera 2 - # while the most_visible_camera can have id 4 - if cam_sync: - if obj.most_visible_camera_name: - name = str( - self.cam_list.index( - f'_{obj.most_visible_camera_name}')) - box3d = obj.camera_synced_box - else: - continue - else: - box3d = obj.box - - if bounding_box is None or name is None: - name = '0' - bounding_box = (0, 0, 0, 0) - - my_type = self.type_list[obj.type] - - if my_type not in self.selected_waymo_classes: - continue - - if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1: - continue - - my_type = self.waymo_to_kitti_class_map[my_type] - - height = box3d.height - width = box3d.width - length = box3d.length - - # TODO: Discuss if we should keep kitti_format - x = box3d.center_x - y = box3d.center_y - z = box3d.center_z - height / 2 - - # # project bounding box to the virtual reference frame - # pt_ref = self.T_velo_to_front_cam @ \ - # np.array([x, y, z, 1]).reshape((4, 1)) - # x, y, z, _ = pt_ref.flatten().tolist() - - rotation_y = box3d.heading - track_id = obj.id - - # not available - truncated = 0 - occluded = 0 - alpha = -10 - - line = my_type + \ - ' {} {} {} {} {} {} {} {} {} {} {} {} {} {}\n'.format( - truncated, occluded, alpha, - bounding_box[0], bounding_box[1], - bounding_box[2], bounding_box[3], - width, height, length, - x, y, z, rotation_y) - - if self.save_track_id: - line_all = line[:-1] + ' ' + name + ' ' + track_id + '\n' - else: - line_all = line[:-1] + ' ' + name + '\n' - - # Save num_lidar_points_in_box - line_all = line_all[:-1] + ' ' + str( - obj.num_lidar_points_in_box) + '\n' - - # label_path = 
f'{self.label_save_dir}{name}/{self.prefix}' + \ - # f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt' - # if cam_sync: - # label_path = label_path.replace('label_', 'cam_sync_label_') - # fp_label = open(label_path, 'a') - # fp_label.write(line) - # fp_label.close() - - fp_label_all.write(line_all) - - fp_label_all.close() - - def save_pose(self, frame, file_idx, frame_idx): - """Parse and save the pose data. - - Note that SDC's own pose is not included in the regular training - of KITTI dataset. KITTI raw dataset contains ego motion files - but are not often used. Pose is important for algorithms that - take advantage of the temporal information. - - Args: - frame (:obj:`Frame`): Open dataset frame proto. - file_idx (int): Current file index. - frame_idx (int): Current frame index. - """ - pose = np.array(frame.pose.transform).reshape(4, 4) - np.savetxt( - join(f'{self.pose_save_dir}/{self.prefix}' + - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'), - pose) - - def save_timestamp(self, frame, file_idx, frame_idx): - """Save the timestamp data in a separate file instead of the - pointcloud. - - Note that SDC's own pose is not included in the regular training - of KITTI dataset. KITTI raw dataset contains ego motion files - but are not often used. Pose is important for algorithms that - take advantage of the temporal information. - - Args: - frame (:obj:`Frame`): Open dataset frame proto. - file_idx (int): Current file index. - frame_idx (int): Current frame index. - """ - with open( - join(f'{self.timestamp_save_dir}/{self.prefix}' + - f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'), - 'w') as f: - f.write(str(frame.timestamp_micros)) - def convert_range_image_to_point_cloud(self, frame, range_images, @@ -858,15 +646,33 @@ def gather_cam_instance_info(self, cam_sync_instances: dict, images: dict): return cam_instances + def merge_trainval_infos(self): + """Merge training and validation infos into a single file.""" + train_infos_path = osp.join( + osp.dirname(self.save_dir), f'{self.info_prefix}_infos_train.pkl') + val_infos_path = osp.join( + osp.dirname(self.save_dir), f'{self.info_prefix}_infos_val.pkl') + train_infos = mmengine.load(train_infos_path) + val_infos = mmengine.load(val_infos_path) + trainval_infos = dict( + metainfo=train_infos['metainfo'], + data_list=train_infos['data_list'] + val_infos['data_list']) + mmengine.dump( + trainval_infos, + osp.join( + osp.dirname(self.save_dir), + f'{self.info_prefix}_infos_trainval.pkl')) + def create_ImageSets_img_ids(root_dir, splits): + """Create txt files indicating what to collect in each split.""" save_dir = join(root_dir, 'ImageSets/') if not exists(save_dir): os.mkdir(save_dir) idx_all = [[] for i in splits] for i, split in enumerate(splits): - path = join(root_dir, splits[i], 'calib') + path = join(root_dir, splits[i], 'image_0') if not exists(path): RawNames = [] else: @@ -881,6 +687,6 @@ def create_ImageSets_img_ids(root_dir, splits): open(save_dir + 'train.txt', 'w').writelines(idx_all[0]) open(save_dir + 'val.txt', 'w').writelines(idx_all[1]) open(save_dir + 'trainval.txt', 'w').writelines(idx_all[0] + idx_all[1]) - # open(save_dir + 'test.txt', 'w').writelines(idx_all[2]) - # open(save_dir+'test_cam_only.txt','w').writelines(idx_all[3]) + open(save_dir + 'test.txt', 'w').writelines(idx_all[2]) + open(save_dir + 'test_cam_only.txt', 'w').writelines(idx_all[3]) print('created txt files indicating what to collect in ', splits) From 061d9e6dd1e5a9cc276e2ad8503e4315379ac0e9 Mon Sep 17 00:00:00 2001 
From: sunjiahao1999 <578431509@qq.com> Date: Thu, 24 Aug 2023 20:15:33 +0800 Subject: [PATCH 11/25] fix cam_instances after refactor --- tools/create_data.py | 4 +- .../dataset_converters/create_gt_database.py | 26 ++++++---- tools/dataset_converters/waymo_converter.py | 52 +++++++++++-------- 3 files changed, 49 insertions(+), 33 deletions(-) diff --git a/tools/create_data.py b/tools/create_data.py index 409aa5cc0b..d4358bb376 100644 --- a/tools/create_data.py +++ b/tools/create_data.py @@ -225,10 +225,10 @@ def waymo_data_prep(root_path, converter.convert() if split == 'validation': converter.merge_trainval_infos() - + out_dir = osp.join(out_dir, 'kitti_format') from tools.dataset_converters.waymo_converter import \ create_ImageSets_img_ids - create_ImageSets_img_ids(osp.join(out_dir, 'kitti_format'), splits) + create_ImageSets_img_ids(out_dir, splits) GTDatabaseCreater( 'WaymoDataset', diff --git a/tools/dataset_converters/create_gt_database.py b/tools/dataset_converters/create_gt_database.py index ae452eb543..bbf357e6cb 100644 --- a/tools/dataset_converters/create_gt_database.py +++ b/tools/dataset_converters/create_gt_database.py @@ -7,7 +7,7 @@ import numpy as np from mmcv.ops import roi_align from mmdet.evaluation import bbox_overlaps -from mmengine import track_iter_progress +from mmengine import print_log, track_iter_progress from pycocotools import mask as maskUtils from pycocotools.coco import COCO @@ -504,7 +504,9 @@ def create_single(self, input_dict): return single_db_infos def create(self): - print(f'Create GT Database of {self.dataset_class_name}') + print_log( + f'Create GT Database of {self.dataset_class_name}', + logger='current') dataset_cfg = dict( type=self.dataset_class_name, data_root=self.data_path, @@ -610,12 +612,18 @@ def loop_dataset(i): input_dict['box_mode_3d'] = self.dataset.box_mode_3d return input_dict - multi_db_infos = mmengine.track_parallel_progress( - self.create_single, - ((loop_dataset(i) - for i in range(len(self.dataset))), len(self.dataset)), - self.num_worker) - print('Make global unique group id') + if self.num_worker == 0: + multi_db_infos = mmengine.track_progress( + self.create_single, + ((loop_dataset(i) + for i in range(len(self.dataset))), len(self.dataset))) + else: + multi_db_infos = mmengine.track_parallel_progress( + self.create_single, + ((loop_dataset(i) + for i in range(len(self.dataset))), len(self.dataset)), + self.num_worker) + print_log('Make global unique group id', logger='current') group_counter_offset = 0 all_db_infos = dict() for single_db_infos in track_iter_progress(multi_db_infos): @@ -630,7 +638,7 @@ def loop_dataset(i): group_counter_offset += (group_id + 1) for k, v in all_db_infos.items(): - print(f'load {len(v)} {k} database infos') + print_log(f'load {len(v)} {k} database infos', logger='current') with open(self.db_info_save_path, 'wb') as f: pickle.dump(all_db_infos, f) diff --git a/tools/dataset_converters/waymo_converter.py b/tools/dataset_converters/waymo_converter.py index a91780d78e..fc6f5a9114 100644 --- a/tools/dataset_converters/waymo_converter.py +++ b/tools/dataset_converters/waymo_converter.py @@ -19,6 +19,7 @@ import mmengine import numpy as np import tensorflow as tf +from mmengine import print_log from nuscenes.utils.geometry_utils import view_points from PIL import Image from waymo_open_dataset.utils import range_image_utils, transform_utils @@ -130,7 +131,7 @@ def __init__(self, def convert(self): """Convert action.""" - print('Start converting ...') + print_log(f'Start converting {self.split} 
dataset', logger='current') if self.workers == 0: data_infos = mmengine.track_progress(self.convert_one, range(len(self))) @@ -144,12 +145,11 @@ def convert(self): metainfo['dataset'] = 'waymo' metainfo['version'] = '1.4' metainfo['info_version'] = '1.1' - waymo_infos = dict(data_infos=data_list, metainfo=metainfo) + waymo_infos = dict(data_list=data_list, metainfo=metainfo) filenames = osp.join( osp.dirname(self.save_dir), f'{self.info_prefix + self.info_map[self.split]}') mmengine.dump(waymo_infos, filenames) - print('\nFinished ...') def convert_one(self, file_idx): """Convert one '*.tfrecord' file to kitti format. Each file stores all @@ -437,12 +437,14 @@ def create_waymo_info_file(self, frame, file_idx, frame_idx, file_infos): camera_calib[2, 2] = 1 camera_calibs.append(camera_calib) - for cam_key, camera_calib, Tr_velo_to_cam, img in zip( - self.camera_types, camera_calibs, Tr_velo_to_cams, - frame.images): + for i, (cam_key, camera_calib, Tr_velo_to_cam) in enumerate( + zip(self.camera_types, camera_calibs, Tr_velo_to_cams)): cam_infos = dict() cam_infos['img_path'] = str(sample_idx) + '.jpg' - width, height = Image.open(BytesIO(img.image)).size + # NOTE: frames.images order is different + for img in frame.images: + if img.name == i + 1: + width, height = Image.open(BytesIO(img.image)).size cam_infos['height'] = height cam_infos['width'] = width cam_infos['lidar2cam'] = Tr_velo_to_cam.astype(np.float32).tolist() @@ -468,8 +470,9 @@ def create_waymo_info_file(self, frame, file_idx, frame_idx, file_infos): prev_frame_infos = file_infos[prev_offset] prev_lidar_infos['timestamp'] = prev_frame_infos['timestamp'] prev_lidar_infos['ego2global'] = prev_frame_infos['ego2global'] + prev_lidar_infos['lidar_points'] = dict() lidar_path = prev_frame_infos['lidar_points']['lidar_path'] - prev_lidar_infos['lidar_path'] = lidar_path + prev_lidar_infos['lidar_points']['lidar_path'] = lidar_path lidar_sweeps_infos.append(prev_lidar_infos) prev_image_infos['timestamp'] = prev_frame_infos['timestamp'] @@ -483,7 +486,7 @@ def create_waymo_info_file(self, frame, file_idx, frame_idx, file_infos): if lidar_sweeps_infos: frame_infos['lidar_sweeps'] = lidar_sweeps_infos if image_sweeps_infos: - frame_infos['images_sweeps'] = image_sweeps_infos + frame_infos['image_sweeps'] = image_sweeps_infos if not self.test_mode: # Gather instances infos which is used for lidar-based 3D detection @@ -495,9 +498,10 @@ def create_waymo_info_file(self, frame, file_idx, frame_idx, file_infos): frame, cam_sync=True) # Gather cam_instances infos which is used for image-based # (monocular) 3D detection (optional). + # TODO: Should we use cam_sync_instances to generate cam_instances? 
if self.save_cam_instances: frame_infos['cam_instances'] = self.gather_cam_instance_info( - copy.deepcopy(frame_infos['cam_sync_instances']), + copy.deepcopy(frame_infos['instances']), frame_infos['images']) file_infos.append(frame_infos) @@ -575,7 +579,7 @@ def gather_instance_info(self, frame, cam_sync=False): rotation_y = box3d.heading - instance_info['bbox3d'] = np.array( + instance_info['bbox_3d'] = np.array( [x, y, z, length, width, height, rotation_y]).astype(np.float32).tolist() instance_info['bbox_label_3d'] = label @@ -586,16 +590,16 @@ def gather_instance_info(self, frame, cam_sync=False): instance_infos.append(instance_info) return instance_infos - def gather_cam_instance_info(self, cam_sync_instances: dict, images: dict): + def gather_cam_instance_info(self, instances: dict, images: dict): cam_instances = dict() for cam_type in self.camera_types: lidar2cam = np.array(images[cam_type]['lidar2cam']) cam2img = np.array(images[cam_type]['cam2img']) cam_instances[cam_type] = [] - for instance in cam_sync_instances: + for instance in instances: cam_instance = dict() - gt_bboxes_3d = np.array(instance['bbox3d']) + gt_bboxes_3d = np.array(instance['bbox_3d']) # Convert lidar coordinates to camera coordinates gt_bboxes_3d = LiDARInstance3DBoxes( gt_bboxes_3d[None, :]).convert_to( @@ -609,10 +613,14 @@ def gather_cam_instance_info(self, cam_sync_instances: dict, images: dict): True).T[:, :2].tolist() # Keep only corners that fall within the image. + # TODO: imsize should be determined by the current image size + # CAM_FRONT: (1920, 1280) + # CAM_FRONT_LEFT: (1920, 1280) + # CAM_SIDE_LEFT: (1920, 886) final_coords = post_process_coords( corner_coords, - imsize=(images[cam_type]['width'], - images[cam_type]['height'])) + imsize=(images['CAM_FRONT']['width'], + images['CAM_FRONT']['height'])) # Skip if the convex hull of the re-projected corners # does not intersect the image canvas. 
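A note on the hunk above: `post_process_coords` clips the convex hull of the re-projected corners against the image canvas, and the TODO records that the canvas size is still hard-coded to the `CAM_FRONT` resolution (1920x1280) even though the side cameras are 1920x886. Below is a rough numpy sketch of the clipping idea, simplified to the axis-aligned extent of the corners in front of the camera rather than the exact convex-hull intersection; the helper name and the intrinsic values are made up.

import numpy as np


def project_and_clip(corners_cam, cam2img, imsize):
    """corners_cam: (8, 3) box corners in camera coords (z forward);
    cam2img: (3, 3) pinhole intrinsics; imsize: (width, height)."""
    # Keep only corners in front of the image plane; points behind the
    # camera have no valid projection.
    corners_cam = corners_cam[corners_cam[:, 2] > 1e-6]
    if corners_cam.shape[0] == 0:
        return None
    # Pinhole projection to pixel coordinates.
    uvw = corners_cam @ cam2img.T
    uv = uvw[:, :2] / uvw[:, 2:3]
    # Axis-aligned extent of the projected corners, clipped to the canvas.
    min_xy = np.clip(uv.min(axis=0), 0, imsize)
    max_xy = np.clip(uv.max(axis=0), 0, imsize)
    if (max_xy <= min_xy).any():
        return None  # the box misses this camera's canvas entirely
    return min_xy.tolist() + max_xy.tolist()  # [min_x, min_y, max_x, max_y]


# A made-up box roughly 10 m ahead of a camera with made-up intrinsics;
# note the side-camera canvas height of 886 used for clipping.
cam2img = np.array([[2055., 0., 960.], [0., 2055., 640.], [0., 0., 1.]])
corners = np.array([[x, y, z] for x in (-2.2, 2.2) for y in (-0.8, 0.8)
                    for z in (9.0, 13.5)])
print(project_and_clip(corners, cam2img, imsize=(1920, 886)))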
@@ -623,13 +631,13 @@ def gather_cam_instance_info(self, cam_sync_instances: dict, images: dict): cam_instance['bbox'] = [min_x, min_y, max_x, max_y] cam_instance['bbox_label'] = instance['bbox_label'] - cam_instance['bbox3d'] = gt_bboxes_3d.numpy().astype( + cam_instance['bbox_3d'] = gt_bboxes_3d.numpy().astype( np.float32).tolist() cam_instance['bbox_label_3d'] = instance['bbox_label_3d'] center_3d = gt_bboxes_3d.gravity_center.numpy() center_2d_with_depth = points_cam2img( - center_3d, lidar2cam, with_depth=True) + center_3d, cam2img, with_depth=True) center_2d_with_depth = center_2d_with_depth.squeeze().tolist() # normalized center2D + depth @@ -642,7 +650,7 @@ def gather_cam_instance_info(self, cam_sync_instances: dict, images: dict): # TODO: Discuss whether following info is necessary cam_instance['bbox_3d_isvalid'] = True cam_instance['velocity'] = -1 - cam_instances[cam_type].append(cam_instance) + cam_instances[cam_type].append(cam_instance) return cam_instances @@ -670,7 +678,7 @@ def create_ImageSets_img_ids(root_dir, splits): if not exists(save_dir): os.mkdir(save_dir) - idx_all = [[] for i in splits] + idx_all = [[] for _ in range(4)] for i, split in enumerate(splits): path = join(root_dir, splits[i], 'image_0') if not exists(path): @@ -679,8 +687,8 @@ def create_ImageSets_img_ids(root_dir, splits): RawNames = os.listdir(path) for name in RawNames: - if name.endswith('.txt'): - idx = name.replace('.txt', '\n') + if name.endswith('.jpg'): + idx = name.replace('.jpg', '\n') idx_all[int(idx[0])].append(idx) idx_all[i].sort() From 3c7d4e9f527fb66b874058babde67e4cfb2a1492 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Thu, 24 Aug 2023 20:41:23 +0800 Subject: [PATCH 12/25] remove unused .py --- .gitignore | 3 --- check_coor.py | 24 ------------------------ get_gt_mini.py | 11 ----------- mp.py | 43 ------------------------------------------- 4 files changed, 81 deletions(-) delete mode 100644 check_coor.py delete mode 100644 get_gt_mini.py delete mode 100644 mp.py diff --git a/.gitignore b/.gitignore index 36817b21a6..2fefc6a904 100644 --- a/.gitignore +++ b/.gitignore @@ -135,6 +135,3 @@ data/sunrgbd/OFFICIAL_SUNRGBD/ mmdet3d/evaluation/functional/waymo_utils/compute_detection_metrics_main mmdet3d/evaluation/functional/waymo_utils/compute_detection_let_metrics_main mmdet3d/evaluation/functional/waymo_utils/compute_segmentation_metrics_main - -check_waymo.py -remove_infos.py diff --git a/check_coor.py b/check_coor.py deleted file mode 100644 index ed1df35a33..0000000000 --- a/check_coor.py +++ /dev/null @@ -1,24 +0,0 @@ -import mmengine -import numpy as np - -from mmdet3d.structures import (Box3DMode, CameraInstance3DBoxes, - LiDARInstance3DBoxes) - -val_infos = mmengine.load('data/waymo_mini/kitti_format/waymo_infos_val.pkl') -old_val_infos = mmengine.load( - 'data/waymo_mini/kitti_format/old_pkl/waymo_infos_val.pkl') -instance = np.array( - val_infos['data_list'][0]['cam_sync_instances'][0]['bbox_3d'])[np.newaxis, - ...] -instance = LiDARInstance3DBoxes(instance) -lidar2cam = np.array( - val_infos['data_list'][0]['images']['CAM_FRONT']['lidar2cam']) -cam_instance = instance.convert_to(Box3DMode.CAM, lidar2cam) - -old_instance = np.array(old_val_infos['data_list'][0]['cam_sync_instances'][0] - ['bbox_3d'])[np.newaxis, ...] 
- -old_lidar2cam = np.array( - old_val_infos['data_list'][0]['images']['CAM_FRONT']['lidar2cam']) -old_instance = CameraInstance3DBoxes(old_instance) -pass diff --git a/get_gt_mini.py b/get_gt_mini.py deleted file mode 100644 index e300324ed4..0000000000 --- a/get_gt_mini.py +++ /dev/null @@ -1,11 +0,0 @@ -from waymo_open_dataset.protos.metrics_pb2 import Objects - -objects = Objects() -with open('./data/waymo_mini/waymo_format/gt.bin', 'rb') as f: - objects.ParseFromString(bytearray(f.read())) -new = Objects() -for obj in objects.objects: - if obj.context_name == '10203656353524179475_7625_000_7645_000': - new.objects.append(obj) -with open('./data/waymo_mini/waymo_format/gt_mini.bin', 'wb') as f: - f.write(new.SerializeToString()) diff --git a/mp.py b/mp.py deleted file mode 100644 index 3262dd5eac..0000000000 --- a/mp.py +++ /dev/null @@ -1,43 +0,0 @@ -import multiprocessing -import time - - -class A(object): - - def __init__(self): - self.a = [] - self.b = None - - def get_num_a(self, num): - time.sleep(3) - self.a = [0] - self.get_num_b(num) - print(self.a) - return self.a - - def get_num_b(self, num): - self.a.append(num) - - def sum(self): - print(self.a) - - def run(self): - # p1 = multiprocessing.Process(target=self.get_num_a) - # p2 = multiprocessing.Process(target=self.get_num_b) - # p1.start() - # p2.start() - # p1.join() - # p2.join() - p1 = multiprocessing.Pool(2) - list_a = p1.map(self.get_num_a, [1, 2]) - self.sum() - print(list_a) - - -if __name__ == '__main__': - - t1 = time.time() - a = A() - a.run() - t2 = time.time() - print('cost time :{}'.format(t2 - t1)) From 38e47b1d9637efb86245f80adc5e6b9b11d650ac Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Mon, 28 Aug 2023 15:14:07 +0800 Subject: [PATCH 13/25] add fast eval --- .../waymo_utils/prediction_to_waymo.py | 265 +----------------- mmdet3d/evaluation/metrics/waymo_metric.py | 73 +++-- ...ecfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py | 9 +- 3 files changed, 52 insertions(+), 295 deletions(-) diff --git a/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py b/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py index b9da8043d2..7b47d85075 100644 --- a/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py +++ b/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py @@ -4,7 +4,6 @@ """ try: - from waymo_open_dataset import dataset_pb2 as open_dataset from waymo_open_dataset import label_pb2 from waymo_open_dataset.protos import metrics_pb2 from waymo_open_dataset.protos.metrics_pb2 import Objects @@ -14,13 +13,10 @@ 'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" ' 'to install the official devkit first.') -from glob import glob -from os.path import join from typing import List, Optional import mmengine import numpy as np -import tensorflow as tf class Prediction2Waymo(object): @@ -45,39 +41,22 @@ class Prediction2Waymo(object): corresponding backend. Defaults to None. from_kitti_format (bool, optional): Whether the reuslts are kitti format. Defaults to False. - idx2metainfo (Optional[dict], optional): The mapping from sample_idx to - metainfo. The metainfo must contain the keys: 'idx2contextname' and - 'idx2timestamp'. Defaults to None. 
""" def __init__(self, results: List[dict], - waymo_tfrecords_dir: str, waymo_results_save_dir: str, waymo_results_final_path: str, - prefix: str, classes: dict, workers: int = 2, - backend_args: Optional[dict] = None, - from_kitti_format: bool = False, - idx2metainfo: Optional[dict] = None): + backend_args: Optional[dict] = None): self.results = results - self.waymo_tfrecords_dir = waymo_tfrecords_dir self.waymo_results_save_dir = waymo_results_save_dir self.waymo_results_final_path = waymo_results_final_path - self.prefix = prefix self.classes = classes self.workers = int(workers) self.backend_args = backend_args - self.from_kitti_format = from_kitti_format - if idx2metainfo is not None: - self.idx2metainfo = idx2metainfo - # If ``fast_eval``, the metainfo does not need to be read from - # original data online. It's preprocessed offline. - self.fast_eval = True - else: - self.fast_eval = False self.name2idx = {} @@ -88,192 +67,12 @@ def __init__(self, 'Cyclist': label_pb2.Label.TYPE_CYCLIST, } - if self.from_kitti_format: - self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0], - [-1.0, 0.0, 0.0, 0.0], - [0.0, -1.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 1.0]]) - # ``sample_idx`` of the sample in kitti-format is an array - for idx, result in enumerate(results): - if len(result['sample_idx']) > 0: - self.name2idx[str(result['sample_idx'][0])] = idx - else: - # ``sample_idx`` of the sample in the original prediction - # is an int value. - for idx, result in enumerate(results): - self.name2idx[str(result['sample_idx'])] = idx - - if not self.fast_eval: - # need to read original '.tfrecord' file - self.get_file_names() - # turn on eager execution for older tensorflow versions - if int(tf.__version__.split('.')[0]) < 2: - tf.enable_eager_execution() - self.create_folder() - def get_file_names(self): - """Get file names of waymo raw data.""" - if 'path_mapping' in self.backend_args: - for path in self.backend_args['path_mapping'].keys(): - if path in self.waymo_tfrecords_dir: - self.waymo_tfrecords_dir = \ - self.waymo_tfrecords_dir.replace( - path, self.backend_args['path_mapping'][path]) - from petrel_client.client import Client - client = Client() - contents = client.list(self.waymo_tfrecords_dir) - self.waymo_tfrecord_pathnames = list() - for content in sorted(list(contents)): - if content.endswith('tfrecord'): - self.waymo_tfrecord_pathnames.append( - join(self.waymo_tfrecords_dir, content)) - else: - self.waymo_tfrecord_pathnames = sorted( - glob(join(self.waymo_tfrecords_dir, '*.tfrecord'))) - print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.') - def create_folder(self): """Create folder for data conversion.""" mmengine.mkdir_or_exist(self.waymo_results_save_dir) - def parse_objects(self, kitti_result, T_k2w, context_name, - frame_timestamp_micros): - """Parse one prediction with several instances in kitti format and - convert them to `Object` proto. - - Args: - kitti_result (dict): Predictions in kitti format. - - - name (np.ndarray): Class labels of predictions. - - dimensions (np.ndarray): Height, width, length of boxes. - - location (np.ndarray): Bottom center of boxes (x, y, z). - - rotation_y (np.ndarray): Orientation of boxes. - - score (np.ndarray): Scores of predictions. - T_k2w (np.ndarray): Transformation matrix from kitti to waymo. - context_name (str): Context name of the frame. - frame_timestamp_micros (int): Frame timestamp. - - Returns: - :obj:`Object`: Predictions in waymo dataset Object proto. 
- """ - - def parse_one_object(instance_idx): - """Parse one instance in kitti format and convert them to `Object` - proto. - - Args: - instance_idx (int): Index of the instance to be converted. - - Returns: - :obj:`Object`: Predicted instance in waymo dataset - Object proto. - """ - cls = kitti_result['name'][instance_idx] - length = round(kitti_result['dimensions'][instance_idx, 0], 4) - height = round(kitti_result['dimensions'][instance_idx, 1], 4) - width = round(kitti_result['dimensions'][instance_idx, 2], 4) - x = round(kitti_result['location'][instance_idx, 0], 4) - y = round(kitti_result['location'][instance_idx, 1], 4) - z = round(kitti_result['location'][instance_idx, 2], 4) - rotation_y = round(kitti_result['rotation_y'][instance_idx], 4) - score = round(kitti_result['score'][instance_idx], 4) - - # y: downwards; move box origin from bottom center (kitti) to - # true center (waymo) - y -= height / 2 - # frame transformation: kitti -> waymo - x, y, z = self.transform(T_k2w, x, y, z) - - # different conventions - heading = -(rotation_y + np.pi / 2) - while heading < -np.pi: - heading += 2 * np.pi - while heading > np.pi: - heading -= 2 * np.pi - - box = label_pb2.Label.Box() - box.center_x = x - box.center_y = y - box.center_z = z - box.length = length - box.width = width - box.height = height - box.heading = heading - - o = metrics_pb2.Object() - o.object.box.CopyFrom(box) - o.object.type = self.k2w_cls_map[cls] - o.score = score - - o.context_name = context_name - o.frame_timestamp_micros = frame_timestamp_micros - - return o - - objects = metrics_pb2.Objects() - - for instance_idx in range(len(kitti_result['name'])): - o = parse_one_object(instance_idx) - objects.objects.append(o) - - return objects - - def convert_one(self, file_idx): - """Convert action for single file. - - Args: - file_idx (int): Index of the file to be converted. - """ - file_pathname = self.waymo_tfrecord_pathnames[file_idx] - if 's3://' in file_pathname and tf.__version__ >= '2.6.0': - try: - import tensorflow_io as tfio # noqa: F401 - except ImportError: - raise ImportError( - "Please run 'pip install tensorflow-io' to install tensorflow_io first." # noqa: E501 - ) - file_data = tf.data.TFRecordDataset(file_pathname, compression_type='') - - for frame_num, frame_data in enumerate(file_data): - frame = open_dataset.Frame() - frame.ParseFromString(bytearray(frame_data.numpy())) - - filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}' - - context_name = frame.context.name - frame_timestamp_micros = frame.timestamp_micros - - if filename in self.name2idx: - if self.from_kitti_format: - for camera in frame.context.camera_calibrations: - # FRONT = 1, see dataset.proto for details - if camera.name == 1: - T_front_cam_to_vehicle = np.array( - camera.extrinsic.transform).reshape(4, 4) - - T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam - - kitti_result = \ - self.results[self.name2idx[filename]] - objects = self.parse_objects(kitti_result, T_k2w, - context_name, - frame_timestamp_micros) - else: - index = self.name2idx[filename] - objects = self.parse_objects_from_origin( - self.results[index], context_name, - frame_timestamp_micros) - - else: - print(filename, 'not found.') - objects = metrics_pb2.Objects() - - with open( - join(self.waymo_results_save_dir, f'{filename}.bin'), - 'wb') as f: - f.write(objects.SerializeToString()) - def convert_one_fast(self, res_index: int): """Convert action for single file. It read the metainfo from the preprocessed file offline and will be faster. 
@@ -285,16 +84,13 @@ def convert_one_fast(self, res_index: int): if len(self.results[res_index]['pred_instances_3d']) > 0: objects = self.parse_objects_from_origin( self.results[res_index], - self.idx2metainfo[str(sample_idx)]['contextname'], - self.idx2metainfo[str(sample_idx)]['timestamp']) + self.results[res_index]['context_name'], + self.results[res_index]['timestamp']) else: print(sample_idx, 'not found.') objects = metrics_pb2.Objects() - with open( - join(self.waymo_results_save_dir, f'{sample_idx}.bin'), - 'wb') as f: - f.write(objects.SerializeToString()) + return objects def parse_objects_from_origin(self, result: dict, contextname: str, timestamp: str) -> Objects: @@ -350,8 +146,6 @@ def parse_one_object(index): def convert(self): """Convert action.""" print('Start converting ...') - convert_func = self.convert_one_fast if self.fast_eval else \ - self.convert_one # from torch.multiprocessing import set_sharing_strategy # # Force using "file_system" sharing strategy for stability @@ -363,57 +157,22 @@ def convert(self): # TODO: Support multiprocessing. Now, multiprocessing evaluation will # cause shared memory error in torch-1.10 and torch-1.11. Details can # be seen in https://github.com/pytorch/pytorch/issues/67864. + objects_list = [] prog_bar = mmengine.ProgressBar(len(self)) for i in range(len(self)): - convert_func(i) + objects = self.convert_one_fast(i) + objects_list.append(objects) prog_bar.update() + combined = metrics_pb2.Objects() + for objects in objects_list: + for o in objects.objects: + combined.objects.append(o) print('\nFinished ...') - # combine all files into one .bin - pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin'))) - combined = self.combine(pathnames) - with open(self.waymo_results_final_path, 'wb') as f: f.write(combined.SerializeToString()) def __len__(self): """Length of the filename list.""" - return len(self.results) if self.fast_eval else len( - self.waymo_tfrecord_pathnames) - - def transform(self, T, x, y, z): - """Transform the coordinates with matrix T. - - Args: - T (np.ndarray): Transformation matrix. - x(float): Coordinate in x axis. - y(float): Coordinate in y axis. - z(float): Coordinate in z axis. - - Returns: - list: Coordinates after transformation. - """ - pt_bef = np.array([x, y, z, 1.0]).reshape(4, 1) - pt_aft = np.matmul(T, pt_bef) - return pt_aft[:3].flatten().tolist() - - def combine(self, pathnames): - """Combine predictions in waymo format for each sample together. - - Args: - pathnames (str): Paths to save predictions. - - Returns: - :obj:`Objects`: Combined predictions in Objects proto. - """ - combined = metrics_pb2.Objects() - - for pathname in pathnames: - objects = metrics_pb2.Objects() - with open(pathname, 'rb') as f: - objects.ParseFromString(f.read()) - for o in objects.objects: - combined.objects.append(o) - - return combined + return len(self.results) diff --git a/mmdet3d/evaluation/metrics/waymo_metric.py b/mmdet3d/evaluation/metrics/waymo_metric.py index 0dd69a5c24..b923b6566a 100644 --- a/mmdet3d/evaluation/metrics/waymo_metric.py +++ b/mmdet3d/evaluation/metrics/waymo_metric.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import tempfile from os import path as osp -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union import mmengine import numpy as np @@ -79,7 +79,6 @@ class WaymoMetric(KittiMetric): def __init__(self, ann_file: str, waymo_bin_file: str, - data_root: str, split: str = 'training', metric: Union[str, List[str]] = 'mAP', pcd_limit_range: List[float] = [-85, -85, -5, 85, 85, 5], @@ -92,20 +91,13 @@ def __init__(self, default_cam_key: str = 'CAM_FRONT', use_pred_sample_idx: bool = False, collect_device: str = 'cpu', - backend_args: Optional[dict] = None, - idx2metainfo: Optional[str] = None) -> None: + backend_args: Optional[dict] = None) -> None: self.waymo_bin_file = waymo_bin_file - self.data_root = data_root self.split = split self.load_type = load_type self.use_pred_sample_idx = use_pred_sample_idx self.convert_kitti_format = convert_kitti_format - if idx2metainfo is not None: - self.idx2metainfo = mmengine.load(idx2metainfo) - else: - self.idx2metainfo = None - super(WaymoMetric, self).__init__( ann_file=ann_file, metric=metric, @@ -124,6 +116,32 @@ def __init__(self, self.default_prefix = 'Waymo metric' + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + + for data_sample in data_samples: + result = dict() + pred_3d = data_sample['pred_instances_3d'] + pred_2d = data_sample['pred_instances'] + for attr_name in pred_3d: + pred_3d[attr_name] = pred_3d[attr_name].to('cpu') + result['pred_instances_3d'] = pred_3d + for attr_name in pred_2d: + pred_2d[attr_name] = pred_2d[attr_name].to('cpu') + result['pred_instances'] = pred_2d + result['sample_idx'] = data_sample['sample_idx'] + result['context_name'] = data_sample['context_name'] + result['timestamp'] = data_sample['timestamp'] + self.results.append(result) + def compute_metrics(self, results: List[dict]) -> Dict[str, float]: """Compute the metrics from processed results. 
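The `process`/`compute_metrics` split added in the hunk above follows mmengine's `BaseMetric` contract: `process` appends small CPU-side records to `self.results` for each batch, and `compute_metrics` receives the list gathered from all ranks. A toy sketch of that contract; the class name and dictionary keys are illustrative, not the real `WaymoMetric`.

from typing import Sequence

from mmengine.evaluator import BaseMetric


class ToyDetMetric(BaseMetric):
    """Counts predictions per sample; purely illustrative."""

    default_prefix = 'toy'

    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
        for data_sample in data_samples:
            # Keep only small CPU tensors/ndarrays here: `self.results` is
            # pickled and gathered across ranks before `compute_metrics`.
            self.results.append({
                'sample_idx': data_sample['sample_idx'],
                'num_preds':
                len(data_sample['pred_instances_3d']['scores_3d']),
            })

    def compute_metrics(self, results: list) -> dict:
        # `results` is the concatenation of every rank's `self.results`.
        total = sum(r['num_preds'] for r in results)
        return {'mean_preds_per_sample': total / max(len(results), 1)}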
@@ -137,13 +155,13 @@ def compute_metrics(self, results: List[dict]) -> Dict[str, float]: logger: MMLogger = MMLogger.get_current_instance() self.classes = self.dataset_meta['classes'] - # load annotations - self.data_infos = load(self.ann_file)['data_list'] - assert len(results) == len(self.data_infos), \ - 'invalid list length of network outputs' # different from kitti, waymo do not need to convert the ann file # handle the mv_image_based load_mode if self.load_type == 'mv_image_based': + # load annotations + self.data_infos = load(self.ann_file)['data_list'] + assert len(results) == len(self.data_infos), \ + 'invalid list length of network outputs' new_data_infos = [] for info in self.data_infos: height = info['images'][self.default_cam_key]['height'] @@ -179,7 +197,7 @@ def compute_metrics(self, results: List[dict]) -> Dict[str, float]: eval_tmp_dir = None pklfile_prefix = self.pklfile_prefix - result_dict, tmp_dir = self.format_results( + self.format_results( results, pklfile_prefix=pklfile_prefix, submission_prefix=self.submission_prefix, @@ -199,8 +217,6 @@ def compute_metrics(self, results: List[dict]) -> Dict[str, float]: if eval_tmp_dir is not None: eval_tmp_dir.cleanup() - if tmp_dir is not None: - tmp_dir.cleanup() return metric_dict def waymo_evaluate(self, @@ -359,40 +375,21 @@ def format_results( final_results = results_kitti_format['pred_instances_3d'] else: final_results = results - for i, res in enumerate(final_results): - # Actually, `sample_idx` here is the filename without suffix. - # It's for identitying the sample in formating. - res['sample_idx'] = self.data_infos[i]['sample_idx'] + for res in final_results: res['pred_instances_3d']['bboxes_3d'].limit_yaw( offset=0.5, period=np.pi * 2) - waymo_root = self.data_root - if self.split == 'training': - waymo_tfrecords_dir = osp.join(waymo_root, 'validation') - prefix = '1' - elif self.split == 'testing': - waymo_tfrecords_dir = osp.join(waymo_root, 'testing') - prefix = '2' - else: - raise ValueError('Not supported split value.') - from ..functional.waymo_utils.prediction_to_waymo import \ Prediction2Waymo converter = Prediction2Waymo( final_results, - waymo_tfrecords_dir, waymo_results_save_dir, waymo_results_final_path, - prefix, classes, - backend_args=self.backend_args, - from_kitti_format=self.convert_kitti_format, - idx2metainfo=self.idx2metainfo) + backend_args=self.backend_args) converter.convert() waymo_save_tmp_dir.cleanup() - return final_results, waymo_save_tmp_dir - def merge_multi_view_boxes(self, box_dict_per_frame: List[dict], cam0_info: dict) -> dict: """Merge bounding boxes predicted from multi-view images. 
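Taken together, the fast-eval path that this and the following patches converge on reduces each predicted LiDAR box to one `metrics_pb2.Object` tagged with `context_name` and `frame_timestamp_micros`, with everything serialized into a single submission .bin. A self-contained sketch using the same protos as the diffs above; the box values, context name, timestamp, and output path are dummies.

import numpy as np
from waymo_open_dataset import label_pb2
from waymo_open_dataset.protos import metrics_pb2


def to_waymo_object(bbox_3d, score, type_enum, context_name, timestamp):
    # bbox_3d: (x, y, z_bottom, length, width, height, heading), LiDAR frame.
    box = label_pb2.Label.Box()
    box.center_x = float(bbox_3d[0])
    box.center_y = float(bbox_3d[1])
    # mmdet3d boxes use the bottom center; Waymo wants the volumetric center.
    box.center_z = float(bbox_3d[2]) + float(bbox_3d[5]) / 2
    box.length, box.width, box.height = map(float, bbox_3d[3:6])
    box.heading = float(bbox_3d[6])

    obj = metrics_pb2.Object()
    obj.object.box.CopyFrom(box)
    obj.object.type = type_enum
    obj.score = float(score)
    obj.context_name = context_name
    obj.frame_timestamp_micros = timestamp
    return obj


combined = metrics_pb2.Objects()
combined.objects.append(
    to_waymo_object(
        np.array([10.0, -2.0, 0.0, 4.5, 1.9, 1.6, 0.3]), 0.9,
        label_pb2.Label.TYPE_VEHICLE, 'dummy_context_name', 1510593618979124))
with open('toy_results.bin', 'wb') as f:
    f.write(combined.SerializeToString())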
diff --git a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py
index 1027942d9b..ec734d8f48 100644
--- a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py
+++ b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py
@@ -188,7 +188,10 @@
         dict(
             type='PointsRangeFilter', point_cloud_range=point_cloud_range)
     ]),
-    dict(type='Pack3DDetInputs', keys=['points'])
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp']),
 ]
 
 dataset_type = 'WaymoDataset'
@@ -215,10 +218,8 @@
     type='WaymoMetric',
     ann_file='./data/waymo_mini/kitti_format/waymo_infos_val.pkl',
     waymo_bin_file='./data/waymo_mini/waymo_format/gt_mini.bin',
-    data_root='./data/waymo_mini/waymo_format',
     backend_args=backend_args,
-    convert_kitti_format=False,
-    idx2metainfo='./data/waymo_mini/waymo_format/idx2metainfo.pkl')
+    convert_kitti_format=False)
 # val_evaluator = dict(
 #     type='WaymoMetric',
 #     ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',

From 8c6e8c034ea433abee6ccb8a64cd964f4a8a5b10 Mon Sep 17 00:00:00 2001
From: sunjiahao1999 <578431509@qq.com>
Date: Tue, 29 Aug 2023 11:58:19 +0800
Subject: [PATCH 14/25] add parallel eval

---
 .../waymo_utils/prediction_to_waymo.py        | 73 ++++++-----------
 mmdet3d/evaluation/metrics/waymo_metric.py    | 26 +++---
 tools/create_data.py                          | 80 ++++++++++++-------
 tools/dataset_converters/waymo_converter.py   |  9 ++-
 4 files changed, 92 insertions(+), 96 deletions(-)

diff --git a/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py b/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py
index 7b47d85075..765b1bfa02 100644
--- a/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py
+++ b/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py
@@ -16,7 +16,7 @@
 from typing import List, Optional
 
 import mmengine
-import numpy as np
+from mmengine import print_log
 
 
 class Prediction2Waymo(object):
@@ -48,7 +48,7 @@ def __init__(self,
                  waymo_results_save_dir: str,
                  waymo_results_final_path: str,
                  classes: dict,
-                 workers: int = 2,
+                 workers: int = 4,
                  backend_args: Optional[dict] = None):
 
         self.results = results
@@ -104,71 +104,48 @@ def parse_objects_from_origin(self, result: dict, contextname: str,
 
         Returns:
             metrics_pb2.Objects: The parsed object.
""" - lidar_boxes = result['pred_instances_3d']['bboxes_3d'].tensor + lidar_boxes = result['pred_instances_3d']['bboxes_3d'] scores = result['pred_instances_3d']['scores_3d'] labels = result['pred_instances_3d']['labels_3d'] - def parse_one_object(index): - class_name = self.classes[labels[index].item()] - + objects = metrics_pb2.Objects() + for lidar_box, score, label in zip(lidar_boxes, scores, labels): + # Parse one object box = label_pb2.Label.Box() - height = lidar_boxes[index][5].item() - heading = lidar_boxes[index][6].item() - - while heading < -np.pi: - heading += 2 * np.pi - while heading > np.pi: - heading -= 2 * np.pi - - box.center_x = lidar_boxes[index][0].item() - box.center_y = lidar_boxes[index][1].item() - box.center_z = lidar_boxes[index][2].item() + height / 2 - box.length = lidar_boxes[index][3].item() - box.width = lidar_boxes[index][4].item() + height = lidar_box[5] + heading = lidar_box[6] + + box.center_x = lidar_box[0] + box.center_y = lidar_box[1] + box.center_z = lidar_box[2] + height / 2 + box.length = lidar_box[3] + box.width = lidar_box[4] box.height = height box.heading = heading - o = metrics_pb2.Object() - o.object.box.CopyFrom(box) - o.object.type = self.k2w_cls_map[class_name] - o.score = scores[index].item() - o.context_name = contextname - o.frame_timestamp_micros = timestamp - - return o + object = metrics_pb2.Object() + object.object.box.CopyFrom(box) - objects = metrics_pb2.Objects() - for i in range(len(lidar_boxes)): - objects.objects.append(parse_one_object(i)) + class_name = self.classes[label] + object.object.type = self.k2w_cls_map[class_name] + object.score = score + object.context_name = contextname + object.frame_timestamp_micros = timestamp + objects.objects.append(object) return objects def convert(self): """Convert action.""" - print('Start converting ...') - - # from torch.multiprocessing import set_sharing_strategy - # # Force using "file_system" sharing strategy for stability - # set_sharing_strategy("file_system") - - # mmengine.track_parallel_progress(convert_func, range(len(self)), - # self.workers) + print_log('Start converting ...', logger='current') - # TODO: Support multiprocessing. Now, multiprocessing evaluation will - # cause shared memory error in torch-1.10 and torch-1.11. Details can - # be seen in https://github.com/pytorch/pytorch/issues/67864. 
- objects_list = [] - prog_bar = mmengine.ProgressBar(len(self)) - for i in range(len(self)): - objects = self.convert_one_fast(i) - objects_list.append(objects) - prog_bar.update() + objects_list = mmengine.track_parallel_progress( + self.convert_one_fast, range(len(self)), self.workers) combined = metrics_pb2.Objects() for objects in objects_list: for o in objects.objects: combined.objects.append(o) - print('\nFinished ...') with open(self.waymo_results_final_path, 'wb') as f: f.write(combined.SerializeToString()) diff --git a/mmdet3d/evaluation/metrics/waymo_metric.py b/mmdet3d/evaluation/metrics/waymo_metric.py index b923b6566a..fb8ac495e6 100644 --- a/mmdet3d/evaluation/metrics/waymo_metric.py +++ b/mmdet3d/evaluation/metrics/waymo_metric.py @@ -129,14 +129,15 @@ def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: for data_sample in data_samples: result = dict() - pred_3d = data_sample['pred_instances_3d'] - pred_2d = data_sample['pred_instances'] - for attr_name in pred_3d: - pred_3d[attr_name] = pred_3d[attr_name].to('cpu') - result['pred_instances_3d'] = pred_3d - for attr_name in pred_2d: - pred_2d[attr_name] = pred_2d[attr_name].to('cpu') - result['pred_instances'] = pred_2d + result['pred_instances_3d'] = dict() + data_sample['pred_instances_3d']['bboxes_3d'].limit_yaw( + offset=0.5, period=np.pi * 2) + result['pred_instances_3d']['bboxes_3d'] = data_sample[ + 'pred_instances_3d']['bboxes_3d'].tensor.cpu().numpy() + result['pred_instances_3d']['scores_3d'] = data_sample[ + 'pred_instances_3d']['scores_3d'].cpu().numpy() + result['pred_instances_3d']['labels_3d'] = data_sample[ + 'pred_instances_3d']['labels_3d'].cpu().numpy() result['sample_idx'] = data_sample['sample_idx'] result['context_name'] = data_sample['context_name'] result['timestamp'] = data_sample['timestamp'] @@ -372,17 +373,12 @@ def format_results( if self.convert_kitti_format: results_kitti_format, tmp_dir = super().format_results( results, pklfile_prefix, submission_prefix, classes) - final_results = results_kitti_format['pred_instances_3d'] - else: - final_results = results - for res in final_results: - res['pred_instances_3d']['bboxes_3d'].limit_yaw( - offset=0.5, period=np.pi * 2) + results = results_kitti_format['pred_instances_3d'] from ..functional.waymo_utils.prediction_to_waymo import \ Prediction2Waymo converter = Prediction2Waymo( - final_results, + results, waymo_results_save_dir, waymo_results_final_path, classes, diff --git a/tools/create_data.py b/tools/create_data.py index d4358bb376..1eadaeb6af 100644 --- a/tools/create_data.py +++ b/tools/create_data.py @@ -172,7 +172,9 @@ def waymo_data_prep(root_path, out_dir, workers, max_sweeps=10, - skip_gather_cam_instances_info=False): + only_gt_database=False, + skip_image_and_lidar=False, + skip_cam_instances_infos=False): """Prepare waymo dataset. There are 3 steps as follows: Step 1. Extract camera images and lidar point clouds from waymo raw @@ -191,8 +193,12 @@ def waymo_data_prep(root_path, max_sweeps (int, optional): Number of input consecutive frames. Default to 10. Here we store ego2global information of these frames for later use. - skip_gather_cam_instances_info (bool, optional): Whether to skip - gather cam_instances infos in Step 2. Default to False. + only_gt_database (bool, optional): Whether to only generate ground + truth database. Default to False. + skip_image_and_lidar (bool, optional): Whether to skip saving + image and lidar. Default to False. 
+ skip_cam_instances_infos (bool, optional): Whether to skip + gathering cam_instances infos in Step 2. Default to False. """ from tools.dataset_converters import waymo_converter as waymo @@ -205,30 +211,34 @@ def waymo_data_prep(root_path, splits = ['training', 'validation'] else: raise NotImplementedError(f'Unsupported Waymo version {version}!') - for i, split in enumerate(splits): - load_dir = osp.join(root_path, 'waymo_format', split) - if split == 'validation': - save_dir = osp.join(out_dir, 'kitti_format', 'training') - else: - save_dir = osp.join(out_dir, 'kitti_format', split) - converter = waymo.Waymo2KITTI( - load_dir, - save_dir, - prefix=str(i), - workers=workers, - test_mode=(split - in ['testing', 'testing_3d_camera_only_detection']), - info_prefix=info_prefix, - max_sweeps=max_sweeps, - split=split, - save_cam_instances=not skip_gather_cam_instances_info) - converter.convert() - if split == 'validation': - converter.merge_trainval_infos() out_dir = osp.join(out_dir, 'kitti_format') - from tools.dataset_converters.waymo_converter import \ - create_ImageSets_img_ids - create_ImageSets_img_ids(out_dir, splits) + + if not only_gt_database: + for i, split in enumerate(splits): + load_dir = osp.join(root_path, 'waymo_format', split) + if split == 'validation': + save_dir = osp.join(out_dir, 'training') + else: + save_dir = osp.join(out_dir, split) + converter = waymo.Waymo2KITTI( + load_dir, + save_dir, + prefix=str(i), + workers=workers, + test_mode=(split + in ['testing', 'testing_3d_camera_only_detection']), + info_prefix=info_prefix, + max_sweeps=max_sweeps, + split=split, + save_image_and_lidar=not skip_image_and_lidar, + save_cam_instances=not skip_cam_instances_infos) + converter.convert() + if split == 'validation': + converter.merge_trainval_infos() + + from tools.dataset_converters.waymo_converter import \ + create_ImageSets_img_ids + create_ImageSets_img_ids(out_dir, splits) GTDatabaseCreater( 'WaymoDataset', @@ -286,12 +296,18 @@ def semantickitti_data_prep(info_prefix, out_dir): parser.add_argument( '--only-gt-database', action='store_true', - help='Whether to only generate ground truth database.') + help='''Whether to only generate ground truth database. + Only used when dataset is NuScenes or Waymo!''') +parser.add_argument( + '--skip-cam_instances-infos', + action='store_true', + help='''Whether to skip gathering cam_instances infos. + Only used when dataset is Waymo!''') parser.add_argument( - '--skip-gather-cam_instances-info', + '--skip-image-and-lidar', action='store_true', - help='''Whether to skip gathering cam_instances info. - Only used when dataset is waymo!''') + help='''Whether to skip saving image and lidar. + Only used when dataset is Waymo!''') args = parser.parse_args() if __name__ == '__main__': @@ -359,7 +375,9 @@ def semantickitti_data_prep(info_prefix, out_dir): out_dir=args.out_dir, workers=args.workers, max_sweeps=args.max_sweeps, - skip_gather_cam_instances_info=args.skip_gather_cam_instances_info) + only_gt_database=args.only_gt_database, + skip_image_and_lidar=args.skip_image_and_lidar, + skip_cam_instances_infos=args.skip_cam_instances_infos) elif args.dataset == 'lyft': train_version = f'{args.version}-train' lyft_data_prep( diff --git a/tools/dataset_converters/waymo_converter.py b/tools/dataset_converters/waymo_converter.py index fc6f5a9114..10cc15fc40 100644 --- a/tools/dataset_converters/waymo_converter.py +++ b/tools/dataset_converters/waymo_converter.py @@ -46,6 +46,8 @@ class Waymo2KITTI(object): Defaults to 64. 
test_mode (bool, optional): Whether in the test_mode. Defaults to False. + save_image_and_lidar (bool, optional): Whether to save image and lidar + data. Defaults to True. save_cam_sync_instances (bool, optional): Whether to save cam sync instances. Defaults to True. save_cam_instances (bool, optional): Whether to save cam instances. @@ -62,6 +64,7 @@ def __init__(self, prefix, workers=64, test_mode=False, + save_image_and_lidar=True, save_cam_sync_instances=True, save_cam_instances=True, info_prefix='waymo', @@ -105,6 +108,7 @@ def __init__(self, self.prefix = prefix self.workers = int(workers) self.test_mode = test_mode + self.save_image_and_lidar = save_image_and_lidar self.save_cam_sync_instances = save_cam_sync_instances self.save_cam_instances = save_cam_instances self.info_prefix = info_prefix @@ -175,8 +179,9 @@ def convert_one(self, file_idx): frame.ParseFromString(bytearray(data.numpy())) # Step 1. - self.save_image(frame, file_idx, frame_idx) - self.save_lidar(frame, file_idx, frame_idx) + if self.save_image_and_lidar: + self.save_image(frame, file_idx, frame_idx) + self.save_lidar(frame, file_idx, frame_idx) # Step 2. # TODO save the depth image for waymo challenge solution. From eadd6fc629fe26e8e4669aaa6b60c099fa1519ec Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Wed, 30 Aug 2023 13:26:19 +0800 Subject: [PATCH 15/25] fail use parallel --- .../functional/waymo_utils/prediction_to_waymo.py | 6 ++++-- ...ond_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py | 14 ++++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py b/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py index 765b1bfa02..7cb1b6efeb 100644 --- a/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py +++ b/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py @@ -139,8 +139,10 @@ def convert(self): """Convert action.""" print_log('Start converting ...', logger='current') - objects_list = mmengine.track_parallel_progress( - self.convert_one_fast, range(len(self)), self.workers) + # objects_list = mmengine.track_parallel_progress( + # self.convert_one_fast, range(len(self)), self.workers) + objects_list = mmengine.track_progress(self.convert_one_fast, + range(len(self))) combined = metrics_pb2.Objects() for objects in objects_list: diff --git a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py index ec734d8f48..87c9c3da16 100644 --- a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py +++ b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py @@ -5,8 +5,8 @@ voxel_size = [0.32, 0.32, 6] grid_size = [468, 468, 1] point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4.0] -data_root = 'data/waymo_mini/kitti_format/' -# data_root = 'data/waymo/kitti_format/' +# data_root = 'data/waymo_mini/kitti_format/' +data_root = 'data/waymo/kitti_format/' class_names = ['Car', 'Pedestrian', 'Cyclist'] metainfo = dict(classes=class_names) input_modality = dict(use_lidar=True, use_camera=False) @@ -214,10 +214,16 @@ backend_args=backend_args)) test_dataloader = val_dataloader +# val_evaluator = dict( +# type='WaymoMetric', +# ann_file='./data/waymo_mini/kitti_format/waymo_infos_val.pkl', +# waymo_bin_file='./data/waymo_mini/waymo_format/gt_mini.bin', +# backend_args=backend_args, +# 
convert_kitti_format=False) val_evaluator = dict( type='WaymoMetric', - ann_file='./data/waymo_mini/kitti_format/waymo_infos_val.pkl', - waymo_bin_file='./data/waymo_mini/waymo_format/gt_mini.bin', + ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', + waymo_bin_file='./data/waymo/waymo_format/gt.bin', backend_args=backend_args, convert_kitti_format=False) # val_evaluator = dict( From 170554052629bfdd417d9b7e498554f7270f02c6 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Wed, 30 Aug 2023 15:55:58 +0800 Subject: [PATCH 16/25] remove unused code --- mmdet3d/datasets/convert_utils.py | 7 --- mmdet3d/datasets/waymo_dataset.py | 8 --- tools/create_data.sh | 5 +- tools/dataset_converters/kitti_converter.py | 58 ++++++++++--------- .../dataset_converters/update_infos_to_v2.py | 40 +++++-------- tools/dataset_converters/waymo_converter.py | 10 ++-- 6 files changed, 57 insertions(+), 71 deletions(-) diff --git a/mmdet3d/datasets/convert_utils.py b/mmdet3d/datasets/convert_utils.py index 9d65369593..66dba29117 100644 --- a/mmdet3d/datasets/convert_utils.py +++ b/mmdet3d/datasets/convert_utils.py @@ -318,7 +318,6 @@ def get_kitti_style_2d_boxes(info: dict, def convert_annos(info: dict, cam_idx: int) -> dict: """Convert front-cam anns to i-th camera (KITTI-style info).""" rect = info['calib']['R0_rect'].astype(np.float32) - # lidar2cam0 = info['calib']['Tr_velo_to_cam'].astype(np.float32) if cam_idx == 0: lidar2cami = info['calib']['Tr_velo_to_cam'].astype(np.float32) else: @@ -331,14 +330,8 @@ def convert_annos(info: dict, cam_idx: int) -> dict: rots = annos['rotation_y'] gt_bboxes_3d = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1).astype(np.float32) - # convert gt_bboxes_3d to velodyne coordinates # BC-breaking: gt_bboxes_3d is already in lidar coordinates - # TODO: Discuss correcet_yaw if necessary - # gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d).convert_to( - # Box3DMode.LIDAR, np.linalg.inv(rect @ lidar2cam0), correct_yaw=True) # convert gt_bboxes_3d to cam coordinates - # gt_bboxes_3d = gt_bboxes_3d.convert_to( - # Box3DMode.CAM, rect @ lidar2cami, correct_yaw=True).numpy() gt_bboxes_3d = LiDARInstance3DBoxes(gt_bboxes_3d).convert_to( Box3DMode.CAM, rect @ lidar2cami, correct_yaw=True).numpy() diff --git a/mmdet3d/datasets/waymo_dataset.py b/mmdet3d/datasets/waymo_dataset.py index 243630d006..f5a35f5af0 100644 --- a/mmdet3d/datasets/waymo_dataset.py +++ b/mmdet3d/datasets/waymo_dataset.py @@ -163,14 +163,6 @@ def parse_ann_info(self, info: dict) -> dict: centers_2d = np.zeros((0, 2), dtype=np.float32) depths = np.zeros((0), dtype=np.float32) - # in waymo, lidar2cam = R0_rect @ Tr_velo_to_cam - # convert gt_bboxes_3d to velodyne coordinates with `lidar2cam` - # lidar2cam = np.array(info['images'][self.default_cam_key] - # ['lidar2cam']) - # gt_bboxes_3d = CameraInstance3DBoxes( - # ann_info['gt_bboxes_3d']).convert_to(self.box_mode_3d, - # np.linalg.inv(lidar2cam)) - # ann_info['gt_bboxes_3d'] = gt_bboxes_3d gt_bboxes_3d = LiDARInstance3DBoxes(ann_info['gt_bboxes_3d']) anns_results = dict( diff --git a/tools/create_data.sh b/tools/create_data.sh index 9e1fe9c0ce..0a1946585d 100755 --- a/tools/create_data.sh +++ b/tools/create_data.sh @@ -10,7 +10,7 @@ WORKERS=$4 GPUS=${GPUS:-1} GPUS_PER_NODE=${GPUS_PER_NODE:-1} SRUN_ARGS=${SRUN_ARGS:-""} -JOB_NAME=create_data +PY_ARGS=${@:5} srun -p ${PARTITION} \ --job-name=${JOB_NAME} \ @@ -23,4 +23,5 @@ srun -p ${PARTITION} \ --root-path ./data/${DATASET} \ --out-dir ./data/${DATASET} \ --workers ${WORKERS} \ - 
--extra-tag ${DATASET} + --extra-tag ${DATASET} \ + ${PY_ARGS} diff --git a/tools/dataset_converters/kitti_converter.py b/tools/dataset_converters/kitti_converter.py index 57453c5ddc..e904918f60 100644 --- a/tools/dataset_converters/kitti_converter.py +++ b/tools/dataset_converters/kitti_converter.py @@ -5,6 +5,7 @@ import mmcv import mmengine import numpy as np +from mmengine import logging, print_log from nuscenes.utils.geometry_utils import view_points from mmdet3d.structures import points_cam2img @@ -109,9 +110,8 @@ def calculate_single(self, info): return info def calculate(self, infos): - # ret_infos = mmengine.track_parallel_progress(self.calculate_single, - # infos, self.num_worker) - ret_infos = [self.calculate_single(info) for info in infos] + ret_infos = mmengine.track_parallel_progress(self.calculate_single, + infos, self.num_worker) for i, ret_info in enumerate(ret_infos): infos[i] = ret_info @@ -250,10 +250,16 @@ def create_waymo_info_file(data_path, max_sweeps (int, optional): Max sweeps before the detection frame to be used. Default: 5. """ + print_log( + 'Deprecation Warning: related functions has been migrated to ' + '`Waymo2KITTI.create_waymo_info_file`. It will be removed in ' + 'the future!', + logger='current', + level=logging.WARNING) imageset_folder = Path(data_path) / 'ImageSets' train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt')) val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt')) - # test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt')) + test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt')) print('Generate info. this may take several minutes.') if save_path is None: @@ -269,40 +275,40 @@ def create_waymo_info_file(data_path, relative_path=relative_path, max_sweeps=max_sweeps, num_worker=workers) - # waymo_infos_gatherer_test = WaymoInfoGatherer( - # data_path, - # training=False, - # label_info=False, - # velodyne=True, - # calib=True, - # pose=True, - # relative_path=relative_path, - # max_sweeps=max_sweeps, - # num_worker=workers) - # num_points_in_gt_calculater = _NumPointsInGTCalculater( - # data_path, - # relative_path, - # num_features=6, - # remove_outside=False, - # num_worker=workers) + waymo_infos_gatherer_test = WaymoInfoGatherer( + data_path, + training=False, + label_info=False, + velodyne=True, + calib=True, + pose=True, + relative_path=relative_path, + max_sweeps=max_sweeps, + num_worker=workers) + num_points_in_gt_calculater = _NumPointsInGTCalculater( + data_path, + relative_path, + num_features=6, + remove_outside=False, + num_worker=workers) waymo_infos_train = waymo_infos_gatherer_trainval.gather(train_img_ids) - # num_points_in_gt_calculater.calculate(waymo_infos_train) + num_points_in_gt_calculater.calculate(waymo_infos_train) filename = save_path / f'{pkl_prefix}_infos_train.pkl' print(f'Waymo info train file is saved to {filename}') mmengine.dump(waymo_infos_train, filename) waymo_infos_val = waymo_infos_gatherer_trainval.gather(val_img_ids) - # num_points_in_gt_calculater.calculate(waymo_infos_val) + num_points_in_gt_calculater.calculate(waymo_infos_val) filename = save_path / f'{pkl_prefix}_infos_val.pkl' print(f'Waymo info val file is saved to {filename}') mmengine.dump(waymo_infos_val, filename) filename = save_path / f'{pkl_prefix}_infos_trainval.pkl' print(f'Waymo info trainval file is saved to {filename}') mmengine.dump(waymo_infos_train + waymo_infos_val, filename) - # waymo_infos_test = waymo_infos_gatherer_test.gather(test_img_ids) - # filename = save_path / 
f'{pkl_prefix}_infos_test.pkl' - # print(f'Waymo info test file is saved to {filename}') - # mmengine.dump(waymo_infos_test, filename) + waymo_infos_test = waymo_infos_gatherer_test.gather(test_img_ids) + filename = save_path / f'{pkl_prefix}_infos_test.pkl' + print(f'Waymo info test file is saved to {filename}') + mmengine.dump(waymo_infos_test, filename) def _create_reduced_point_cloud(data_path, diff --git a/tools/dataset_converters/update_infos_to_v2.py b/tools/dataset_converters/update_infos_to_v2.py index a779d6b6c7..a2ddbd0688 100644 --- a/tools/dataset_converters/update_infos_to_v2.py +++ b/tools/dataset_converters/update_infos_to_v2.py @@ -923,8 +923,8 @@ def update_waymo_infos(pkl_path, out_dir): base_img_path = Path(ori_info_dict['image']['image_path']).name for cam_idx, cam_key in enumerate(camera_types): - # temp_data_info['images'][cam_key]['timestamp'] = ori_info_dict[ - # 'timestamp'] + temp_data_info['images'][cam_key]['timestamp'] = ori_info_dict[ + 'timestamp'] temp_data_info['images'][cam_key]['img_path'] = base_img_path h, w = ori_info_dict['image']['image_shape'] @@ -934,16 +934,16 @@ def update_waymo_infos(pkl_path, out_dir): temp_data_info['images'][camera_types[0]]['width'] = w temp_data_info['lidar_points']['num_pts_feats'] = ori_info_dict[ 'point_cloud']['num_features'] - # temp_data_info['lidar_points']['timestamp'] = ori_info_dict[ - # 'timestamp'] + temp_data_info['lidar_points']['timestamp'] = ori_info_dict[ + 'timestamp'] velo_path = ori_info_dict['point_cloud'].get('velodyne_path') if velo_path is not None: temp_data_info['lidar_points']['lidar_path'] = Path(velo_path).name # TODO discuss the usage of Tr_velo_to_cam in lidar - # Trv2c = ori_info_dict['calib']['Tr_velo_to_cam'].astype(np.float32) + Trv2c = ori_info_dict['calib']['Tr_velo_to_cam'].astype(np.float32) - # temp_data_info['lidar_points']['Tr_velo_to_cam'] = Trv2c.tolist() + temp_data_info['lidar_points']['Tr_velo_to_cam'] = Trv2c.tolist() # for potential usage # temp_data_info['images']['R0_rect'] = ori_info_dict['calib'][ @@ -951,8 +951,7 @@ def update_waymo_infos(pkl_path, out_dir): # for the sweeps part: temp_data_info['timestamp'] = ori_info_dict['timestamp'] - temp_data_info['ego2global'] = ori_info_dict['pose'].astype( - np.float32).tolist() + temp_data_info['ego2global'] = ori_info_dict['pose'] for ori_sweep in ori_info_dict['sweeps']: # lidar sweeps @@ -961,22 +960,16 @@ def update_waymo_infos(pkl_path, out_dir): lidar_sweep['timestamp'] = ori_sweep['timestamp'] lidar_sweep['lidar_points']['lidar_path'] = Path( ori_sweep['velodyne_path']).name - # delete the num_pts_feats and lidar2ego - del lidar_sweep['lidar_points']['num_pts_feats'] - del lidar_sweep['lidar_points']['lidar2ego'] - # TODO: refactor image_sweeps info when we really need it. 
# image sweeps - # image_sweep = dict() - # image_sweep['sample_idx']= ori_sweep['image']['image_idx'] - # image_sweep = get_single_image_sweep(camera_types) - # image_sweep['ego2global'] = ori_sweep['pose'] - # image_sweep['timestamp'] = ori_sweep['timestamp'] - # img_path = Path(ori_sweep['image_path']).name - # for cam_idx, cam_key in enumerate(camera_types): - # image_sweep['images'][cam_key]['img_path'] = img_path + image_sweep = get_single_image_sweep(camera_types) + image_sweep['ego2global'] = ori_sweep['pose'] + image_sweep['timestamp'] = ori_sweep['timestamp'] + img_path = Path(ori_sweep['image_path']).name + for cam_idx, cam_key in enumerate(camera_types): + image_sweep['images'][cam_key]['img_path'] = img_path temp_data_info['lidar_sweeps'].append(lidar_sweep) - # temp_data_info['image_sweeps'].append(image_sweep) + temp_data_info['image_sweeps'].append(image_sweep) anns = ori_info_dict.get('annos', None) ignore_class_name = set() @@ -1072,7 +1065,6 @@ def update_waymo_infos(pkl_path, out_dir): temp_data_info, _ = clear_data_info_unused_keys(temp_data_info) converted_list.append(temp_data_info) - del data_list pkl_name = Path(pkl_path).name out_path = osp.join(out_dir, pkl_name) print(f'Writing to output file: {out_path}.') @@ -1111,8 +1103,8 @@ def generate_waymo_camera_instances(ori_info_dict, cam_keys): for cam_idx, cam_key in enumerate(cam_keys): annos = copy.deepcopy(ori_info_dict['cam_sync_annos']) - - annos = convert_annos(ori_info_dict, cam_idx) + if cam_idx != 0: + annos = convert_annos(ori_info_dict, cam_idx) ann_infos = get_kitti_style_2d_boxes( ori_info_dict, cam_idx, occluded=[0], annos=annos, dataset='waymo') diff --git a/tools/dataset_converters/waymo_converter.py b/tools/dataset_converters/waymo_converter.py index 10cc15fc40..704ce391e9 100644 --- a/tools/dataset_converters/waymo_converter.py +++ b/tools/dataset_converters/waymo_converter.py @@ -683,9 +683,9 @@ def create_ImageSets_img_ids(root_dir, splits): if not exists(save_dir): os.mkdir(save_dir) - idx_all = [[] for _ in range(4)] + idx_all = [[] for _ in splits] for i, split in enumerate(splits): - path = join(root_dir, splits[i], 'image_0') + path = join(root_dir, split, 'image_0') if not exists(path): RawNames = [] else: @@ -700,6 +700,8 @@ def create_ImageSets_img_ids(root_dir, splits): open(save_dir + 'train.txt', 'w').writelines(idx_all[0]) open(save_dir + 'val.txt', 'w').writelines(idx_all[1]) open(save_dir + 'trainval.txt', 'w').writelines(idx_all[0] + idx_all[1]) - open(save_dir + 'test.txt', 'w').writelines(idx_all[2]) - open(save_dir + 'test_cam_only.txt', 'w').writelines(idx_all[3]) + if idx_all[2]: + open(save_dir + 'test.txt', 'w').writelines(idx_all[2]) + if idx_all[3]: + open(save_dir + 'test_cam_only.txt', 'w').writelines(idx_all[3]) print('created txt files indicating what to collect in ', splits) From 83426854d89edcc08db42735a1068015145429df Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Tue, 12 Sep 2023 11:39:27 +0800 Subject: [PATCH 17/25] fix train iter --- convert_ckpt.py | 34 ----- mmdet3d/datasets/det3d_dataset.py | 4 +- mmdet3d/datasets/waymo_dataset.py | 58 ++++++++- .../models/dense_heads/centerpoint_head.py | 2 +- mmdet3d/structures/bbox_3d/base_box3d.py | 13 +- ...ecfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py | 30 ++--- projects/DSVT/dsvt/__init__.py | 4 +- projects/DSVT/dsvt/dsvt_head.py | 14 ++- projects/DSVT/dsvt/dynamic_pillar_vfe.py | 3 + projects/DSVT/dsvt/transforms_3d.py | 116 ++++++++++++++++++ tools/train.py | 10 ++ 11 files changed, 219 
insertions(+), 69 deletions(-)
 delete mode 100644 convert_ckpt.py
 create mode 100644 projects/DSVT/dsvt/transforms_3d.py

diff --git a/convert_ckpt.py b/convert_ckpt.py
deleted file mode 100644
index f5c3c9317e..0000000000
--- a/convert_ckpt.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# from mmengine.config import Config
-# cfg = Config.fromfile(path)
-# print(cfg)
-import torch
-
-mm3d_model = torch.load('checkpoints/dsvt_convert.pth')
-dsvt_model = dict()
-dsvt_model['model_state'] = dict()
-for k, v in mm3d_model.items():
-    if 'voxel_encoder' in k:
-        k = k.replace('voxel_encoder', 'vfe')
-    if 'middle_encoder' in k:
-        k = k.replace('middle_encoder', 'backbone_3d')
-    if 'backbone.' in k:
-        k = k.replace('backbone', 'backbone_2d')
-    if 'neck' in k:
-        k = k.replace('neck', 'backbone_2d')
-    if 'bbox_head.shared_conv' in k:
-        k = k.replace('bbox_head.shared_conv.conv', 'dense_head.shared_conv.0')
-        k = k.replace('bbox_head.shared_conv.bn', 'dense_head.shared_conv.1')
-    if 'bbox_head.task_heads' in k:
-        k = k.replace('bbox_head.task_heads', 'dense_head.heads_list')
-    if 'reg' in k:
-        k = k.replace('reg', 'center')
-    if 'height' in k:
-        k = k.replace('height', 'center_z')
-    if 'heatmap' in k:
-        k = k.replace('heatmap', 'hm')
-    if '0.conv' in k:
-        k = k.replace('0.conv', '0.0')
-    if '0.bn' in k:
-        k = k.replace('0.bn', '0.1')
-    dsvt_model['model_state'][k] = v
-torch.save(dsvt_model, 'dsvt_ckpt.pth')
diff --git a/mmdet3d/datasets/det3d_dataset.py b/mmdet3d/datasets/det3d_dataset.py
index 8e1570e794..c701a893fd 100644
--- a/mmdet3d/datasets/det3d_dataset.py
+++ b/mmdet3d/datasets/det3d_dataset.py
@@ -143,7 +143,9 @@ def __init__(self,
 
         # show statistics of this dataset
         print_log('-' * 30, 'current')
-        print_log(f'The length of the dataset: {len(self)}', 'current')
+        print_log(
+            f'The length of {"test" if self.test_mode else "training"} dataset: {len(self)}',  # noqa: E501
+            'current')
         content_show = [['category', 'number']]
         for label, num in enumerate(self.num_ins_per_cat):
             cat_name = self.metainfo['classes'][label]
diff --git a/mmdet3d/datasets/waymo_dataset.py b/mmdet3d/datasets/waymo_dataset.py
index f5a35f5af0..58093a355d 100644
--- a/mmdet3d/datasets/waymo_dataset.py
+++ b/mmdet3d/datasets/waymo_dataset.py
@@ -2,7 +2,10 @@
 import os.path as osp
 from typing import Callable, List, Union
 
+import mmengine
 import numpy as np
+from mmengine import print_log
+from mmengine.fileio import load
 
 from mmdet3d.registry import DATASETS
 from mmdet3d.structures import LiDARInstance3DBoxes
@@ -176,9 +179,58 @@ def parse_ann_info(self, info: dict) -> dict:
         return anns_results
 
     def load_data_list(self) -> List[dict]:
-        """Add the load interval."""
-        data_list = super().load_data_list()
-        data_list = data_list[::self.load_interval]
+        """Add the load interval.
+
+        Returns:
+            list[dict]: A list of annotations.
+        """  # noqa: E501
+        # `self.ann_file` denotes the absolute annotation file path if
+        # `self.root=None` or relative path if `self.root=/path/to/data/`.
+        annotations = load(self.ann_file)
+        if not isinstance(annotations, dict):
+            raise TypeError(f'The annotations loaded from annotation file '
+                            f'should be a dict, but got {type(annotations)}!')
+        if 'data_list' not in annotations or 'metainfo' not in annotations:
+            raise ValueError('Annotation must have data_list and metainfo '
+                             'keys')
+        metainfo = annotations['metainfo']
+        raw_data_list = annotations['data_list']
+        raw_data_list = raw_data_list[::self.load_interval]
+        if self.load_interval > 1:
+            print_log(
+                f'Sample size will be reduced to 1/{self.load_interval} of'
+                ' the original data sample',
+                logger='current')
+
+        # Meta information loaded from the annotation file will not override
+        # the existing meta information from `BaseDataset.METAINFO` and the
+        # `metainfo` argument of the constructor.
+        for k, v in metainfo.items():
+            self._metainfo.setdefault(k, v)
+
+        # load and parse data_infos.
+        data_list = []
+        for raw_data_info in mmengine.track_iter_progress(raw_data_list):
+            # parse raw data information to target format
+            data_info = self.parse_data_info(raw_data_info)
+            if isinstance(data_info, dict):
+                # For image tasks, `data_info` should contain the information
+                # of a single image, such as dict(img_path='xxx', width=360, ...)
+                data_list.append(data_info)
+            elif isinstance(data_info, list):
+                # For video tasks, `data_info` could contain image
+                # information of multiple frames, such as
+                # [dict(video_path='xxx', timestamps=...),
+                #  dict(video_path='xxx', timestamps=...)]
+                for item in data_info:
+                    if not isinstance(item, dict):
+                        raise TypeError('data_info must be list of dict, but '
+                                        f'got {type(item)}')
+                data_list.extend(data_info)
+            else:
+                raise TypeError('data_info should be a dict or list of dict, '
+                                f'but got {type(data_info)}')
+        return data_list
 
     def parse_data_info(self, info: dict) -> Union[dict, List[dict]]:
diff --git a/mmdet3d/models/dense_heads/centerpoint_head.py b/mmdet3d/models/dense_heads/centerpoint_head.py
index 12ba84234e..301db0eee0 100644
--- a/mmdet3d/models/dense_heads/centerpoint_head.py
+++ b/mmdet3d/models/dense_heads/centerpoint_head.py
@@ -101,7 +101,7 @@ def forward(self, x):
         Returns:
             dict[str: torch.Tensor]: contains the following keys:
-                -reg (torch.Tensor): 2D regression value with the 
+                -reg (torch.Tensor): 2D regression value with the
                     shape of [B, 2, H, W].
                 -height (torch.Tensor): Height value with the
                     shape of [B, 1, H, W].
diff --git a/mmdet3d/structures/bbox_3d/base_box3d.py b/mmdet3d/structures/bbox_3d/base_box3d.py
index 50b092c06e..7fb703c731 100644
--- a/mmdet3d/structures/bbox_3d/base_box3d.py
+++ b/mmdet3d/structures/bbox_3d/base_box3d.py
@@ -275,12 +275,13 @@ def in_range_3d(
             Tensor: A binary vector indicating whether each point is inside
                 the reference range.
""" - in_range_flags = ((self.tensor[:, 0] > box_range[0]) - & (self.tensor[:, 1] > box_range[1]) - & (self.tensor[:, 2] > box_range[2]) - & (self.tensor[:, 0] < box_range[3]) - & (self.tensor[:, 1] < box_range[4]) - & (self.tensor[:, 2] < box_range[5])) + gravity_center = self.gravity_center + in_range_flags = ((gravity_center[:, 0] > box_range[0]) + & (gravity_center[:, 1] > box_range[1]) + & (gravity_center[:, 2] > box_range[2]) + & (gravity_center[:, 0] < box_range[3]) + & (gravity_center[:, 1] < box_range[4]) + & (gravity_center[:, 2] < box_range[5])) return in_range_flags @abstractmethod diff --git a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py index abcb15bf26..8429a2dc6c 100644 --- a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py +++ b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py @@ -2,6 +2,7 @@ custom_imports = dict( imports=['projects.DSVT.dsvt'], allow_failed_imports=False) +# load_from = 'checkpoints/dsvt_init_mm3d.pth' voxel_size = [0.32, 0.32, 6] grid_size = [468, 468, 1] point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4.0] @@ -89,7 +90,7 @@ loss_cls=dict( type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), loss_bbox=dict(type='mmdet.L1Loss', reduction='mean', loss_weight=2.0), - loss_iou=dict(type='mmdet.L1Loss', reduction='none', loss_weight=1.0), + loss_iou=dict(type='mmdet.L1Loss', reduction='sum', loss_weight=1.0), loss_reg_iou=dict( type='mmdet3d.DIoU3DLoss', reduction='mean', loss_weight=2.0), norm_bbox=True), @@ -163,9 +164,9 @@ rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05], translation_std=[0.5, 0.5, 0.5]), - dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), + dict(type='DSVTPointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='DSVTObjectRangeFilter', point_cloud_range=point_cloud_range), + # dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict( type='Pack3DDetInputs', @@ -181,8 +182,11 @@ norm_intensity=True, norm_elongation=True, backend_args=backend_args), - dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), - dict(type='Pack3DDetInputs', keys=['points']) + dict(type='DSVTPointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='Pack3DDetInputs', + keys=['points'], + meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp']) ] dataset_type = 'WaymoDataset' @@ -237,17 +241,9 @@ waymo_bin_file='./data/waymo/waymo_format/gt.bin', backend_args=backend_args, convert_kitti_format=False) -# val_evaluator = dict( -# type='WaymoMetric', -# ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', -# waymo_bin_file='./data/waymo/waymo_format/gt.bin', -# data_root='./data/waymo/waymo_format', -# backend_args=backend_args, -# convert_kitti_format=False, -# idx2metainfo='./data/waymo/waymo_format/idx2metainfo.pkl') test_evaluator = val_evaluator -vis_backends = [dict(type='LocalVisBackend')] +vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')] visualizer = dict( type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') lr = 1e-5 @@ -295,7 +291,7 @@ ] # runtime settings -train_cfg = dict(by_epoch=True, max_epochs=12, val_interval=12) 
+train_cfg = dict(by_epoch=True, max_epochs=12, val_interval=1) # runtime settings val_cfg = dict() @@ -309,5 +305,5 @@ default_hooks = dict( logger=dict(type='LoggerHook', interval=50), - checkpoint=dict(type='CheckpointHook', interval=5)) + checkpoint=dict(type='CheckpointHook', interval=1)) custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=11)] diff --git a/projects/DSVT/dsvt/__init__.py b/projects/DSVT/dsvt/__init__.py index 4fff505fb3..ed602d6a8a 100644 --- a/projects/DSVT/dsvt/__init__.py +++ b/projects/DSVT/dsvt/__init__.py @@ -4,9 +4,11 @@ from .dynamic_pillar_vfe import DynamicPillarVFE3D from .map2bev import PointPillarsScatter3D from .res_second import ResSECOND +from .transforms_3d import DSVTObjectRangeFilter, DSVTPointsRangeFilter from .utils import DSVTBBoxCoder __all__ = [ 'DSVTCenterHead', 'DSVT', 'DSVTMiddleEncoder', 'DynamicPillarVFE3D', - 'PointPillarsScatter3D', 'ResSECOND', 'DSVTBBoxCoder' + 'PointPillarsScatter3D', 'ResSECOND', 'DSVTBBoxCoder', + 'DSVTObjectRangeFilter', 'DSVTPointsRangeFilter' ] diff --git a/projects/DSVT/dsvt/dsvt_head.py b/projects/DSVT/dsvt/dsvt_head.py index 2d7d8d03db..e6ff075305 100644 --- a/projects/DSVT/dsvt/dsvt_head.py +++ b/projects/DSVT/dsvt/dsvt_head.py @@ -161,6 +161,7 @@ def calc_iou_loss(self, iou_preds, batch_box_preds, mask, ind, gt_boxes): iou_target = iou_target * 2 - 1 # [0, 1] ==> [-1, 1] loss = self.loss_iou(selected_iou_preds.view(-1), iou_target) + loss = loss / torch.clamp(mask.sum(), min=1e-4) return loss def calc_iou_reg_loss(self, batch_box_preds, mask, ind, gt_boxes): @@ -222,10 +223,10 @@ def get_targets( # Transpose inds inds = list(map(list, zip(*inds))) inds = [torch.stack(inds_) for inds_ in inds] - # Transpose inds + # Transpose masks masks = list(map(list, zip(*masks))) masks = [torch.stack(masks_) for masks_ in masks] - # Transpose inds + # Transpose task_gt_bboxes task_gt_bboxes = list(map(list, zip(*task_gt_bboxes))) return heatmaps, anno_boxes, inds, masks, task_gt_bboxes @@ -358,8 +359,8 @@ def get_targets_single(self, anno_box[new_idx] = torch.cat([ center - torch.tensor([x, y], device=device), z.unsqueeze(0), box_dim, - torch.sin(rot).unsqueeze(0), - torch.cos(rot).unsqueeze(0) + torch.cos(rot).unsqueeze(0), + torch.sin(rot).unsqueeze(0) ]) heatmaps.append(heatmap) @@ -432,7 +433,7 @@ def loss_by_feat(self, preds_dicts: Tuple[List[dict]], loss_dict[f'task{task_id}.loss_iou'] = self.calc_iou_loss( iou_preds=preds_dict[0]['iou'], batch_box_preds=batch_box_preds_for_iou.clone().detach(), - mask=mask.all(dim=-1), + mask=masks[task_id], ind=ind, gt_boxes=task_gt_bboxes[task_id]) @@ -440,7 +441,7 @@ def loss_by_feat(self, preds_dicts: Tuple[List[dict]], loss_dict[f'task{task_id}.loss_reg_iou'] = \ self.calc_iou_reg_loss( batch_box_preds=batch_box_preds_for_iou, - mask=mask.all(dim=-1), + mask=masks[task_id], ind=ind, gt_boxes=task_gt_bboxes[task_id]) @@ -517,6 +518,7 @@ def predict_by_feat(self, preds_dicts: Tuple[List[dict]], else: batch_dim = preds_dict[0]['dim'] + # It's different from CenterHead batch_rotc = preds_dict[0]['rot'][:, 0].unsqueeze(1) batch_rots = preds_dict[0]['rot'][:, 1].unsqueeze(1) batch_iou = (preds_dict[0]['iou'] + diff --git a/projects/DSVT/dsvt/dynamic_pillar_vfe.py b/projects/DSVT/dsvt/dynamic_pillar_vfe.py index 97c75aaf00..6d21a69713 100644 --- a/projects/DSVT/dsvt/dynamic_pillar_vfe.py +++ b/projects/DSVT/dsvt/dynamic_pillar_vfe.py @@ -1,4 +1,5 @@ # modified from https://github.com/Haiyang-W/DSVT +import numpy as np import torch import torch.nn as nn import 
torch_scatter
@@ -76,6 +77,8 @@ def __init__(self, with_distance, use_absolute_xyz, use_norm, num_filters,
         self.voxel_x = voxel_size[0]
         self.voxel_y = voxel_size[1]
         self.voxel_z = voxel_size[2]
+        # TODO: remove it after the accuracy is aligned
+        point_cloud_range = np.array(point_cloud_range).astype(np.float32)
         self.x_offset = self.voxel_x / 2 + point_cloud_range[0]
         self.y_offset = self.voxel_y / 2 + point_cloud_range[1]
         self.z_offset = self.voxel_z / 2 + point_cloud_range[2]
diff --git a/projects/DSVT/dsvt/transforms_3d.py b/projects/DSVT/dsvt/transforms_3d.py
new file mode 100644
index 0000000000..03d2c38dd7
--- /dev/null
+++ b/projects/DSVT/dsvt/transforms_3d.py
@@ -0,0 +1,116 @@
+from typing import List
+
+import numpy as np
+from mmcv.transforms import BaseTransform
+
+from mmdet3d.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class DSVTObjectRangeFilter(BaseTransform):
+    """Filter objects by the range. It differs from `ObjectRangeFilter` by
+    using `in_range_3d` instead of `in_range_bev`.
+
+    Required Keys:
+
+    - gt_bboxes_3d
+
+    Modified Keys:
+
+    - gt_bboxes_3d
+
+    Args:
+        point_cloud_range (list[float]): Point cloud range.
+    """
+
+    def __init__(self, point_cloud_range: List[float]) -> None:
+        self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
+
+    def transform(self, input_dict: dict) -> dict:
+        """Transform function to filter objects by the range.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d'
+            keys are updated in the result dict.
+        """
+        gt_bboxes_3d = input_dict['gt_bboxes_3d']
+        gt_labels_3d = input_dict['gt_labels_3d']
+        mask = gt_bboxes_3d.in_range_3d(self.pcd_range)
+        gt_bboxes_3d = gt_bboxes_3d[mask]
+        # mask is a torch tensor but gt_labels_3d is still numpy array
+        # using mask to index gt_labels_3d will cause bug when
+        # len(gt_labels_3d) == 1, where mask=1 will be interpreted
+        # as gt_labels_3d[1] and cause out of index error
+        gt_labels_3d = gt_labels_3d[mask.numpy().astype(bool)]
+
+        # limit rad to [-pi, pi]
+        gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi)
+        input_dict['gt_bboxes_3d'] = gt_bboxes_3d
+        input_dict['gt_labels_3d'] = gt_labels_3d
+
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class DSVTPointsRangeFilter(BaseTransform):
+    """Filter points by the range. It differs from `PointsRangeFilter` by using
+    `in_range_bev` instead of `in_range_3d`.
+
+    Required Keys:
+
+    - points
+    - pts_instance_mask (optional)
+
+    Modified Keys:
+
+    - points
+    - pts_instance_mask (optional)
+
+    Args:
+        point_cloud_range (list[float]): Point cloud range.
+    """
+
+    def __init__(self, point_cloud_range: List[float]) -> None:
+        self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
+
+    def transform(self, input_dict: dict) -> dict:
+        """Transform function to filter points by the range.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after filtering, 'points', 'pts_instance_mask'
+            and 'pts_semantic_mask' keys are updated in the result dict.
+ """ + points = input_dict['points'] + points_mask = points.in_range_bev(self.pcd_range[[0, 1, 3, 4]]) + clean_points = points[points_mask] + input_dict['points'] = clean_points + points_mask = points_mask.numpy() + + pts_instance_mask = input_dict.get('pts_instance_mask', None) + pts_semantic_mask = input_dict.get('pts_semantic_mask', None) + + if pts_instance_mask is not None: + input_dict['pts_instance_mask'] = pts_instance_mask[points_mask] + + if pts_semantic_mask is not None: + input_dict['pts_semantic_mask'] = pts_semantic_mask[points_mask] + + return input_dict + + def __repr__(self) -> str: + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' + return repr_str diff --git a/tools/train.py b/tools/train.py index b2ced54b05..6b9c3b0842 100644 --- a/tools/train.py +++ b/tools/train.py @@ -21,6 +21,12 @@ def parse_args(): action='store_true', default=False, help='enable automatic-mixed-precision training') + parser.add_argument( + '--sync_bn', + choices=['none', 'torch', 'mmcv'], + default='none', + help='convert all BatchNorm layers in the model to SyncBatchNorm ' + '(SyncBN) or mmcv.ops.sync_bn.SyncBatchNorm (MMSyncBN) layers.') parser.add_argument( '--auto-scale-lr', action='store_true', @@ -98,6 +104,10 @@ def main(): cfg.optim_wrapper.type = 'AmpOptimWrapper' cfg.optim_wrapper.loss_scale = 'dynamic' + # convert BatchNorm layers + if args.sync_bn != 'none': + cfg.sync_bn = args.sync_bn + # enable automatically scaling LR if args.auto_scale_lr: if 'auto_scale_lr' in cfg and \ From 2c9b33392718d3bca4c7c3ead4b54c4509d3b191 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Fri, 15 Sep 2023 14:46:21 +0800 Subject: [PATCH 18/25] fix dis aug and model init --- ...ecfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py | 28 ++++---- projects/DSVT/dsvt/__init__.py | 3 +- projects/DSVT/dsvt/disable_aug_hook.py | 69 +++++++++++++++++++ projects/DSVT/dsvt/dsvt_head.py | 5 ++ 4 files changed, 91 insertions(+), 14 deletions(-) create mode 100644 projects/DSVT/dsvt/disable_aug_hook.py diff --git a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py index 8429a2dc6c..8220057a83 100644 --- a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py +++ b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py @@ -2,7 +2,7 @@ custom_imports = dict( imports=['projects.DSVT.dsvt'], allow_failed_imports=False) -# load_from = 'checkpoints/dsvt_init_mm3d.pth' +# load_from = 'checkpoints/dsvt_init.pth' voxel_size = [0.32, 0.32, 6] grid_size = [468, 468, 1] point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4.0] @@ -143,15 +143,8 @@ load_dim=6, use_dim=5, norm_intensity=True, + norm_elongation=True, backend_args=backend_args), - # Add this if using `MultiFrameDeformableDecoderRPN` - # dict( - # type='LoadPointsFromMultiSweeps', - # sweeps_num=9, - # load_dim=6, - # use_dim=[0, 1, 2, 3, 4], - # pad_empty_sweeps=True, - # remove_close=True), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='ObjectSample', db_sampler=db_sampler), dict( @@ -194,6 +187,7 @@ batch_size=1, num_workers=4, persistent_workers=True, + # sampler=dict(type='DefaultSampler', shuffle=False), sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( type=dataset_type, @@ -211,8 +205,8 @@ 
load_interval=5,
         backend_args=backend_args))
 val_dataloader = dict(
-    batch_size=1,
-    num_workers=1,
+    batch_size=4,
+    num_workers=4,
     persistent_workers=True,
     drop_last=False,
     sampler=dict(type='DefaultSampler', shuffle=False),
@@ -244,6 +238,7 @@
 test_evaluator = val_evaluator
 
 vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]
+# vis_backends = [dict(type='LocalVisBackend')]
 visualizer = dict(
     type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
 lr = 1e-5
@@ -251,7 +246,7 @@
 # max_norm=10 is better for SECOND
 optim_wrapper = dict(
     type='OptimWrapper',
-    optimizer=dict(type='Adam', lr=lr, weight_decay=0.05, betas=(0.9, 0.999)),
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.05, betas=(0.9, 0.99)),
     clip_grad=dict(max_norm=10, norm_type=2))
 # learning rate
 param_scheduler = [
@@ -306,4 +301,11 @@
 default_hooks = dict(
     logger=dict(type='LoggerHook', interval=50),
     checkpoint=dict(type='CheckpointHook', interval=1))
-custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=11)]
+custom_hooks = [
+    dict(
+        type='DisableAugHook',
+        disable_after_epoch=11,
+        disable_aug_list=[
+            'GlobalRotScaleTrans', 'RandomFlip3D', 'ObjectSample'
+        ])
+]
diff --git a/projects/DSVT/dsvt/__init__.py b/projects/DSVT/dsvt/__init__.py
index ed602d6a8a..58395863a4 100644
--- a/projects/DSVT/dsvt/__init__.py
+++ b/projects/DSVT/dsvt/__init__.py
@@ -1,3 +1,4 @@
+from .disable_aug_hook import DisableAugHook
 from .dsvt import DSVT
 from .dsvt_head import DSVTCenterHead
 from .dsvt_transformer import DSVTMiddleEncoder
@@ -10,5 +11,5 @@
 __all__ = [
     'DSVTCenterHead', 'DSVT', 'DSVTMiddleEncoder', 'DynamicPillarVFE3D',
     'PointPillarsScatter3D', 'ResSECOND', 'DSVTBBoxCoder',
-    'DSVTObjectRangeFilter', 'DSVTPointsRangeFilter'
+    'DSVTObjectRangeFilter', 'DSVTPointsRangeFilter', 'DisableAugHook'
 ]
diff --git a/projects/DSVT/dsvt/disable_aug_hook.py b/projects/DSVT/dsvt/disable_aug_hook.py
new file mode 100644
index 0000000000..5a4dff5fb2
--- /dev/null
+++ b/projects/DSVT/dsvt/disable_aug_hook.py
@@ -0,0 +1,69 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+from mmengine.dataset import BaseDataset
+from mmengine.hooks import Hook
+from mmengine.model import is_model_wrapper
+from mmengine.runner import Runner
+
+from mmdet3d.registry import HOOKS
+
+
+@HOOKS.register_module()
+class DisableAugHook(Hook):
+    """The hook of disabling augmentations during training.
+
+    Args:
+        disable_after_epoch (int): The epoch after which the listed
+            data augmentations are disabled in the training.
+            Defaults to 15.
+        disable_aug_list (list): The list of data augmentations to
+            disable in the training. Defaults to [].
+    """
+
+    def __init__(self,
+                 disable_after_epoch: int = 15,
+                 disable_aug_list: List = []):
+        self.disable_after_epoch = disable_after_epoch
+        self.disable_aug_list = disable_aug_list
+        self._restart_dataloader = False
+
+    def before_train_epoch(self, runner: Runner):
+        """Disable the listed augmentations once the target epoch is reached.
+
+        Args:
+            runner (Runner): The runner.
+ """ + epoch = runner.epoch + train_loader = runner.train_dataloader + model = runner.model + # TODO: refactor after mmengine using model wrapper + if is_model_wrapper(model): + model = model.module + if epoch == self.disable_after_epoch: + + dataset = runner.train_dataloader.dataset + # handle dataset wrapper + if not isinstance(dataset, BaseDataset): + dataset = dataset.dataset + new_transforms = [] + for transform in dataset.pipeline.transforms: # noqa: E501 + if transform.__class__.__name__ not in self.disable_aug_list: + new_transforms.append(transform) + else: + runner.logger.info( + f'Disable {transform.__class__.__name__}') + dataset.pipeline.transforms = new_transforms + # The dataset pipeline cannot be updated when persistent_workers + # is True, so we need to force the dataloader's multi-process + # restart. This is a very hacky approach. + if hasattr(train_loader, 'persistent_workers' + ) and train_loader.persistent_workers is True: + train_loader._DataLoader__initialized = False + train_loader._iterator = None + self._restart_dataloader = True + else: + # Once the restart is complete, we need to restore + # the initialization flag. + if self._restart_dataloader: + train_loader._DataLoader__initialized = True diff --git a/projects/DSVT/dsvt/dsvt_head.py b/projects/DSVT/dsvt/dsvt_head.py index e6ff075305..74bae83f2e 100644 --- a/projects/DSVT/dsvt/dsvt_head.py +++ b/projects/DSVT/dsvt/dsvt_head.py @@ -32,6 +32,11 @@ def __init__(self, self.loss_iou_reg = MODELS.build( loss_reg_iou) if loss_reg_iou is not None else None + def init_weights(self): + super().init_weights() + for task_head in self.task_heads: + task_head.init_weights() + def forward_single(self, x: Tensor) -> dict: """Forward function for CenterPoint. From 98247780baa74761c1b47dabe4cf79c57fe384c4 Mon Sep 17 00:00:00 2001 From: sunjihao1999 <578431509@qq.com> Date: Mon, 18 Sep 2023 13:28:08 +0800 Subject: [PATCH 19/25] train align --- mmdet3d/datasets/waymo_dataset.py | 2 +- mmdet3d/evaluation/metrics/waymo_metric.py | 4 ++-- .../models/dense_heads/centerpoint_head.py | 2 +- mmdet3d/models/necks/second_fpn.py | 18 +++++++++------ ...ecfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py | 6 +++-- projects/DSVT/dsvt/dsvt_head.py | 23 ++++++++++++++++--- projects/DSVT/dsvt/res_second.py | 13 +---------- projects/DSVT/dsvt/utils.py | 4 ++-- 8 files changed, 42 insertions(+), 30 deletions(-) diff --git a/mmdet3d/datasets/waymo_dataset.py b/mmdet3d/datasets/waymo_dataset.py index 58093a355d..f12462eaa8 100644 --- a/mmdet3d/datasets/waymo_dataset.py +++ b/mmdet3d/datasets/waymo_dataset.py @@ -210,7 +210,7 @@ def load_data_list(self) -> List[dict]: # load and parse data_infos. data_list = [] - for raw_data_info in mmengine.track_iter_progress(raw_data_list): + for raw_data_info in raw_data_list: # parse raw data information to target format data_info = self.parse_data_info(raw_data_info) if isinstance(data_info, dict): diff --git a/mmdet3d/evaluation/metrics/waymo_metric.py b/mmdet3d/evaluation/metrics/waymo_metric.py index fb8ac495e6..dc2f209b9a 100644 --- a/mmdet3d/evaluation/metrics/waymo_metric.py +++ b/mmdet3d/evaluation/metrics/waymo_metric.py @@ -36,7 +36,7 @@ class WaymoMetric(KittiMetric): prefix (str, optional): The prefix that will be added in the metric names to disambiguate homonymous metrics of different evaluators. If prefix is not provided in the argument, self.default_prefix will - be used instead. Defaults to None. + be used instead. Defaults to Waymo metric. 
format_only (bool): Format the output results without perform evaluation. It is useful when you want to format the result to a specific format and submit it to the test server. @@ -83,7 +83,7 @@ def __init__(self, metric: Union[str, List[str]] = 'mAP', pcd_limit_range: List[float] = [-85, -85, -5, 85, 85, 5], convert_kitti_format: bool = True, - prefix: Optional[str] = None, + prefix: Optional[str] = 'Waymo metric', format_only: bool = False, pklfile_prefix: Optional[str] = None, submission_prefix: Optional[str] = None, diff --git a/mmdet3d/models/dense_heads/centerpoint_head.py b/mmdet3d/models/dense_heads/centerpoint_head.py index 301db0eee0..c3fc187964 100644 --- a/mmdet3d/models/dense_heads/centerpoint_head.py +++ b/mmdet3d/models/dense_heads/centerpoint_head.py @@ -217,7 +217,7 @@ def forward(self, x): Returns: dict[str: torch.Tensor]: contains the following keys: - -reg (torch.Tensor): 2D regression value with the + -reg (torch.Tensor): 2D regression value with the shape of [B, 2, H, W]. -height (torch.Tensor): Height value with the shape of [B, 1, H, W]. diff --git a/mmdet3d/models/necks/second_fpn.py b/mmdet3d/models/necks/second_fpn.py index 90e57ec05c..e52a26ec4c 100644 --- a/mmdet3d/models/necks/second_fpn.py +++ b/mmdet3d/models/necks/second_fpn.py @@ -21,6 +21,10 @@ class SECONDFPN(BaseModule): upsample_cfg (dict): Config dict of upsample layers. conv_cfg (dict): Config dict of conv layers. use_conv_for_no_stride (bool): Whether to use conv when stride is 1. + init_cfg (dict or :obj:`ConfigDict` or list[dict or :obj:`ConfigDict`], + optional): Initialization config dict. Defaults to + [dict(type='Kaiming', layer='ConvTranspose2d'), + dict(type='Constant', layer='NaiveSyncBatchNorm2d', val=1.0)]. """ def __init__(self, @@ -31,7 +35,13 @@ def __init__(self, upsample_cfg=dict(type='deconv', bias=False), conv_cfg=dict(type='Conv2d', bias=False), use_conv_for_no_stride=False, - init_cfg=None): + init_cfg=[ + dict(type='Kaiming', layer='ConvTranspose2d'), + dict( + type='Constant', + layer='NaiveSyncBatchNorm2d', + val=1.0) + ]): # if for GroupNorm, # cfg is dict(type='GN', num_groups=num_groups, eps=1e-3, affine=True) super(SECONDFPN, self).__init__(init_cfg=init_cfg) @@ -64,12 +74,6 @@ def __init__(self, deblocks.append(deblock) self.deblocks = nn.ModuleList(deblocks) - if init_cfg is None: - self.init_cfg = [ - dict(type='Kaiming', layer='ConvTranspose2d'), - dict(type='Constant', layer='NaiveSyncBatchNorm2d', val=1.0) - ] - def forward(self, x): """Forward function. 
diff --git a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py index 8220057a83..9d0be465e8 100644 --- a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py +++ b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py @@ -133,6 +133,8 @@ coord_type='LIDAR', load_dim=6, use_dim=[0, 1, 2, 3, 4], + norm_intensity=True, + norm_elongation=True, backend_args=backend_args), backend_args=backend_args) @@ -192,7 +194,7 @@ dataset=dict( type=dataset_type, data_root=data_root, - ann_file='waymo_infos_train.pkl', + ann_file='waymo_wo_cam_ins_infos_train.pkl', data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'), pipeline=train_pipeline, modality=input_modality, @@ -214,7 +216,7 @@ type=dataset_type, data_root=data_root, data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'), - ann_file='waymo_infos_val.pkl', + ann_file='waymo_wo_cam_ins_infos_val.pkl', pipeline=test_pipeline, modality=input_modality, test_mode=True, diff --git a/projects/DSVT/dsvt/dsvt_head.py b/projects/DSVT/dsvt/dsvt_head.py index 74bae83f2e..808d3c3310 100644 --- a/projects/DSVT/dsvt/dsvt_head.py +++ b/projects/DSVT/dsvt/dsvt_head.py @@ -12,6 +12,10 @@ gaussian_radius) from mmdet3d.registry import MODELS from mmdet3d.structures import Det3DDataSample, xywhr2xyxyr +from mmengine.model import kaiming_init +import math +from torch.nn.init import constant_ +import torch.nn as nn @MODELS.register_module() @@ -33,9 +37,22 @@ def __init__(self, loss_reg_iou) if loss_reg_iou is not None else None def init_weights(self): - super().init_weights() - for task_head in self.task_heads: - task_head.init_weights() + kaiming_init( + self.shared_conv.conv, + a=math.sqrt(5), + mode='fan_in', + nonlinearity='leaky_relu', + distribution='uniform') + for head in self.task_heads[0].heads: + if head == 'heatmap': + constant_(self.task_heads[0].__getattr__(head)[-1].bias, + self.task_heads[0].init_bias) + else: + for m in self.task_heads[0].__getattr__(head).modules(): + if isinstance(m, nn.Conv2d): + kaiming_init( + m, mode='fan_in', nonlinearity='leaky_relu') + def forward_single(self, x: Tensor) -> dict: """Forward function for CenterPoint. diff --git a/projects/DSVT/dsvt/res_second.py b/projects/DSVT/dsvt/res_second.py index e1ddc1be6c..0755f895f9 100644 --- a/projects/DSVT/dsvt/res_second.py +++ b/projects/DSVT/dsvt/res_second.py @@ -78,8 +78,6 @@ class ResSECOND(BaseModule): out_channels (list[int]): Output channels for multi-scale feature maps. blocks_nums (list[int]): Number of blocks in each stage. layer_strides (list[int]): Strides of each stage. - norm_cfg (dict): Config dict of normalization layers. - conv_cfg (dict): Config dict of convolutional layers. 
""" def __init__(self, @@ -87,8 +85,7 @@ def __init__(self, out_channels: Sequence[int] = [128, 128, 256], blocks_nums: Sequence[int] = [1, 2, 2], layer_strides: Sequence[int] = [2, 2, 2], - init_cfg: OptMultiConfig = None, - pretrained: Optional[str] = None) -> None: + init_cfg: OptMultiConfig = None) -> None: super(ResSECOND, self).__init__(init_cfg=init_cfg) assert len(layer_strides) == len(blocks_nums) assert len(out_channels) == len(blocks_nums) @@ -108,14 +105,6 @@ def __init__(self, BasicResBlock(out_channels[i], out_channels[i])) blocks.append(nn.Sequential(*cur_layers)) self.blocks = nn.Sequential(*blocks) - assert not (init_cfg and pretrained), \ - 'init_cfg and pretrained cannot be setting at the same time' - if isinstance(pretrained, str): - warnings.warn('DeprecationWarning: pretrained is a deprecated, ' - 'please use "init_cfg" instead') - self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) - else: - self.init_cfg = dict(type='Kaiming', layer='Conv2d') def forward(self, x: Tensor) -> Tuple[Tensor, ...]: """Forward function. diff --git a/projects/DSVT/dsvt/utils.py b/projects/DSVT/dsvt/utils.py index 4d697d2dbb..7845b27f3f 100644 --- a/projects/DSVT/dsvt/utils.py +++ b/projects/DSVT/dsvt/utils.py @@ -9,7 +9,7 @@ from mmdet3d.models.task_modules import CenterPointBBoxCoder from mmdet3d.registry import MODELS, TASK_UTILS from .ops.ingroup_inds.ingroup_inds_op import ingroup_inds - +from mmengine import print_log get_inner_win_inds_cuda = ingroup_inds @@ -267,7 +267,7 @@ def decode(self, thresh_mask = final_scores > self.score_threshold if self.post_center_range is not None: - self.post_center_range = torch.tensor( + self.post_center_range = torch.as_tensor( self.post_center_range, device=heat.device) mask = (final_box_preds[..., :3] >= self.post_center_range[:3]).all(2) From 681423f9e48186a674c4bee7ef640824ec83774d Mon Sep 17 00:00:00 2001 From: sunjihao1999 <578431509@qq.com> Date: Mon, 18 Sep 2023 13:32:29 +0800 Subject: [PATCH 20/25] fix lint --- mmdet3d/datasets/waymo_dataset.py | 1 - mmdet3d/models/necks/second_fpn.py | 2 +- projects/DSVT/dsvt/dsvt_head.py | 9 ++++----- projects/DSVT/dsvt/res_second.py | 4 +--- projects/DSVT/dsvt/utils.py | 2 +- 5 files changed, 7 insertions(+), 11 deletions(-) diff --git a/mmdet3d/datasets/waymo_dataset.py b/mmdet3d/datasets/waymo_dataset.py index f12462eaa8..0d131164e9 100644 --- a/mmdet3d/datasets/waymo_dataset.py +++ b/mmdet3d/datasets/waymo_dataset.py @@ -2,7 +2,6 @@ import os.path as osp from typing import Callable, List, Union -import mmengine import numpy as np from mmengine import print_log from mmengine.fileio import load diff --git a/mmdet3d/models/necks/second_fpn.py b/mmdet3d/models/necks/second_fpn.py index e52a26ec4c..d4dc590c15 100644 --- a/mmdet3d/models/necks/second_fpn.py +++ b/mmdet3d/models/necks/second_fpn.py @@ -22,7 +22,7 @@ class SECONDFPN(BaseModule): conv_cfg (dict): Config dict of conv layers. use_conv_for_no_stride (bool): Whether to use conv when stride is 1. init_cfg (dict or :obj:`ConfigDict` or list[dict or :obj:`ConfigDict`], - optional): Initialization config dict. Defaults to + optional): Initialization config dict. Defaults to [dict(type='Kaiming', layer='ConvTranspose2d'), dict(type='Constant', layer='NaiveSyncBatchNorm2d', val=1.0)]. 
""" diff --git a/projects/DSVT/dsvt/dsvt_head.py b/projects/DSVT/dsvt/dsvt_head.py index 808d3c3310..650a585dfb 100644 --- a/projects/DSVT/dsvt/dsvt_head.py +++ b/projects/DSVT/dsvt/dsvt_head.py @@ -1,10 +1,14 @@ +import math from typing import Dict, List, Tuple import torch +import torch.nn as nn from mmcv.ops import boxes_iou3d from mmdet.models.utils import multi_apply +from mmengine.model import kaiming_init from mmengine.structures import InstanceData from torch import Tensor +from torch.nn.init import constant_ from mmdet3d.models import CenterHead from mmdet3d.models.layers import circle_nms, nms_bev @@ -12,10 +16,6 @@ gaussian_radius) from mmdet3d.registry import MODELS from mmdet3d.structures import Det3DDataSample, xywhr2xyxyr -from mmengine.model import kaiming_init -import math -from torch.nn.init import constant_ -import torch.nn as nn @MODELS.register_module() @@ -53,7 +53,6 @@ def init_weights(self): kaiming_init( m, mode='fan_in', nonlinearity='leaky_relu') - def forward_single(self, x: Tensor) -> dict: """Forward function for CenterPoint. diff --git a/projects/DSVT/dsvt/res_second.py b/projects/DSVT/dsvt/res_second.py index 0755f895f9..072c586181 100644 --- a/projects/DSVT/dsvt/res_second.py +++ b/projects/DSVT/dsvt/res_second.py @@ -1,7 +1,5 @@ # modified from https://github.com/Haiyang-W/DSVT - -import warnings -from typing import Optional, Sequence, Tuple +from typing import Sequence, Tuple from mmengine.model import BaseModule from torch import Tensor diff --git a/projects/DSVT/dsvt/utils.py b/projects/DSVT/dsvt/utils.py index 7845b27f3f..978f9154da 100644 --- a/projects/DSVT/dsvt/utils.py +++ b/projects/DSVT/dsvt/utils.py @@ -9,7 +9,7 @@ from mmdet3d.models.task_modules import CenterPointBBoxCoder from mmdet3d.registry import MODELS, TASK_UTILS from .ops.ingroup_inds.ingroup_inds_op import ingroup_inds -from mmengine import print_log + get_inner_win_inds_cuda = ingroup_inds From fd0825ab094e06e7c13769af8587430c6edb5354 Mon Sep 17 00:00:00 2001 From: sunjihao1999 <578431509@qq.com> Date: Mon, 18 Sep 2023 14:48:10 +0800 Subject: [PATCH 21/25] fix basepoints in_range_3d --- mmdet3d/structures/points/base_points.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mmdet3d/structures/points/base_points.py b/mmdet3d/structures/points/base_points.py index 188d20d270..4cb54ce895 100644 --- a/mmdet3d/structures/points/base_points.py +++ b/mmdet3d/structures/points/base_points.py @@ -247,10 +247,10 @@ def in_range_3d( """ in_range_flags = ((self.tensor[:, 0] > point_range[0]) & (self.tensor[:, 1] > point_range[1]) - # & (self.tensor[:, 2] > point_range[2]) + & (self.tensor[:, 2] > point_range[2]) & (self.tensor[:, 0] < point_range[3]) - & (self.tensor[:, 1] < point_range[4])) - # & (self.tensor[:, 2] < point_range[5])) + & (self.tensor[:, 1] < point_range[4]) + & (self.tensor[:, 2] < point_range[5])) return in_range_flags @property From 49a564ae619acd4f708e240f4bd68850c8624d33 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Wed, 27 Dec 2023 20:21:59 +0800 Subject: [PATCH 22/25] reset unused --- mmdet3d/datasets/convert_utils.py | 16 ++++++------- projects/DSVT/README.md | 14 +++++++---- ...ecfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py | 24 ++++++------------- tools/dataset_converters/kitti_converter.py | 7 ------ tools/dataset_converters/kitti_data_utils.py | 7 +----- 5 files changed, 25 insertions(+), 43 deletions(-) diff --git a/mmdet3d/datasets/convert_utils.py b/mmdet3d/datasets/convert_utils.py index 66dba29117..cb4d97e137 100644 
--- a/mmdet3d/datasets/convert_utils.py +++ b/mmdet3d/datasets/convert_utils.py @@ -10,7 +10,7 @@ from shapely.geometry import MultiPoint, box from shapely.geometry.polygon import Polygon -from mmdet3d.structures import Box3DMode, LiDARInstance3DBoxes, points_cam2img +from mmdet3d.structures import Box3DMode, CameraInstance3DBoxes, points_cam2img from mmdet3d.structures.ops import box_np_ops kitti_categories = ('Pedestrian', 'Cyclist', 'Car', 'Van', 'Truck', @@ -318,11 +318,8 @@ def get_kitti_style_2d_boxes(info: dict, def convert_annos(info: dict, cam_idx: int) -> dict: """Convert front-cam anns to i-th camera (KITTI-style info).""" rect = info['calib']['R0_rect'].astype(np.float32) - if cam_idx == 0: - lidar2cami = info['calib']['Tr_velo_to_cam'].astype(np.float32) - else: - lidar2cami = info['calib'][f'Tr_velo_to_cam{cam_idx}'].astype( - np.float32) + lidar2cam0 = info['calib']['Tr_velo_to_cam'].astype(np.float32) + lidar2cami = info['calib'][f'Tr_velo_to_cam{cam_idx}'].astype(np.float32) annos = info['annos'] converted_annos = copy.deepcopy(annos) loc = annos['location'] @@ -330,11 +327,12 @@ def convert_annos(info: dict, cam_idx: int) -> dict: rots = annos['rotation_y'] gt_bboxes_3d = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1).astype(np.float32) - # BC-breaking: gt_bboxes_3d is already in lidar coordinates + # convert gt_bboxes_3d to velodyne coordinates + gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ lidar2cam0), correct_yaw=True) # convert gt_bboxes_3d to cam coordinates - gt_bboxes_3d = LiDARInstance3DBoxes(gt_bboxes_3d).convert_to( + gt_bboxes_3d = gt_bboxes_3d.convert_to( Box3DMode.CAM, rect @ lidar2cami, correct_yaw=True).numpy() - converted_annos['location'] = gt_bboxes_3d[:, :3] converted_annos['dimensions'] = gt_bboxes_3d[:, 3:6] converted_annos['rotation_y'] = gt_bboxes_3d[:, 6] diff --git a/projects/DSVT/README.md b/projects/DSVT/README.md index a4b45b570d..c1d8cf2744 100644 --- a/projects/DSVT/README.md +++ b/projects/DSVT/README.md @@ -57,18 +57,24 @@ python tools/test.py projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1- ### Training commands -The support of training DSVT is on the way. 
+In MMDetection3D's root directory, run the following command to train the model:
+
+```bash
+tools/dist_train.sh projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py 8 --sync_bn torch
+```
 
 ## Results and models
 
 ### Waymo
 
-| Middle Encoder | Backbone | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | Mem (GB) | Inf time (fps) | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download |
-| :------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------: | :-----------: | :---------------------: | :-------------: | :----------: | :------: | :------------: | :----: | :-----: | :----: | :---------: | :------: |
-| [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | 5 | voxel (0.32) | ✓ | × | | | 75.2 | 72.2 | 68.9 | 66.1 | |
+| Middle Encoder | Backbone | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download |
+| :------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------: | :-----------: | :---------------------: | :-------------: | :----------: | :----: | :-----: | :----: | :---------: | :-------: |
+| [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | 5 | voxel (0.32) | ✓ | × | 75.5 | 72.4 | 69.2 | 66.3 | [log](<>) |
 
 **Note** that `ResSECOND` denotes that the base block in SECOND has residual layers.
 
+**Note** Regrettably, we are unable to provide the pre-trained model weights due to [Waymo Dataset License Agreement](https://waymo.com/open/terms/). However, we can provide the training logs.
+ ## Citation ```latex diff --git a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py index 9d0be465e8..bf56259183 100644 --- a/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py +++ b/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py @@ -6,7 +6,6 @@ voxel_size = [0.32, 0.32, 6] grid_size = [468, 468, 1] point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4.0] -# data_root = 'data/waymo_mini/kitti_format/' data_root = 'data/waymo/kitti_format/' class_names = ['Car', 'Pedestrian', 'Cyclist'] metainfo = dict(classes=class_names) @@ -194,7 +193,7 @@ dataset=dict( type=dataset_type, data_root=data_root, - ann_file='waymo_wo_cam_ins_infos_train.pkl', + ann_file='waymo_infos_train.pkl', data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'), pipeline=train_pipeline, modality=input_modality, @@ -216,7 +215,7 @@ type=dataset_type, data_root=data_root, data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'), - ann_file='waymo_wo_cam_ins_infos_val.pkl', + ann_file='waymo_infos_val.pkl', pipeline=test_pipeline, modality=input_modality, test_mode=True, @@ -225,32 +224,23 @@ backend_args=backend_args)) test_dataloader = val_dataloader -# val_evaluator = dict( -# type='WaymoMetric', -# ann_file='./data/waymo_mini/kitti_format/waymo_infos_val.pkl', -# waymo_bin_file='./data/waymo_mini/waymo_format/gt_mini.bin', -# backend_args=backend_args, -# convert_kitti_format=False) val_evaluator = dict( type='WaymoMetric', - ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl', waymo_bin_file='./data/waymo/waymo_format/gt.bin', - backend_args=backend_args, - convert_kitti_format=False) + result_prefix='./dsvt_pred') test_evaluator = val_evaluator -vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')] -# vis_backends = [dict(type='LocalVisBackend')] +# vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')] +vis_backends = [dict(type='LocalVisBackend')] visualizer = dict( type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# schedules lr = 1e-5 -# This schedule is mainly used by models on nuScenes dataset -# max_norm=10 is better for SECOND optim_wrapper = dict( type='OptimWrapper', optimizer=dict(type='AdamW', lr=lr, weight_decay=0.05, betas=(0.9, 0.99)), clip_grad=dict(max_norm=10, norm_type=2)) -# learning rate param_scheduler = [ dict( type='CosineAnnealingLR', diff --git a/tools/dataset_converters/kitti_converter.py b/tools/dataset_converters/kitti_converter.py index e904918f60..367cfd7ba9 100644 --- a/tools/dataset_converters/kitti_converter.py +++ b/tools/dataset_converters/kitti_converter.py @@ -5,7 +5,6 @@ import mmcv import mmengine import numpy as np -from mmengine import logging, print_log from nuscenes.utils.geometry_utils import view_points from mmdet3d.structures import points_cam2img @@ -250,12 +249,6 @@ def create_waymo_info_file(data_path, max_sweeps (int, optional): Max sweeps before the detection frame to be used. Default: 5. """ - print_log( - 'Deprecation Warning: related functions has been migrated to ' - '`Waymo2KITTI.create_waymo_info_file`. 
It will be removed in ' - 'the future!', - logger='current', - level=logging.WARNING) imageset_folder = Path(data_path) / 'ImageSets' train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt')) val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt')) diff --git a/tools/dataset_converters/kitti_data_utils.py b/tools/dataset_converters/kitti_data_utils.py index b0e90bd8d3..64c3bc415b 100644 --- a/tools/dataset_converters/kitti_data_utils.py +++ b/tools/dataset_converters/kitti_data_utils.py @@ -148,14 +148,10 @@ def get_label_anno(label_path): for x in content]).reshape(-1, 3) annotations['rotation_y'] = np.array([float(x[14]) for x in content]).reshape(-1) - if len(content) != 0 and len(content[0]) >= 16: # have score + if len(content) != 0 and len(content[0]) == 16: # have score annotations['score'] = np.array([float(x[15]) for x in content]) else: annotations['score'] = np.zeros((annotations['bbox'].shape[0], )) - # have num_lidar_points_in_box, given in waymo - if len(content) != 0 and len(content[0]) == 17: - annotations['num_points_in_gt'] = np.array( - [int(x[16]) for x in content]) index = list(range(num_objects)) + [-1] * (num_gt - num_objects) annotations['index'] = np.array(index, dtype=np.int32) annotations['group_ids'] = np.arange(num_gt, dtype=np.int32) @@ -556,7 +552,6 @@ def gather(self, image_ids): image_infos = mmengine.track_parallel_progress(self.gather_single, image_ids, self.num_worker) - # image_infos = mmengine.track_progress(self.gather_single, image_ids) return list(image_infos) From 6d56789d7a22a0e63eebdb9b3c7add3da0b56096 Mon Sep 17 00:00:00 2001 From: sunjiahao1999 <578431509@qq.com> Date: Thu, 28 Dec 2023 16:22:02 +0800 Subject: [PATCH 23/25] resolve review --- projects/DSVT/README.md | 6 +++--- ...second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py | 8 +++----- projects/DSVT/dsvt/__init__.py | 4 ++-- projects/DSVT/dsvt/dsvt_head.py | 7 ++++--- projects/DSVT/dsvt/dynamic_pillar_vfe.py | 1 - projects/DSVT/dsvt/res_second.py | 2 ++ projects/DSVT/dsvt/transforms_3d.py | 4 ++-- projects/DSVT/dsvt/utils.py | 11 ++++++----- 8 files changed, 22 insertions(+), 21 deletions(-) diff --git a/projects/DSVT/README.md b/projects/DSVT/README.md index c1d8cf2744..bb1b008236 100644 --- a/projects/DSVT/README.md +++ b/projects/DSVT/README.md @@ -67,9 +67,9 @@ tools/dist_train.sh projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-c ### Waymo -| Middle Encoder | Backbone | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download | -| :------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------: | :-----------: | :---------------------: | :-------------: | :----------: | :----: | :-----: | :----: | :---------: | :-------: | -| [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | 5 | voxel (0.32) | ✓ | × | 75.5 | 72.4 | 69.2 | 66.3 | [log](<>) | +| Middle Encoder | Backbone | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download | +| :------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------: | :-----------: | :---------------------: | :-------------: | 
:----------: | :----: | :-----: | :----: | :---------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | 5 | voxel (0.32) | ✓ | × | 75.5 | 72.4 | 69.2 | 66.3 | \[log\](<>) |
diff --git a/projects/DSVT/dsvt/transforms_3d.py b/projects/DSVT/dsvt/transforms_3d.py
 
 @TRANSFORMS.register_module()
-class DSVTPointsRangeFilter(BaseTransform):
+class PointsRangeFilter3D(BaseTransform):
     """Filter points by the range. It differs from `PointsRangeFilter` by using
     `in_range_bev` instead of `in_range_3d`.
diff --git a/projects/DSVT/dsvt/utils.py b/projects/DSVT/dsvt/utils.py
index 978f9154da..d8b5d7dfa1 100644
--- a/projects/DSVT/dsvt/utils.py
+++ b/projects/DSVT/dsvt/utils.py
@@ -313,13 +313,13 @@ def center_to_corner2d(center, dim):
 @weighted_loss
 def diou3d_loss(pred_boxes, gt_boxes, eps: float = 1e-7):
     """
-    https://github.com/agent-sgs/PillarNet/blob/master/det3d/core/utils/center_utils.py # noqa
+    modified from https://github.com/agent-sgs/PillarNet/blob/master/det3d/core/utils/center_utils.py # noqa
     Args:
         pred_boxes (N, 7):
         gt_boxes (N, 7):
 
     Returns:
-        _type_: _description_
+        Tensor: Distance-IoU Loss.
 
     """
     assert pred_boxes.shape[0] == gt_boxes.shape[0]
@@ -371,14 +371,15 @@ def diou3d_loss(pred_boxes, gt_boxes, eps: float = 1e-7):
 @MODELS.register_module()
 class DIoU3DLoss(nn.Module):
     r"""3D bboxes Implementation of `Distance-IoU Loss: Faster and Better
-    Learning for Bounding Box Regression https://arxiv.org/abs/1911.08287`_.
+    Learning for Bounding Box Regression <https://arxiv.org/abs/1911.08287>`.
 
     Code is modified from https://github.com/Zzh-tju/DIoU.
 
     Args:
-        eps (float): Epsilon to avoid log(0).
+        eps (float): Epsilon to avoid log(0). Defaults to 1e-6.
         reduction (str): Options are "none", "mean" and "sum".
+            Defaults to "mean".
-        loss_weight (float): Weight of loss.
+        loss_weight (float): Weight of loss. Defaults to 1.0.
     """
 
     def __init__(self,

From fc93172d4dd3dd0ee216d18d8293b03c8f8433b3 Mon Sep 17 00:00:00 2001
From: sjh <578431509@qq.com>
Date: Thu, 28 Dec 2023 20:59:58 +0800
Subject: [PATCH 24/25] resolve review 2

---
 projects/DSVT/README.md                                     | 6 ++++--
 ...2_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py | 2 --
 projects/DSVT/dsvt/utils.py                                 | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/projects/DSVT/README.md b/projects/DSVT/README.md
index bb1b008236..d042c7c374 100644
--- a/projects/DSVT/README.md
+++ b/projects/DSVT/README.md
@@ -71,9 +71,11 @@ tools/dist_train.sh projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-c
 | :------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------: | :-----------: | :---------------------: | :-------------: | :----------: | :----: | :-----: | :----: | :---------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------: |
 | [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | 5 | voxel (0.32) | ✓ | × | 75.5 | 72.4 | 69.2 | 66.3 | \[log\](<>) |
diff --git a/projects/DSVT/dsvt/utils.py b/projects/DSVT/dsvt/utils.py
-    Learning for Bounding Box Regression <https://arxiv.org/abs/1911.08287>`.
+    Learning for Bounding Box Regression <https://arxiv.org/abs/1911.08287>`_.
 
     Code is modified from https://github.com/Zzh-tju/DIoU.
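
Since patch 24 only touches the `DIoU3DLoss` docstring, it is worth making the quantity it documents concrete. Below is a self-contained 2D, axis-aligned sketch of the Distance-IoU loss from the cited paper (Zheng et al.); it is illustrative only and is not the project's rotated, 7-DoF `diou3d_loss`:

```python
import torch


def diou_loss_2d(pred: torch.Tensor, gt: torch.Tensor,
                 eps: float = 1e-7) -> torch.Tensor:
    """pred, gt: (N, 4) axis-aligned boxes as (x1, y1, x2, y2)."""
    # plain IoU
    lt = torch.max(pred[:, :2], gt[:, :2])
    rb = torch.min(pred[:, 2:], gt[:, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[:, 0] * wh[:, 1]
    area_pred = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1])
    area_gt = (gt[:, 2] - gt[:, 0]) * (gt[:, 3] - gt[:, 1])
    iou = inter / (area_pred + area_gt - inter + eps)

    # squared center distance, normalised by the squared diagonal of the
    # smallest box enclosing both inputs
    ctr_pred = (pred[:, :2] + pred[:, 2:]) / 2
    ctr_gt = (gt[:, :2] + gt[:, 2:]) / 2
    rho2 = ((ctr_pred - ctr_gt)**2).sum(dim=1)
    enclose_lt = torch.min(pred[:, :2], gt[:, :2])
    enclose_rb = torch.max(pred[:, 2:], gt[:, 2:])
    c2 = ((enclose_rb - enclose_lt)**2).sum(dim=1) + eps

    # DIoU = IoU - rho^2 / c^2, so the loss is 1 - DIoU
    return 1 - iou + rho2 / c2


pred = torch.tensor([[0., 0., 2., 2.]])
gt = torch.tensor([[1., 1., 3., 3.]])
print(diou_loss_2d(pred, gt))  # ~0.97: 1 - IoU plus the center penalty
```

For the pair above, IoU is 1/7, the squared center offset is 2, and the squared enclosing-box diagonal is 18, so the loss lands slightly above 1 - IoU; that extra term is exactly the center-distance penalty that a plain IoU loss lacks, and it is what `loss_reg_iou` adds on top of the L1 box regression in the config.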
From 7bbaeb7b3e0da79f14ffbea0a92d2b9e3c234860 Mon Sep 17 00:00:00 2001
From: sjh <578431509@qq.com>
Date: Thu, 28 Dec 2023 21:09:16 +0800
Subject: [PATCH 25/25] fix doc

---
 projects/DSVT/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/projects/DSVT/README.md b/projects/DSVT/README.md
index d042c7c374..d60e49abe5 100644
--- a/projects/DSVT/README.md
+++ b/projects/DSVT/README.md
@@ -73,9 +73,9 @@ tools/dist_train.sh projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-c
 
 **Note**:
 
-\- that `ResSECOND` denotes that the base block in SECOND has residual layers.
+- `ResSECOND` denotes that the base block in SECOND has residual layers.
 
-\- Regrettably, we are unable to provide the pre-trained model weights due to [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above.
+- Regrettably, we are unable to provide the pre-trained model weights due to [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above.
 
 ## Citation
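
One easy-to-miss detail in the training patches above is the rotation encoding: the patched `get_targets_single` packs `cos(rot)` before `sin(rot)` into the regression target (the reverse of `CenterHead`'s sin-then-cos order), and `predict_by_feat` correspondingly reads channel 0 as the cosine. A tiny round-trip check, purely illustrative and mirroring the convention visible in the diffs, shows why the two sides must agree:

```python
import torch

yaw = torch.tensor(0.3)
# encode as in the patched `get_targets_single`: cos first, then sin
rot_target = torch.stack([torch.cos(yaw), torch.sin(yaw)])
# decode as in `predict_by_feat`: channel 0 -> rotc, channel 1 -> rots
rotc, rots = rot_target[0], rot_target[1]
assert torch.allclose(torch.atan2(rots, rotc), yaw)
# with CenterHead's sin-first packing, the same decoder would recover
# atan2(cos(yaw), sin(yaw)) = pi/2 - yaw instead of yaw
```

Training targets and the box decoder therefore had to be changed together; swapping only one of the two would silently rotate every predicted box by a yaw-dependent error rather than raise any exception.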