diff --git a/README.md b/README.md
index 1f117ff..4b304ca 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # chainer-pose-proposal-net
-- This is an implementation of [Pose Proposal Networks](http://openaccess.thecvf.com/content_ECCV_2018/papers/Sekii_Pose_Proposal_Networks_ECCV_2018_paper.pdf) with Chainer including training and prediction tools.
+- This is an (unofficial) implementation of [Pose Proposal Networks](http://openaccess.thecvf.com/content_ECCV_2018/papers/Sekii_Pose_Proposal_Networks_ECCV_2018_paper.pdf) with Chainer including training and prediction tools.
 # License
@@ -19,31 +19,43 @@ This project is licensed under the terms of the [license](LICENSE).
 ### MPII
 - If you train with COCO dataset you can skip.
-- Access [MPII Human Pose Dataset](http://human-pose.mpi-inf.mpg.de/) and jump to `Download` page. Then download and extract both `Images (12.9 GB)` and `Annotations (12.5 MB)`.
+- Access [MPII Human Pose Dataset](http://human-pose.mpi-inf.mpg.de/) and jump to the `Download` page. Then download and extract both `Images (12.9 GB)` and `Annotations (12.5 MB)` to `~/work/dataset/mpii_dataset`, for example.
 #### Create `mpii.json`
 We need to decode `mpii_human_pose_v1_u12_1.mat` to generate `mpii.json`. This will be used for training and for evaluating the MPII test dataset.
 ```
-$ sudo docker run --rm -v $(pwd):/work -v path/to/dataset:/data -w /work idein/chainer:4.5.0 python3 convert_mpii_dataset.py /data/mpii_human_pose_v1_u12_2/mpii_human_pose_v1_u12_1.mat /data/mpii.json
+$ sudo docker run --rm -v $(pwd):/work -v path/to/dataset:/mpii_dataset -w /work idein/chainer:4.5.0 python3 convert_mpii_dataset.py /mpii_dataset/mpii_human_pose_v1_u12_2/mpii_human_pose_v1_u12_1.mat /mpii_dataset/mpii.json
 ```
-It will generate `mpii.json` at `path/to/dataset` where is the root directory of MPII dataset. For those who hesitate to use Docker, you may edit `config.ini` as necessary.
+It will generate `mpii.json` at `path/to/dataset`, where `path/to/dataset` is the root directory of the MPII dataset, for example `~/work/dataset/mpii_dataset`. If you would rather not use Docker, edit `config.ini` as necessary.
 ### COCO
 - If you train with MPII dataset you can skip.
-- Access [COCO dataset](http://cocodataset.org/) and jump to `Dataset` -> `download`. Then download and extract `2017 Train images [118K/18GB]`, `2017 Val images [5K/1GB]` and `2017 Train/Val annotations [241MB]`.
+- Access [COCO dataset](http://cocodataset.org/) and jump to `Dataset` -> `download`. Then download and extract `2017 Train images [118K/18GB]`, `2017 Val images [5K/1GB]` and `2017 Train/Val annotations [241MB]` to `~/work/dataset/coco_dataset`, for example.
 ## Running Training Scripts
+OK, let's begin!
+
 ```
-$ sudo docker run --rm -v $(pwd):/work -v path/to/dataset:/data -w /work idein/chainer:4.5.0 python3 train.py
+$ cat begin_train.sh
+cat config.ini
+docker run --rm \
+-v $(pwd):/work \
+-v ~/work/dataset/mpii_dataset:/mpii_dataset \
+-v ~/work/dataset/coco_dataset:/coco_dataset \
+--name ppn_idein \
+-w /work \
+idein/chainer:5.1.0 \
+python3 train.py
+$ sudo bash begin_train.sh
 ```
-
+
 - Optional argument `--runtime=nvidia` may be required in some environments.
-- This will train a model the base network is MobileNetV2 with MPII dataset located in `path/to/dataset` on host machine.
+- It will train a model whose base network is MobileNetV2, using the MPII dataset located at `path/to/dataset` on the host machine.
 - If we would like to train with COCO dataset, edit a part of `config.ini` as follows:
 before
@@ -81,10 +93,22 @@ model_name = resnet18
 # Prediction
-- Very easy, all we have to do is:
+- Very easy: all we have to do is, for example:
+
+```
+$ sudo bash run_predict.sh ./trained
+```
+
+- If you would like to configure the parameters or hide the bounding boxes, edit the `[predict]` part of `config.ini` as follows:
 ```
-$ sudo docker run --rm -v $(pwd):/work -v path/to/dataset:/data -w /work idein/chainer:4.5.0 python3 predict.py
+[predict]
+# If `False` is set, hide bbox of annotation other than human instance.
+visbbox = True
+# detection_thresh
+detection_thresh = 0.15
+# ignore human its num of keypoints is less than min_num_keypoints
+min_num_keypoints= 1
 ```
 # Demo: Realtime Pose Estimation
@@ -102,6 +126,10 @@ docker build -t ppn .
 $ sudo bash build.sh
 ```
+Here is a result of ResNet18 trained with COCO, running on a laptop PC.
+
+![](readmedata/cpu-example.gif)
+
 ## Run video.py
 - Set your USB camera so that OpenCV can recognize it.
@@ -109,17 +137,17 @@ $ sudo bash build.sh
 - Run `video.py`
 ```
-$ python video.py
+$ python video.py ./trained
 ```
 or
 ```
-$ sudo bash run_video.sh
+$ sudo bash run_video.sh ./trained
 ```
 ## High Performance Version
-- To use feature of [Static Subgraph Optimizations](http://docs.chainer.org/en/stable/reference/static_graph_design.html) to accelerate inference speed, we should install Chainer 5.0.0 and CuPy 5.0.0 .
+- To use the [Static Subgraph Optimizations](http://docs.chainer.org/en/stable/reference/static_graph_design.html) feature to accelerate inference speed, install Chainer 5.y.z and CuPy 5.y.z, e.g. 5.0.0 or 5.1.0.
 - Prepare a high performance USB camera that captures more than 60 FPS.
 - Run `high_speed.py` instead of `video.py`
 - Do not fall off your chair in surprise :D.
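Before the per-file changes below, here is a minimal sketch of how the new `[predict]` options are consumed. It simply mirrors the `load_config` and `get_humans_by_feature` changes in `predict.py` further down; the helper name `load_predict_params` and the `./trained` path are only illustrative.

```
import configparser
import os


def load_predict_params(model_dir):
    # predict.load_config reads the training-time copy of config.ini
    # from <model_dir>/src/config.ini.
    config = configparser.ConfigParser()
    config.read(os.path.join(model_dir, 'src', 'config.ini'), 'UTF-8')
    return {
        'detection_thresh': config.getfloat('predict', 'detection_thresh'),
        'min_num_keypoints': config.getint('predict', 'min_num_keypoints'),
        'visbbox': config.getboolean('predict', 'visbbox'),
    }


params = load_predict_params('./trained')
# The values are then passed straight through, e.g.
# humans = get_humans_by_feature(model, feature_map,
#                                params['detection_thresh'],
#                                params['min_num_keypoints'])
```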
diff --git a/augment.py b/augment.py index fd9e742..a192a7f 100644 --- a/augment.py +++ b/augment.py @@ -1,43 +1,427 @@ +import random + import numpy as np from scipy import ndimage +import chainercv.transforms as transforms +from chainercv.links.model.ssd.transforms import random_distort +import PIL +from PIL import ImageChops, ImageOps, ImageFilter, ImageEnhance -def rotate_point(point_yx, degree, center_yx): - offset_x, offset_y = center_yx +def rotate_point(point_yx, angle, center_yx): + offset_y, offset_x = center_yx shift = point_yx - center_yx shift_y, shift_x = shift[:, 0], shift[:, 1] - cos_rad = np.cos(np.deg2rad(degree)) - sin_rad = np.sin(np.deg2rad(degree)) + cos_rad = np.cos(np.deg2rad(angle)) + sin_rad = np.sin(np.deg2rad(angle)) qx = offset_x + cos_rad * shift_x + sin_rad * shift_y qy = offset_y - sin_rad * shift_x + cos_rad * shift_y return np.array([qy, qx]).transpose() -def rot_image(image, degree): - # CHW => HWC - image = image.transpose(1, 2, 0) - rot = ndimage.rotate(image, degree, reshape=False) - # HWC => CHW - rot = rot.transpose(2, 0, 1) +def rotate_image(image, angle): + rot = ndimage.rotate(image, angle, axes=(2, 1), reshape=False) + # disable image collapse + rot = np.clip(rot, 0, 255) return rot -def rotate(image, keypoints, bbox, degree): +def random_rotate(image, keypoints, bbox): + angle = np.random.uniform(-40, 40) + param = {} + param['angle'] = angle new_keypoints = [] center_yx = np.array(image.shape[1:]) / 2 for points in keypoints: rot_points = rotate_point(np.array(points), - degree, + angle, center_yx) new_keypoints.append(rot_points) new_bbox = [] for x, y, w, h in bbox: - points = np.array([[y + h / 2, x + w / 2]]) - ry, rx = rotate_point(points, - degree, - center_yx)[0] - new_bbox.append([rx - w / 2, ry - h / 2, w, h]) - - rot = rot_image(image, degree) - return rot, new_keypoints, new_bbox + + points = np.array( + [ + [y, x], + [y, x + w], + [y + h, x], + [y + h, x + w] + ] + ) + + rot_points = rotate_point( + points, + angle, + center_yx + ) + xmax = np.max(rot_points[:, 1]) + ymax = np.max(rot_points[:, 0]) + xmin = np.min(rot_points[:, 1]) + ymin = np.min(rot_points[:, 0]) + # x,y,w,h + new_bbox.append([xmin, ymin, xmax - xmin, ymax - ymin]) + + image = rotate_image(image, angle) + return image, new_keypoints, new_bbox, param + + +def spot_light(pil_img): + w, h = pil_img.size + effect_img = np.zeros((h, w, 3)) + scale_w = random.choice([5, 6, 7, 8, 9]) + scale_h = random.choice([5, 6, 7, 8, 9]) + x = random.choice(range(w // scale_w, w - w // scale_w)) + y = random.choice(range(h // scale_h, h - h // scale_h)) + light = random.choice(range(128, 220)) + effect_img[y - h // scale_h:y + h // scale_h, x - w // scale_w:x + w // scale_w] = light + effect_img = PIL.Image.fromarray(effect_img.astype(np.uint8)) + return ImageChops.add(pil_img, effect_img) + + +def blend_alpha(pil_img, direction='left'): + w, h = pil_img.size + effect_img = np.zeros((h, w, 3)) + if direction == 'right': + for x in range(w): + effect_img[:, x] = x * 255 / w + elif direction == 'left': + for x in range(w): + effect_img[:, x] = (w - x) * 255 / w + elif direction == 'up': + for y in range(h): + effect_img[y, :] = (h - y) * 255 / h + elif direction == 'down': + for y in range(h): + effect_img[y, :] = y * 255 / h + else: + raise Exception("invalid argument direction is 'right','left','up','down' actual {}".format(direction)) + effect_img = PIL.Image.fromarray(effect_img.astype(np.uint8)) + return PIL.Image.blend(pil_img, effect_img, 0.5) + + +def chop_image(pil_img, 
direction='left', op='add'): + w, h = pil_img.size + effect_img = np.zeros((h, w, 3)) + if direction == 'right': + for x in range(w): + effect_img[:, x] = x * 255 / w + elif direction == 'left': + for x in range(w): + effect_img[:, x] = (w - x) * 255 / w + elif direction == 'up': + for y in range(h): + effect_img[y, :] = (h - y) * 255 / h + elif direction == 'down': + for y in range(h): + effect_img[y, :] = y * 255 / h + else: + raise Exception("invalid argument direction. It should be 'right','left','up','down' actual {}".format(direction)) + effect_img = PIL.Image.fromarray(effect_img.astype(np.uint8)) + if op == 'add': + operation = ImageChops.add + elif op == 'subtract': + operation = ImageChops.subtract + elif op == 'multiply': + operation = ImageChops.multiply + elif op == 'screen': + operation = ImageChops.screen + elif op == 'lighter': + operation = ImageChops.lighter + elif op == 'darker': + operation = ImageChops.darker + else: + ops = ['add', 'subtract', 'multiply', 'screen', 'lighter', 'darker'] + raise Exception("invalid argument op. {} actual {}".format(ops, direction)) + return operation(pil_img, effect_img) + + +def filter_image(pil_img): + method = random.choice(['gaussian', 'blur', 'sharpen']) + if method == 'gaussian': + return pil_img.filter(ImageFilter.GaussianBlur(random.choice([0.5, 1.0, 1.5]))) + if method == 'blur': + return pil_img.filter(ImageFilter.BLUR) + if method == 'sharpen': + return pil_img.filter(ImageFilter.SHARPEN) + + +def random_process_by_PIL(image): + # convert CHW -> HWC -> PIL.Image + pil_img = PIL.Image.fromarray(image.transpose(1, 2, 0).astype(np.uint8)) + + method = np.random.choice( + ['equalize', 'spot_light', 'chop', 'blend'], + p=[0.15, 0.15, 0.35, 0.35] + ) + + param = {'pil': method, 'filter': False} + if method == 'equalize': + pil_img = ImageOps.equalize(pil_img) + if method == 'spot_light': + pil_img = spot_light(pil_img) + if method == 'chop': + direction = random.choice(['left', 'right', 'up', 'down']) + op = random.choice(['add', 'subtract', 'multiply', 'screen', 'lighter', 'darker']) + pil_img = chop_image(pil_img, direction, op) + if method == 'blend': + direction = random.choice(['left', 'right', 'up', 'down']) + pil_img = blend_alpha(pil_img, direction) + + if np.random.choice([True, False], p=[0.1, 0.9]): + pil_img = filter_image(pil_img) + param['filter'] = True + # back to CHW + image = np.asarray(pil_img).transpose(2, 0, 1).astype(np.float32) + return image, param + + +def augment_image(image, dataset_type): + """color augmentation""" + param = {} + + if dataset_type == 'mpii': + method = np.random.choice( + ['random_distort', 'pil'], + p=[1.0, 0.0], + ) + elif dataset_type == 'coco': + method = np.random.choice( + ['random_distort', 'pil'], + p=[0.5, 0.5], + ) + + if method == 'random_distort': + param['method'] = method + # color augmentation provided by ChainerCV + ret = random_distort(image, contrast_low=0.3, contrast_high=2) + return ret, param + if method == 'pil': + ret, param = random_process_by_PIL(image) + param['method'] = method + return ret, param + + +def random_flip(image, keypoints, bbox, is_labeled, is_visible, flip_indices): + """ + random x_flip + Note that if image is flipped, `flip_indices` translate elements. + e.g. left_shoulder -> right_shoulder. 
+ """ + _, H, W = image.shape + image, param = transforms.random_flip(image, x_random=True, return_param=True) + + if param['x_flip']: + keypoints = [ + transforms.flip_point(points, (H, W), x_flip=True)[flip_indices] + for points in keypoints + ] + + is_labeled = [label[flip_indices] for label in is_labeled] + is_visible = [vis[flip_indices] for vis in is_visible] + + new_bbox = [] + for x, y, w, h in bbox: + [[y, x]] = transforms.flip_point(np.array([[y, x + w]]), (H, W), x_flip=True) + new_bbox.append([x, y, w, h]) + bbox = new_bbox + + return image, keypoints, bbox, is_labeled, is_visible, param + + +def scale_fit_short(image, keypoints, bbox, length): + _, H, W = image.shape + min_hw = min(H, W) + scale = length / min_hw + new_image = transforms.scale(image, size=length, fit_short=True) + new_keypoints = [scale * k for k in keypoints] + new_bbox = [scale * np.asarray(b) for b in bbox] + return new_image, new_keypoints, new_bbox + + +def intersection(bbox0, bbox1): + x0, y0, w0, h0 = bbox0 + x1, y1, w1, h1 = bbox1 + + def relu(x): return max(0, x) + w = relu(min(x0 + w0, x1 + w1) - max(x0, x1)) + h = relu(min(y0 + h0, y1 + h1) - max(y0, y1)) + return w * h + + +def translate_bbox(bbox, size, y_offset, x_offset): + cropped_H, cropped_W = size + new_bbox = [] + for x, y, w, h in bbox: + x_shift = x + x_offset + y_shift = y + y_offset + is_intersect = intersection([0, 0, cropped_W, cropped_H], [x_shift, y_shift, w, h]) + if is_intersect: + xmin = max(0, x_shift) + ymin = max(0, y_shift) + xmax = min(cropped_W, x_shift + w) + ymax = min(cropped_H, y_shift + h) + wnew = xmax - xmin + hnew = ymax - ymin + new_bbox.append([xmin, ymin, wnew, hnew]) + else: + new_bbox.append([x_shift, y_shift, w, h]) + return new_bbox + + +def crop(img, y_slice, x_slice, copy=False): + ret = img.copy() if copy else img + return ret[:, y_slice, x_slice] + + +def crop_all_humans(image, keypoints, bbox, is_labeled): + _, H, W = image.shape + aspect = W / H + param = {} + if len(keypoints) == 0: + param['do_nothing'] = True + return image, keypoints, bbox, param + + kymax = max([np.max(ks[l, 0]) for l, ks in zip(is_labeled, keypoints)]) + kxmax = max([np.max(ks[l, 1]) for l, ks in zip(is_labeled, keypoints)]) + kymin = min([np.min(ks[l, 0]) for l, ks in zip(is_labeled, keypoints)]) + kxmin = min([np.min(ks[l, 1]) for l, ks in zip(is_labeled, keypoints)]) + + bxmax = max([b[0] + b[2] for b in bbox]) + bymax = max([b[1] + b[3] for b in bbox]) + bxmin = min([b[0] for b in bbox]) + bymin = min([b[1] for b in bbox]) + + ymax = max(kymax, bymax) + xmax = max(kxmax, bxmax) + ymin = min(kymin, bymin) + xmin = min(kxmin, bxmin) + + if (xmax + xmin) / 2 < W / 2: + x_start = random.randint(0, max(0, int(xmin))) + y_start = random.randint(0, max(0, int(ymin))) + y_end = random.randint(min(H, int(ymax)), H) + ylen = y_end - y_start + xlen = aspect * ylen + x_end = min(W, int(x_start + xlen)) + x_slice = slice(x_start, x_end, None) + y_slice = slice(y_start, y_end, None) + else: + x_end = random.randint(min(int(xmax), W), W) + y_end = random.randint(min(int(ymax), H), H) + y_start = random.randint(0, max(0, int(ymin))) + ylen = y_end - y_start + xlen = aspect * ylen + x_start = max(0, int(x_end - xlen)) + x_slice = slice(x_start, x_end, None) + y_slice = slice(y_start, y_end, None) + + cropped = crop(image, y_slice=y_slice, x_slice=x_slice, copy=True) + _, cropped_H, cropped_W = cropped.shape + param['x_slice'] = x_slice + param['y_slice'] = y_slice + if cropped_H <= 50 or cropped_W <= 50: + """ + This case, for example, 
cropped_H=0 will cause an error when try to resize image + or resize small image to insize will cause low resolution human image. + To avoid situations, we will stop crop image. + """ + param['do_nothing'] = True + return image, keypoints, bbox, param + image = cropped + + keypoints = [ + transforms.translate_point( + points, x_offset=-x_slice.start, y_offset=-y_slice.start) + for points in keypoints + ] + + bbox = translate_bbox( + bbox, + size=(cropped_H, cropped_W), + x_offset=-x_slice.start, + y_offset=-y_slice.start, + ) + + return image, keypoints, bbox, param + + +def random_sized_crop(image, keypoints, bbox): + image, param = transforms.random_sized_crop( + image, + scale_ratio_range=(0.5, 5), + aspect_ratio_range=(0.75, 1.3333333333333333), + return_param=True + ) + + keypoints = [ + transforms.translate_point(points, + x_offset=-param['x_slice'].start, + y_offset=-param['y_slice'].start + ) + for points in keypoints + ] + + _, cropped_H, cropped_W = image.shape + + bbox = translate_bbox( + bbox, + size=(cropped_H, cropped_W), + x_offset=-param['x_slice'].start, + y_offset=-param['y_slice'].start, + ) + + return image, keypoints, bbox, {random_sized_crop.__name__: param} + + +def resize(image, keypoints, bbox, size): + _, H, W = image.shape + new_h, new_w = size + image = transforms.resize(image, (new_h, new_w)) + + keypoints = [ + transforms.resize_point(points, (H, W), (new_h, new_w)) + for points in keypoints + ] + + new_bbox = [] + for x, y, bw, bh in bbox: + [[y, x]] = transforms.resize_point(np.array([[y, x]]), (H, W), (new_h, new_w)) + bw *= new_w / W + bh *= new_h / H + new_bbox.append([x, y, bw, bh]) + return image, keypoints, new_bbox + + +def random_resize(image, keypoints, bbox): + # Random resize + _, H, W = image.shape + scalew, scaleh = np.random.uniform(0.7, 1.5, 2) + resizeW, resizeH = int(W * scalew), int(H * scaleh) + image, keypoints, bbox = resize(image, keypoints, bbox, (resizeH, resizeW)) + return image, keypoints, bbox, {'H': resizeH, 'W': resizeW} + + +def random_crop(image, keypoints, bbox, is_labeled, dataset_type): + if dataset_type == 'mpii': + crop_target = np.random.choice( + ['random_sized_crop', 'crop_all_humans'], + p=[0.1, 0.9], + ) + if dataset_type == 'coco': + crop_target = np.random.choice( + ['random_sized_crop', 'crop_all_humans'], + p=[0.5, 0.5], + ) + + param = {'crop_target': crop_target} + if crop_target == 'random_sized_crop': + image, keypoints, bbox, p = random_resize(image, keypoints, bbox) + param.update(p) + image, keypoints, bbox, p = random_sized_crop(image, keypoints, bbox) + param.update(p) + elif crop_target == 'crop_all_humans': + image, keypoints, bbox, p = crop_all_humans(image, keypoints, bbox, is_labeled) + param.update(p) + + return image, keypoints, bbox, param diff --git a/begin_train.sh b/begin_train.sh new file mode 100644 index 0000000..a4826c0 --- /dev/null +++ b/begin_train.sh @@ -0,0 +1,10 @@ +cat config.ini +docker run --rm \ +-v $(pwd):/work \ +-v ~/work/dataset/mpii_dataset:/mpii_dataset \ +-v ~/work/dataset/coco_dataset:/coco_dataset \ +--runtime=nvidia \ +--name ppn_idein \ +-w /work \ +idein/chainer:5.1.0 \ +python3 train.py diff --git a/coco_dataset.py b/coco_dataset.py index 3f0463d..6dcd9a3 100644 --- a/coco_dataset.py +++ b/coco_dataset.py @@ -157,10 +157,10 @@ def get_coco_dataset(insize, image_root, annotations, is_labeled = d[:, 2] >= 1 entry = images[image_id] - entry[1].append(keypoints) - entry[2].append(bbox) - entry[3].append(is_visible) - entry[4].append(is_labeled) + 
entry[1].append(np.asarray(keypoints)) + entry[2].append(np.asarray(bbox)) + entry[3].append(np.asarray(is_visible).astype(np.bool)) + entry[4].append(np.asarray(is_labeled).astype(np.bool)) # filter-out non annotated images image_paths = [] diff --git a/config.ini b/config.ini index 26cf41e..88120fb 100644 --- a/config.ini +++ b/config.ini @@ -2,7 +2,7 @@ # batchsize / num of gpus equals the batchsize per gpu batchsize = 22 gpus = main=0 -num_process = 8 +num_process = 11 seed = 0 train_iter = 260000 learning_rate = 0.007 @@ -10,8 +10,8 @@ learning_rate = 0.007 ## mpii dataset [mpii] -images = /data/images -annotations = /data/mpii.json +images = /mpii_dataset/images +annotations = /mpii_dataset/mpii.json parts_scale = 0.5x0.5 instance_scale = 2.0x2.0 train_size = 0.9 @@ -22,10 +22,10 @@ use_cache = False ## coco dataset [coco] -train_images = /data/train2017 -train_annotations = /data/annotations/person_keypoints_train2017.json -val_images = /data/val2017 -val_annotations = /data/annotations/person_keypoints_val2017.json +train_images = /coco_dataset/train2017 +train_annotations = /coco_dataset/annotations/person_keypoints_train2017.json +val_images = /coco_dataset/val2017 +val_annotations = /coco_dataset/annotations/person_keypoints_val2017.json parts_scale = 0.2x0.2 instance_scale = 1.0x1.0 min_num_keypoints = 5 @@ -55,4 +55,12 @@ lambda_resp = 0.25 lambda_iou = 1.0 lambda_coor = 5.0 lambda_size = 5.0 -lambda_limb = 0.5 \ No newline at end of file +lambda_limb = 0.5 + +[predict] +# If `False` is set, hide bbox of annotation other than human instance. +visbbox = True +# detection_thresh +detection_thresh = 0.15 +# ignore human its num of keypoints is less than min_num_keypoints +min_num_keypoints= 1 diff --git a/dataset.py b/dataset.py index e0413f5..f37632f 100644 --- a/dataset.py +++ b/dataset.py @@ -1,11 +1,13 @@ import os -import numpy as np + from chainer.dataset import DatasetMixin from chainercv import utils import chainercv.transforms as transforms import numpy as np -from augment import rotate +from augment import random_rotate, random_flip, random_crop +from augment import scale_fit_short, resize +from augment import augment_image class KeypointDataset2D(DatasetMixin): @@ -43,93 +45,60 @@ def __init__(self, def __len__(self): return len(self.image_paths) - def transform(self, image, keypoints, bbox, is_labeled): - _, H, W = image.shape - # PCA Lighting - image = transforms.pca_lighting(image, sigma=5) + def transform(self, image, keypoints, bbox, is_labeled, is_visible, dataset_type): + transform_param = {} + + # Color augmentation + image, param = augment_image(image, dataset_type) + transform_param['augment_image'] = param # Random rotate - degree = np.random.uniform(-40, 40) - image, keypoints, bbox = rotate(image, keypoints, bbox, degree) + image, keypoints, bbox, param = random_rotate(image, keypoints, bbox) + transform_param['random_rotate'] = param + # Random flip - image, param = transforms.random_flip(image, x_random=True, return_param=True) - if param['x_flip']: - keypoints = [ - transforms.flip_point(points, (H, W), x_flip=True)[self.flip_indices] - for points in keypoints - ] - - is_labeled = [label[self.flip_indices] for label in is_labeled] - - new_bbox = [] - for x, y, w, h in bbox: - [[y, x]] = transforms.flip_point(np.array([[y, x + w]]), (H, W), x_flip=True) - new_bbox.append([x, y, w, h]) - bbox = new_bbox - - # Random resize - scalew, scaleh = np.random.uniform(1.0, 2.0, 2) - resizeW, resizeH = int(W * scalew), int(H * scalew) - image, keypoints, bbox = 
self.resize(image, keypoints, bbox, (resizeH, resizeW)) + image, keypoints, bbox, is_labeled, is_visible, param = random_flip(image, keypoints, bbox, is_labeled, is_visible, self.flip_indices) + transform_param['random_flip'] = param # Random crop - image, param = transforms.random_sized_crop(image, - scale_ratio_range=(0.5, 5), return_param=True) - keypoints = [ - transforms.translate_point(points, - x_offset=-param['x_slice'].start, - y_offset=-param['y_slice'].start - ) - for points in keypoints - ] - new_bbox = [] - for x, y, w, h in bbox: - new_bbox.append([x - param['x_slice'].start, y - param['y_slice'].start, w, h]) - bbox = new_bbox - - return image, keypoints, bbox, is_labeled - - def resize(self, image, keypoints, bbox, size): - _, h, w = image.shape - new_h, new_w = size - - image = transforms.resize(image, (new_h, new_w)) - keypoints = [ - transforms.resize_point(points, (h, w), (new_h, new_w)) - for points in keypoints - ] - new_bbox = [] - for x, y, bw, bh in bbox: - [[y, x]] = transforms.resize_point(np.array([[y, x]]), (h, w), (new_h, new_w)) - bw *= new_w / w - bh *= new_h / h - new_bbox.append([x, y, bw, bh]) - return image, keypoints, new_bbox + image, keypoints, bbox, param = random_crop(image, keypoints, bbox, is_labeled, dataset_type) + transform_param['random_crop'] = param + + return image, keypoints, bbox, is_labeled, is_visible, transform_param def get_example(self, i): w, h = self.insize if self.use_cache and self.cached_samples[i] is not None: - image, keypoints, bbox, is_labeled = self.cached_samples[i] + image, keypoints, bbox, is_labeled, is_visible = self.cached_samples[i] else: path = os.path.join(self.image_root, self.image_paths[i]) image = utils.read_image(path, dtype=np.float32, color=True) keypoints = self.keypoints[i] bbox = self.bbox[i] is_labeled = self.is_labeled[i] + is_visible = self.is_visible[i] - image, keypoints, bbox = self.resize(image, keypoints, bbox, (h, w)) if self.use_cache: - self.cached_samples[i] = image, keypoints, bbox, is_labeled + image, keypoints, bbox = resize(image, keypoints, bbox, (h, w)) + self.cached_samples[i] = image, keypoints, bbox, is_labeled, is_visible image = image.copy() keypoints = keypoints.copy() bbox = bbox.copy() is_labeled = is_labeled.copy() - - if self.do_augmentation: - image, keypoints, bbox, is_labeled = self.transform(image, keypoints, bbox, is_labeled) - image, keypoints, bbox = self.resize(image, keypoints, bbox, (h, w)) + is_visible = is_visible.copy() + + transform_param = {} + try: + if self.do_augmentation: + image, keypoints, bbox = scale_fit_short(image, keypoints, bbox, length=int(min(h, w) * 1.25)) + image, keypoints, bbox, is_labeled, is_visible, transform_param = self.transform( + image, keypoints, bbox, is_labeled, is_visible, self.dataset_type) + transform_param['do_augmentation'] = self.do_augmentation + image, keypoints, bbox = resize(image, keypoints, bbox, (h, w)) + except Exception as e: + raise Exception("something wrong...transform_param = {}".format(transform_param)) return { 'path': self.image_paths[i], @@ -139,5 +108,7 @@ def get_example(self, i): 'keypoints': keypoints, 'bbox': bbox, 'is_labeled': is_labeled, + 'is_visible': is_visible, 'dataset_type': self.dataset_type, + 'transform_param': transform_param } diff --git a/docker/gpu/Dockerfile b/docker/gpu/Dockerfile index d0a31e1..a4e7bb8 100644 --- a/docker/gpu/Dockerfile +++ b/docker/gpu/Dockerfile @@ -26,7 +26,7 @@ rm -rf /tmp/*.tar.gz && \ apt-get clean && rm -rf /tmp/* /var/tmp* /var/lib/apt/lists/* && \ rm -f 
/etc/ssh/ssh_host_* && rm -rf /usr/share/man?? /usr/share/man/??_* # install python dependencies -RUN pip3 install pillow matplotlib scipy -RUN pip3 install chainer==5.0.0 cupy-cuda90==5.0.0 chainercv==0.11.0 ideep4py +RUN pip3 install pillow matplotlib scipy tqdm +RUN pip3 install chainer==5.1.0 cupy-cuda90==5.1.0 chainercv==0.11.0 ideep4py # Use Agg backend for matplotlib ENV DISPLAY 0 \ No newline at end of file diff --git a/export_onnx.py b/export_onnx.py new file mode 100644 index 0000000..bd0c555 --- /dev/null +++ b/export_onnx.py @@ -0,0 +1,105 @@ +""" +Export pretrained model to ONNX format. +This is a rough sketch. +For more information see + +https://github.com/chainer/onnx-chainer + +""" +import argparse +import configparser +import logging +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +import os + +import chainer +import chainer.links as L +from chainer import initializers +import numpy as np +import onnx +import onnx_chainer + +from predict import load_config +from utils import parse_size + + +def get_network(model, **kwargs): + if model == 'mv2': + from network_mobilenetv2 import MobilenetV2 + return MobilenetV2(**kwargs) + elif model == 'resnet50': + from network_resnet import ResNet50 + return ResNet50(**kwargs) + elif model == 'resnet18': + from network_resnet import ResNet + return ResNet(n_layers=18) + elif model == 'resnet34': + from network_resnet import ResNet + return ResNet(n_layers=34) + else: + raise Exception('Invalid model name') + + +class MyModel(chainer.Chain): + + def __init__(self, config): + super(MyModel, self).__init__() + + dataset_type = config.get('dataset', 'type') + if dataset_type == 'mpii': + import mpii_dataset as x_dataset + elif dataset_type == 'coco': + import coco_dataset as x_dataset + else: + raise Exception('Unknown dataset {}'.format(dataset_type)) + + with self.init_scope(): + dtype = np.float32 + self.feature_layer = get_network(config.get('model_param', 'model_name'), dtype=dtype, width_multiplier=1.0) + ksize = self.feature_layer.last_ksize + self.local_grid_size = parse_size(config.get('model_param', 'local_grid_size')) + self.keypoint_names = x_dataset.KEYPOINT_NAMES + self.edges = x_dataset.EDGES + self.lastconv = L.Convolution2D(None, + 6 * len(self.keypoint_names) + + self.local_grid_size[0] * self.local_grid_size[1] * len(self.edges), + ksize=ksize, stride=1, pad=ksize // 2, + initialW=initializers.HeNormal(1 / np.sqrt(2), dtype)) + + def __call__(self, x): + h = self.feature_layer(x) + h = self.feature_layer.last_activation(self.lastconv(h)) + return h + + +def export_onnx(args): + config = load_config(args) + model = MyModel(config) + chainer.serializers.load_npz(os.path.join(args.model, 'bestmodel.npz'), model) + w, h = parse_size(config.get('model_param', 'insize')) + x = np.zeros((1, 3, h, w), dtype=np.float32) + logger.info('begin export') + output = os.path.join(args.model, 'bestmodel.onnx') + with chainer.using_config('train', False): + onnx_chainer.export(model, x, filename=output) + logger.info('end export') + logger.info('run onnx.check') + onnx_model = onnx.load(output) + onnx.checker.check_model(onnx_model) + logger.info('done') + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('model', help='path/to/model', type=str) + return parser.parse_args() + + +def main(): + args = parse_arguments() + export_onnx(args) + +if __name__ == '__main__': + main() diff --git a/high_speed.py b/high_speed.py index b0e9227..7aea6e1 100644 --- a/high_speed.py +++ 
b/high_speed.py @@ -1,3 +1,4 @@ +import argparse import configparser import os import queue @@ -12,7 +13,7 @@ import numpy as np from PIL import Image -from predict import get_feature, get_humans_by_feature, draw_humans, create_model +from predict import get_feature, get_humans_by_feature, draw_humans, create_model, load_config from utils import parse_size QUEUE_SIZE = 5 @@ -83,11 +84,12 @@ def stop(self): self.stop_event.set() -def main(): - config = configparser.ConfigParser() - config.read('config.ini', 'UTF-8') - - model = create_model(config) +def high_speed(args): + config = load_config(args) + dataset_type = config.get('dataset', 'type') + detection_thresh = config.getfloat('predict', 'detection_thresh') + min_num_keypoints = config.getint('predict', 'min_num_keypoints') + model = create_model(args, config) if os.path.exists('mask.png'): mask = Image.open('mask.png') @@ -123,7 +125,12 @@ def main(): degree = degree % 360 try: image, feature_map = predictor.get() - humans = get_humans_by_feature(model, feature_map) + humans = get_humans_by_feature( + model, + feature_map, + detection_thresh, + min_num_keypoints + ) except queue.Empty: continue except Exception: @@ -134,7 +141,8 @@ def main(): model.edges, pilImg, humans, - mask=mask.rotate(degree) if mask else None + mask=mask.rotate(degree) if mask else None, + visbbox=config.getboolean('predict', 'visbbox'), ) img_with_humans = cv2.cvtColor(np.asarray(pilImg), cv2.COLOR_RGB2BGR) msg = 'GPU ON' if chainer.backends.cuda.available else 'GPU OFF' @@ -158,5 +166,16 @@ def main(): capture.join() predictor.join() + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('model', help='path/to/model', type=str) + return parser.parse_args() + + +def main(): + args = parse_arguments() + high_speed(args) + if __name__ == '__main__': main() diff --git a/model.py b/model.py index 20fd3de..ec20bcf 100644 --- a/model.py +++ b/model.py @@ -226,12 +226,13 @@ def static_forward(self, x): def forward(self, x): """ - this provide interface of forwarding - chainer 5.0.0 gives us static_graph to increase of speed of training - but for some reason this does train i.e. loss does not decrease at all. - We do not trust it for now on training. On the other hand. the speed of - inference increases very well.Also note that if we use ideep option, - the output result between `static_forward` and `_forward` will be different. + This provides an interface of forwarding. + ChainerV5 has a feature Static Subgraph Optimizations to increase training speed. + But for some reason, our model does not decrease loss value at all. + We do not trust it for now on training. On the other hand, by decorating `static_graph` + at forward function, it increases speed of inference very well. + Also note that if we use ideep option, the output result between + `static_forward` and `_forward` will be different. 
""" if chainer.config.train: return self._forward(x) diff --git a/mpii_dataset.py b/mpii_dataset.py index b15776f..2f1cfea 100644 --- a/mpii_dataset.py +++ b/mpii_dataset.py @@ -122,7 +122,8 @@ def get_mpii_dataset(insize, image_root, annotations, entry[0].append(np.array(keypoints)) # array of y,x entry[1].append(np.array([x1, y1, x2 - x1, y2 - y1])) # x, y, w, h entry[2].append(np.array(is_visible, dtype=np.bool)) - entry[3].append(np.ones(len(is_visible), dtype=np.bool)) + is_labeled = np.ones(len(is_visible), dtype=np.bool) + entry[3].append(is_labeled) # split dataset train_images, test_images = split_dataset_random( diff --git a/predict.py b/predict.py index e78ab0e..bab5c4d 100644 --- a/predict.py +++ b/predict.py @@ -11,9 +11,6 @@ import random import time -import matplotlib -matplotlib.use('Agg') -from matplotlib import pyplot as plt import numpy as np import chainer @@ -22,15 +19,12 @@ else: xp = np -import chainercv.transforms as transforms from chainercv.utils import non_maximum_suppression -from chainercv.visualizations import vis_bbox from PIL import ImageDraw, Image from coco_dataset import get_coco_dataset from mpii_dataset import get_mpii_dataset from model import PoseProposalNet -from train import create_model from network_resnet import ResNet50 from utils import parse_size @@ -63,12 +57,12 @@ def get_feature(model, image): return resp, conf, x, y, w, h, e -def estimate(model, image): +def estimate(model, image, detection_thresh=0.15, min_num_keypoints=-1): feature_map = get_feature(model, image) - return get_humans_by_feature(model, feature_map) + return get_humans_by_feature(model, feature_map, detection_thresh, min_num_keypoints) -def get_humans_by_feature(model, feature_map, detection_thresh=0.15): +def get_humans_by_feature(model, feature_map, detection_thresh=0.15, min_num_keypoints=-1): resp, conf, x, y, w, h, e = feature_map start = time.time() delta = resp * conf @@ -113,14 +107,14 @@ def get_humans_by_feature(model, feature_map, detection_thresh=0.15): break human[t] = bbox[(t, j_h, j_w)] i_h, i_w = j_h, j_w - - humans.append(human) + if min_num_keypoints <= len(human) - 1: + humans.append(human) logger.info('alchemy time {:.5f}'.format(time.time() - start)) logger.info('num humans = {}'.format(len(humans))) return humans -def draw_humans(keypoint_names, edges, pil_image, humans, mask=None): +def draw_humans(keypoint_names, edges, pil_image, humans, mask=None, visbbox=True): """ This is what happens when you use alchemy on humans... 
note that image should be PIL object @@ -134,7 +128,7 @@ def draw_humans(keypoint_names, edges, pil_image, humans, mask=None): else: fill = None ymin, xmin, ymax, xmax = b - if k == 0: + if k == 0: # human instance # adjust size t = 1 xmin = int(xmin * t + xmax * (1 - t)) @@ -149,9 +143,17 @@ def draw_humans(keypoint_names, edges, pil_image, humans, mask=None): fill=fill, outline=COLOR_MAP[keypoint_names[k]]) else: - drawer.rectangle(xy=[xmin, ymin, xmax, ymax], - fill=fill, - outline=COLOR_MAP[keypoint_names[k]]) + if visbbox: + drawer.rectangle(xy=[xmin, ymin, xmax, ymax], + fill=fill, + outline=COLOR_MAP[keypoint_names[k]]) + else: + r = 2 + x = (xmin + xmax) / 2 + y = (ymin + ymax) / 2 + drawer.ellipse((x - r, y - r, x + r, y + r), + fill=COLOR_MAP[keypoint_names[k]]) + for s, t in edges: if s in human and t in human: by = (human[s][0] + human[s][2]) / 2 @@ -166,7 +168,7 @@ def draw_humans(keypoint_names, edges, pil_image, humans, mask=None): return pil_image -def create_model(config): +def create_model(args, config): global DIRECTED_GRAPHS, COLOR_MAP dataset_type = config.get('dataset', 'type') @@ -194,7 +196,7 @@ def create_model(config): width_multiplier=config.getfloat('model_param', 'width_multiplier'), ) - result_dir = config.get('result', 'dir') + result_dir = args.model chainer.serializers.load_npz( os.path.join(result_dir, 'bestmodel.npz'), model @@ -211,9 +213,18 @@ def create_model(config): return model -def main(): +def load_config(args): config = configparser.ConfigParser() - config.read('config.ini', 'UTF-8') + config_path = os.path.join(args.model, 'src', 'config.ini') + logger.info(config_path) + config.read(config_path, 'UTF-8') + return config + + +def predict(args): + config = load_config(args) + detection_thresh = config.getfloat('predict', 'detection_thresh') + min_num_keypoints = config.getint('predict', 'min_num_keypoints') dataset_type = config.get('dataset', 'type') logger.info('loading {}'.format(dataset_type)) if dataset_type == 'mpii': @@ -235,22 +246,37 @@ def main(): else: raise Exception('Unknown dataset {}'.format(dataset_type)) - model = create_model(config) + model = create_model(args, config) idx = random.choice(range(len(test_set))) image = test_set.get_example(idx)['image'] - humans = estimate(model, - image.astype(np.float32)) + humans = estimate( + model, + image.astype(np.float32), + detection_thresh, + min_num_keypoints, + ) pil_image = Image.fromarray(image.transpose(1, 2, 0).astype(np.uint8)) pil_image = draw_humans( keypoint_names=model.keypoint_names, edges=model.edges, pil_image=pil_image, - humans=humans + humans=humans, + visbbox=config.getboolean('predict', 'visbbox') ) pil_image.save('result.png', 'PNG') +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('model', help='path/to/model', type=str) + return parser.parse_args() + + +def main(): + args = parse_arguments() + predict(args) + if __name__ == '__main__': main() diff --git a/readmedata/cpu-example.gif b/readmedata/cpu-example.gif new file mode 100644 index 0000000..5d725be Binary files /dev/null and b/readmedata/cpu-example.gif differ diff --git a/run_high_speed.sh b/run_high_speed.sh new file mode 100644 index 0000000..0bab818 --- /dev/null +++ b/run_high_speed.sh @@ -0,0 +1,19 @@ +CMDNAME=`basename $0` +BASEMODELDIR=$(pwd) + +if [ $# -ne 1 ]; then + echo "Usage: $CMDNAME path/to/model" 1>&2 + exit 1 +fi + +xhost +local:docker +docker run --rm \ +-e DISPLAY=$DISPLAY \ +-v /tmp/.X11-unix/:/tmp/.X11-unix \ +-v $(pwd):/work \ +-v $BASEMODELDIR:/models 
\ +--device=/dev/video0:/dev/video0 \ +--runtime=nvidia \ +-w /work \ +ppn:latest python3 high_speed.py /models/$1 +xhost -local:docker diff --git a/run_predict.sh b/run_predict.sh new file mode 100644 index 0000000..b0b1cf0 --- /dev/null +++ b/run_predict.sh @@ -0,0 +1,15 @@ +CMDNAME=`basename $0` +BASEMODELDIR=$(pwd) + +if [ $# -ne 1 ]; then + echo "Usage: $CMDNAME path/to/model" 1>&2 + exit 1 +fi + +docker run --rm \ +-v $(pwd):/work \ +-v $BASEMODELDIR:/models \ +-v ~/work/dataset/mpii_dataset:/mpii_dataset \ +-v ~/work/dataset/coco_dataset:/coco_dataset \ +-w /work \ +idein/chainer:5.1.0 python3 predict.py /models/$1 \ No newline at end of file diff --git a/run_video.sh b/run_video.sh index 6ac5132..a711b28 100644 --- a/run_video.sh +++ b/run_video.sh @@ -1,10 +1,19 @@ +CMDNAME=`basename $0` +BASEMODELDIR=$(pwd) + +if [ $# -ne 1 ]; then + echo "Usage: $CMDNAME path/to/model" 1>&2 + exit 1 +fi + xhost +local:docker docker run --rm \ -e DISPLAY=$DISPLAY \ -v /tmp/.X11-unix/:/tmp/.X11-unix \ --v $PWD:/work \ --w /work \ +-v $(pwd):/work \ +-v $BASEMODELDIR:/models \ --device=/dev/video0:/dev/video0 \ --runtime=nvidia \ -ppn:latest python3 video.py -xhost -local:docker \ No newline at end of file +-w /work \ +ppn:latest python3 video.py /models/$1 +xhost -local:docker diff --git a/train.py b/train.py index c1a9cfc..5a52383 100644 --- a/train.py +++ b/train.py @@ -2,8 +2,9 @@ import random import configparser -from logging import getLogger -logger = getLogger('__main__') +import logging +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) import matplotlib matplotlib.use('Agg') @@ -15,12 +16,11 @@ from chainer.training import extensions import numpy as np -import visualize from model import PoseProposalNet - from coco_dataset import get_coco_dataset from mpii_dataset import get_mpii_dataset from utils import parse_size, parse_kwargs, save_files +import visualize def setup_devices(ids): @@ -73,7 +73,9 @@ def main(): config.read(args.config_path, 'UTF-8') chainer.global_config.autotune = True - chainer.cuda.set_max_workspace_size(11388608) + # chainer.cuda.set_max_workspace_size(11388608) + chainer.cuda.set_max_workspace_size(512 * 1024 * 1024) + chainer.config.cudnn_fast_batch_normalization = True # create result dir and copy file logger.info('> store file to result dir %s', config.get('result', 'dir')) @@ -139,9 +141,10 @@ def main(): train_set, config.getint('training_param', 'batchsize'), n_processes=config.getint('training_param', 'num_process') ) - test_iter = chainer.iterators.SerialIterator( + test_iter = chainer.iterators.MultiprocessIterator( test_set, config.getint('training_param', 'batchsize'), - repeat=False, shuffle=False + repeat=False, shuffle=False, + n_processes=config.getint('training_param', 'num_process') ) logger.info('> setup optimizer') @@ -194,6 +197,7 @@ def main(): logger.info('> start training') trainer.run() + if __name__ == '__main__': import logging logger.addHandler(logging.StreamHandler()) diff --git a/video.py b/video.py index 2a680fe..c10f700 100644 --- a/video.py +++ b/video.py @@ -1,3 +1,4 @@ +import argparse import configparser import logging logger = logging.getLogger(__name__) @@ -13,15 +14,13 @@ from PIL import ImageDraw, Image from predict import COLOR_MAP -from predict import estimate, draw_humans, create_model +from predict import estimate, draw_humans, create_model, load_config from utils import parse_size -def main(): - config = configparser.ConfigParser() - config.read('config.ini', 'UTF-8') - - model = 
create_model(config) +def video(args): + config = load_config(args) + model = create_model(args, config) cap = cv2.VideoCapture(0) if cap.isOpened() is False: @@ -37,6 +36,8 @@ def main(): fps_time = 0 degree = 0 + detection_thresh = config.getfloat('predict', 'detection_thresh') + min_num_keypoints = config.getint('predict', 'min_num_keypoints') while cap.isOpened(): degree += 5 degree = degree % 360 @@ -45,14 +46,17 @@ def main(): image = cv2.resize(image, model.insize) with chainer.using_config('autotune', True): humans = estimate(model, - image.transpose(2, 0, 1).astype(np.float32)) + image.transpose(2, 0, 1).astype(np.float32), + detection_thresh, + min_num_keypoints) pilImg = Image.fromarray(image) pilImg = draw_humans( model.keypoint_names, model.edges, pilImg, humans, - mask=mask.rotate(degree) if mask else None + mask=mask.rotate(degree) if mask else None, + visbbox=config.getboolean('predict', 'visbbox'), ) img_with_humans = cv2.cvtColor(np.asarray(pilImg), cv2.COLOR_RGB2BGR) msg = 'GPU ON' if chainer.backends.cuda.available else 'GPU OFF' @@ -66,5 +70,16 @@ def main(): if cv2.waitKey(1) == 27: break + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('model', help='path/to/model', type=str) + return parser.parse_args() + + +def main(): + args = parse_arguments() + video(args) + if __name__ == '__main__': main()
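As a closing usage note for the refactored prediction API (all scripts now take the model directory as a positional argument), here is a rough single-image sketch assembled from the `predict.py` code above. `./trained` and `input.jpg` are placeholder names, and the resize step assumes `model.insize` is `(width, height)` as in `video.py`.

```
import types

import numpy as np
from PIL import Image
from chainercv import utils
import chainercv.transforms as transforms

from predict import create_model, draw_humans, estimate, load_config

# Emulate the new command-line interface: the scripts receive path/to/model.
args = types.SimpleNamespace(model='./trained')

config = load_config(args)          # reads ./trained/src/config.ini
model = create_model(args, config)  # loads ./trained/bestmodel.npz

image = utils.read_image('input.jpg', dtype=np.float32, color=True)  # CHW, RGB
w, h = model.insize
image = transforms.resize(image, (h, w))  # the network expects its fixed input size

humans = estimate(
    model,
    image,
    config.getfloat('predict', 'detection_thresh'),
    config.getint('predict', 'min_num_keypoints'),
)

pil_image = Image.fromarray(image.transpose(1, 2, 0).astype(np.uint8))
pil_image = draw_humans(
    keypoint_names=model.keypoint_names,
    edges=model.edges,
    pil_image=pil_image,
    humans=humans,
    visbbox=config.getboolean('predict', 'visbbox'),
)
pil_image.save('result.png', 'PNG')
```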