Posenet.py 보고서 #53

KimMin-Gwan · 2023-03-26T11:33:11Z

KimMin-Gwan
Mar 26, 2023
Maintainer

Posenet 개요

Posenet은 Google Creative Lab에서 개발한 실시간 인간 자세 추정 기능이다. Tensorflow.js에서 동작하며 브라우저에서 동작하도록 설계되어있다. 우리는 이 Posenet을 로컬에서 사용가능하게 Python으로 구현 한 것을 사용했다.
python으로 Posenet을 구현하는 것은 Tensorflow.js에서 사용하는 것과는 차이가 있다. 특히 사람의 위치를 추정하는 방식을 스스로 지원하는 Posenet.js와는 다르게 Posenet.py는 사물 인식 모듈인 Mobile_net_v1을 사용한다.
사람의 위치를 특정한 후에는 Tensorflow.js에서 사용된 학습 모듈을 불러와 부위별 좌표와 예측값을 제공한다.

converter 모듈과 Posenet 파일들로 구현되어 있다.

converter 모듈

converter 모듈의 구성은 다음과 같다.
- config.py
- config.yaml
- tfjs2python.py
- wget.py

config.py

import yaml
import os

# Directory that contains this module; load_config resolves config file names against it.
BASE_DIR = os.path.dirname(__file__)

def load_config(config_name='config.yaml'):
    """Load a YAML configuration file located next to this module.

    Args:
        config_name: File name of the configuration, resolved relative to
            ``BASE_DIR``.

    Returns:
        The parsed YAML document as produced by ``yaml.load``.
    """
    # Read-only access is sufficient, and the context manager guarantees the
    # handle is closed (it used to be opened with "r+" and never closed).
    with open(os.path.join(BASE_DIR, config_name), "r") as cfg_f:
        # NOTE(review): FullLoader is not intended for untrusted input; this is
        # acceptable only as long as the config file ships with the package.
        return yaml.load(cfg_f, Loader=yaml.FullLoader)

Posenet.js에서 사용된 딥러닝 레이어가 저장된 config.yaml을 읽어오는 부분이다.

tfjs2python.py

import json
import struct
#import tensorflow as tf

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from tensorflow.python.tools.freeze_graph import freeze_graph
import cv2
import numpy as np
import tempfile

from posenet.converter.config import load_config

# Default location of the downloaded tfjs weights: a "_posenet_weights" folder in the system temp dir.
BASE_DIR = os.path.join(tempfile.gettempdir(), '_posenet_weights')


def to_output_strided_layers(convolution_def, output_stride):
    """Translate an architecture definition into per-layer stride/dilation settings.

    Layers keep their own stride until the accumulated stride equals
    ``output_stride``; from then on every layer uses stride 1 and the strides
    it would have applied are folded into the dilation rate instead.

    Args:
        convolution_def: Sequence of ``(conv_type, stride)`` entries.
        output_stride: Accumulated stride at which striding stops.

    Returns:
        A list with one dict per layer holding the keys ``blockId``,
        ``convType``, ``stride``, ``rate`` and ``outputStride``.
    """
    layers = []
    accumulated_stride = 1
    dilation = 1

    for index, layer_def in enumerate(convolution_def):
        conv_type, stride = layer_def[0], layer_def[1]

        if accumulated_stride != output_stride:
            # Still allowed to shrink the feature map: apply the real stride.
            layer_stride, layer_rate = stride, 1
            accumulated_stride *= stride
        else:
            # Target stride reached: switch to a dilated convolution.
            layer_stride, layer_rate = 1, dilation
            dilation *= stride

        layers.append({
            'blockId': index,
            'convType': conv_type,
            'stride': layer_stride,
            'rate': layer_rate,
            'outputStride': accumulated_stride,
        })

    return layers


def load_variables(chkpoint, base_dir=BASE_DIR):
    """Load the tfjs weights of a checkpoint as TensorFlow variables.

    If the checkpoint's ``manifest.json`` is missing, the weights are first
    downloaded into ``base_dir``.

    Args:
        chkpoint: Checkpoint name (sub-directory of ``base_dir``).
        base_dir: Directory that holds the downloaded checkpoints.

    Returns:
        The manifest dict; every entry additionally gets an ``"x"`` key holding
        the ``tf.Variable`` built from that entry's weight file.
    """
    manifest_path = os.path.join(base_dir, chkpoint, "manifest.json")
    if not os.path.exists(manifest_path):
        print('Weights for checkpoint %s are not downloaded. Downloading to %s ...' % (chkpoint, base_dir))
        from posenet.converter.wget import download
        download(chkpoint, base_dir)
        assert os.path.exists(manifest_path)

    with open(manifest_path) as f:
        variables = json.load(f)

    float_size = struct.calcsize('f')
    for x in variables:
        filename = variables[x]["filename"]
        # Close every weight file right after reading instead of leaking the handle.
        with open(os.path.join(base_dir, chkpoint, filename), 'rb') as weight_file:
            byte = weight_file.read()
        # Integer division: the element count must be an exact int.
        fmt = str(len(byte) // float_size) + 'f'
        d = struct.unpack(fmt, byte)
        d = tf.cast(d, tf.float32)
        d = tf.reshape(d, variables[x]["shape"])
        variables[x]["x"] = tf.Variable(d, name=x)

    return variables


def _read_imgfile(path, width, height):
    """Read an image file and prepare it as network input.

    The image is resized to ``(width, height)``, converted from BGR to RGB and
    rescaled from the 0..255 range to -1..1.

    Args:
        path: Path of the image file.
        width: Target width in pixels.
        height: Target height in pixels.

    Returns:
        The preprocessed image as a float array.

    Raises:
        OSError: If OpenCV cannot read the file.
    """
    img = cv2.imread(path)
    if img is None:
        # cv2.imread signals failure by returning None; fail with a clear
        # message instead of an opaque error inside cv2.resize.
        raise OSError('Could not read image file: %s' % path)
    img = cv2.resize(img, (width, height))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.astype(float)
    img = img * (2.0 / 255.0) - 1.0
    return img


def build_network(image, layers, variables):
    """Build the MobileNetV1 backbone and the four PoseNet output heads.

    Args:
        image: Input tensor fed to the first layer.
            NOTE(review): the caller passes a placeholder of shape
            [1, None, None, 3]; other layouts are not handled here.
        layers: Layer descriptions as produced by ``to_output_strided_layers``
            (keys ``blockId``, ``convType``, ``stride``, ``rate`` are read).
        variables: Mapping from ``"MobilenetV1/<layer>/<kind>"`` to a dict whose
            ``'x'`` entry is the weight tensor (see ``load_variables``).

    Returns:
        Tuple ``(heatmaps, offsets, displacement_fwd, displacement_bwd)``;
        ``heatmaps`` has been passed through a sigmoid.
    """

    def _weights(layer_name):
        # Convolution kernel of the given layer.
        return variables["MobilenetV1/" + layer_name + "/weights"]['x']

    def _biases(layer_name):
        # Bias vector of the given layer.
        return variables["MobilenetV1/" + layer_name + "/biases"]['x']

    def _depthwise_weights(layer_name):
        # Depthwise kernel of the given separable-convolution layer.
        return variables["MobilenetV1/" + layer_name + "/depthwise_weights"]['x']

    def _conv_to_output(mobile_net_output, output_layer_name):
        # 1x1-stride convolution + bias for an output head. The bias_add op is
        # given the layer name so the node can be looked up by that name later.
        w = tf.nn.conv2d(mobile_net_output, _weights(output_layer_name), [1, 1, 1, 1], padding='SAME')
        w = tf.nn.bias_add(w, _biases(output_layer_name), name=output_layer_name)
        return w

    def _conv(inputs, stride, block_id):
        # Regular convolution + bias followed by ReLU6.
        return tf.nn.relu6(
            tf.nn.conv2d(inputs, _weights("Conv2d_" + str(block_id)), stride, padding='SAME')
            + _biases("Conv2d_" + str(block_id)))

    def _separable_conv(inputs, stride, block_id, dilations):
        # Depthwise convolution followed by a pointwise convolution, each with
        # bias and ReLU6. A missing dilation falls back to no dilation.
        if dilations is None:
            dilations = [1, 1]

        dw_layer = "Conv2d_" + str(block_id) + "_depthwise"
        pw_layer = "Conv2d_" + str(block_id) + "_pointwise"

        w = tf.nn.depthwise_conv2d(
            inputs, _depthwise_weights(dw_layer), stride, 'SAME', rate=dilations, data_format='NHWC')
        w = tf.nn.bias_add(w, _biases(dw_layer))
        w = tf.nn.relu6(w)

        w = tf.nn.conv2d(w, _weights(pw_layer), [1, 1, 1, 1], padding='SAME')
        w = tf.nn.bias_add(w, _biases(pw_layer))
        w = tf.nn.relu6(w)

        return w

    x = image
    # NOTE(review): buff collects every intermediate activation but is never read.
    buff = []
    with tf.variable_scope(None, 'MobilenetV1'):

        # Chain the backbone layers; layers with an unknown convType are skipped.
        for m in layers:
            stride = [1, m['stride'], m['stride'], 1]
            rate = [m['rate'], m['rate']]
            if m['convType'] == "conv2d":
                x = _conv(x, stride, m['blockId'])
                buff.append(x)
            elif m['convType'] == "separableConv":
                x = _separable_conv(x, stride, m['blockId'], rate)
                buff.append(x)

    # All four heads are computed from the final backbone activation.
    heatmaps = _conv_to_output(x, 'heatmap_2')
    offsets = _conv_to_output(x, 'offset_2')
    displacement_fwd = _conv_to_output(x, 'displacement_fwd_2')
    displacement_bwd = _conv_to_output(x, 'displacement_bwd_2')
    # The sigmoid op is named 'heatmap'; that is the heatmap node exported by convert().
    heatmaps = tf.sigmoid(heatmaps, 'heatmap')

    return heatmaps, offsets, displacement_fwd, displacement_bwd


def convert(model_id, model_dir, check=False):
    """Convert tfjs PoseNet weights into a frozen TensorFlow graph.

    Writes a checkpoint, a text graph (``model-<checkpoint>.pbtxt``) and the
    frozen graph (``model-<checkpoint>.pb``) below ``model_dir``.

    Args:
        model_id: Index into the ``checkpoints`` list of the converter config.
        model_dir: Output directory; created if it does not exist.
        check: If true and ``./images/tennis_in_crowd.jpg`` exists, run the
            network on that image and print some statistics.
    """
    cfg = load_config()
    checkpoints = cfg['checkpoints']
    image_size = cfg['imageSize']
    output_stride = cfg['outputStride']
    chkpoint = checkpoints[model_id]

    # Pick the architecture matching the checkpoint; anything that is not the
    # 050 or 075 variant uses the "100" architecture.
    if chkpoint == 'mobilenet_v1_050':
        mobile_net_arch = cfg['mobileNet50Architecture']
    elif chkpoint == 'mobilenet_v1_075':
        mobile_net_arch = cfg['mobileNet75Architecture']
    else:
        mobile_net_arch = cfg['mobileNet100Architecture']

    width = image_size
    height = image_size

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Build everything in a dedicated graph so it can be written out as a unit.
    cg = tf.Graph()
    with cg.as_default():
        layers = to_output_strided_layers(mobile_net_arch, output_stride)
        variables = load_variables(chkpoint)

        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            saver = tf.train.Saver()

            image_ph = tf.placeholder(tf.float32, shape=[1, None, None, 3], name='image')
            outputs = build_network(image_ph, layers, variables)

            # Run the network once. np.ndarray(shape=...) allocates without
            # initialising, so the input values are arbitrary; the results of
            # this run are discarded.
            sess.run(
                [outputs],
                feed_dict={
                    image_ph: [np.ndarray(shape=(height, width, 3), dtype=np.float32)]
                }
            )

            save_path = os.path.join(model_dir, 'checkpoints', 'model-%s.ckpt' % chkpoint)
            if not os.path.exists(os.path.dirname(save_path)):
                os.makedirs(os.path.dirname(save_path))
            checkpoint_path = saver.save(sess, save_path, write_state=False)

            tf.train.write_graph(cg, model_dir, "model-%s.pbtxt" % chkpoint)

            # Freeze graph and write our final model file
            # The output node names must match the op names assigned in build_network.
            freeze_graph(
                input_graph=os.path.join(model_dir, "model-%s.pbtxt" % chkpoint),
                input_saver="",
                input_binary=False,
                input_checkpoint=checkpoint_path,
                output_node_names='heatmap,offset_2,displacement_fwd_2,displacement_bwd_2',
                restore_op_name="save/restore_all",
                filename_tensor_name="save/Const:0",
                output_graph=os.path.join(model_dir, "model-%s.pb" % chkpoint),
                clear_devices=True,
                initializer_nodes="")

            # Optional sanity check on a sample image (silently skipped when the file is absent).
            if check and os.path.exists("./images/tennis_in_crowd.jpg"):
                # Result
                input_image = _read_imgfile("./images/tennis_in_crowd.jpg", width, height)
                input_image = np.array(input_image, dtype=np.float32)
                input_image = input_image.reshape(1, height, width, 3)

                heatmaps_result, offsets_result, displacement_fwd_result, displacement_bwd_result = sess.run(
                    outputs,
                    feed_dict={image_ph: input_image}
                )

                print("Test image stats")
                print(input_image)
                print(input_image.shape)
                print(np.mean(input_image))

                # Drop the batch dimension before printing.
                heatmaps_result = heatmaps_result[0]

                print("Heatmaps")
                print(heatmaps_result[0:1, 0:1, :])
                print(heatmaps_result.shape)
                print(np.mean(heatmaps_result))

과거에 posenet을 python으로 옮기는 과정을 진행했던 rwightman님이 작업한 내용들입니다.
전체적인 구성은 tensorflow_1의 레이어 구성방식을 따르고 있으며, config.yaml에서 받아온 레이어를 그대로 활용하여 사용합니다.
posenet.js 에서 사용된 weights 값을 그대로 활용하여 동작하고 있으며, mobilenet을 통해 구체적인 2d레이어를 구상합니다.

wget

import urllib.request
import posixpath
import json
import zlib
import os

from posenet.converter.config import load_config

# The converter config is read at import time, so importing this module reads config.yaml.
CFG = load_config()
# Base URL the checkpoint files are downloaded from.
GOOGLE_CLOUD_STORAGE_DIR = CFG['GOOGLE_CLOUD_STORAGE_DIR']
# Checkpoint names; CHK is the index main() uses to pick one of them.
CHECKPOINTS = CFG['checkpoints']
CHK = CFG['chk']


def download_file(checkpoint, filename, base_dir):
    """Download one checkpoint file into ``base_dir/checkpoint/filename``.

    Args:
        checkpoint: Checkpoint name (remote and local sub-directory).
        filename: Name of the file inside the checkpoint directory.
        base_dir: Local root directory; ``base_dir/checkpoint`` must exist.
    """
    output_path = os.path.join(base_dir, checkpoint, filename)
    url = posixpath.join(GOOGLE_CLOUD_STORAGE_DIR, checkpoint, filename)
    req = urllib.request.Request(url)
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(req) as response:
        is_gzip = response.info().get('Content-Encoding') == 'gzip'
        payload = response.read()
    if is_gzip:
        # MAX_WBITS | 32 lets zlib auto-detect the gzip/zlib header.
        data = zlib.decompress(payload, zlib.MAX_WBITS | 32)
    else:
        # this path not tested since gzip encoding default on google server
        # may need additional encoding/text handling if hit in the future
        data = payload
    with open(output_path, 'wb') as f:
        f.write(data)

def download(checkpoint, base_dir='./weights/'):
    """Download the manifest of a checkpoint and every weight file it lists.

    Args:
        checkpoint: Checkpoint name.
        base_dir: Local root directory; files end up in ``base_dir/checkpoint``.
    """
    save_dir = os.path.join(base_dir, checkpoint)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_dir, exist_ok=True)

    download_file(checkpoint, 'manifest.json', base_dir)
    with open(os.path.join(save_dir, 'manifest.json'), 'r') as f:
        json_dict = json.load(f)

    # Every manifest entry names the file that stores that variable.
    for entry in json_dict.values():
        filename = entry['filename']
        print('Downloading', filename)
        download_file(checkpoint, filename, base_dir)

def main():
    """Download the weights of the checkpoint selected by the ``chk`` config entry."""
    download(CHECKPOINTS[CHK])


if __name__ == "__main__":
    main()

tfjs2python.py을 이용하여 posenet의 기존 학습 값을 읽어오는 부분들 입니다.
_models라는 학습 모델을 하나 생성해주며 이 코드는 posenet을 처음 실행할 때 한번만 실행됩니다.

Posenet 패키지

Posenet 패키지는 다음과 같은 구성으로 이루어져 있다.
- argument.py
- constants.py
- decode.py
- decode_multi.py
- detection.py
- gesture_mode.py
- model.py
- parts.py
- utils.py

argument.py

# Fixed runtime settings (they replace the command-line arguments of the upstream code).
MODEL = 101  # model id; model_id_to_ord maps 101 to checkpoint ordinal 3
CAM_ID = 0  # presumably the camera device index - TODO confirm against the capture code
CAM_WIDTH = 1280
# NOTE(review): the name is misspelled ("HEIGTH"); left unchanged because other modules may reference it.
CAM_HEIGTH= 720
SCALE_FACTOR = 0.7 #0.7125

기존에 rwightman님이 사용한 코드에는 argparse를 이용해서 모델 특성이나 카메라 인풋의 크기를 조절 했다.
우리가 사용하는 시스템에 모델 특성과 카메라 인풋의 크기 조절이 유동적일 필요는 없으니 상수로 고정시켰다.

constants.py

# Names of the keypoints; the list position is the keypoint id.
PART_NAMES = [
    "nose", "leftEye", "rightEye", "leftEar", "rightEar", "leftShoulder",
    "rightShoulder", "leftElbow", "rightElbow", "leftWrist", "rightWrist",
    "leftHip", "rightHip", "leftKnee", "rightKnee", "leftAnkle", "rightAnkle"
]

# NUM_KEYPOINTS = number of body parts
NUM_KEYPOINTS = len(PART_NAMES)

# PART_IDS = dictionary mapping each part name to its index
PART_IDS = {pn: pid for pid, pn in enumerate(PART_NAMES)}

# CONNECTED_PART_NAMES = list of pairs of connected parts
CONNECTED_PART_NAMES = [
    ("leftHip", "leftShoulder"), ("leftElbow", "leftShoulder"),
    ("leftElbow", "leftWrist"), ("leftHip", "leftKnee"),
    ("leftKnee", "leftAnkle"), ("rightHip", "rightShoulder"),
    ("rightElbow", "rightShoulder"), ("rightElbow", "rightWrist"),
    ("rightHip", "rightKnee"), ("rightKnee", "rightAnkle"),
    ("leftShoulder", "rightShoulder"), ("leftHip", "rightHip")
]

# CONNECTED_PART_INDICES = the same pairs expressed as part indices
CONNECTED_PART_INDICES = [(PART_IDS[a], PART_IDS[b]) for a, b in CONNECTED_PART_NAMES]

# LOCAL_MAXIMUM_RADIUS = size of the neighbourhood considered when detecting keypoints
LOCAL_MAXIMUM_RADIUS = 1

# POSE_CHAIN = list of (parent, child) part pairs followed during pose estimation
POSE_CHAIN = [
    ("nose", "leftEye"), ("leftEye", "leftEar"), ("nose", "rightEye"),
    ("rightEye", "rightEar"), ("nose", "leftShoulder"),
    ("leftShoulder", "leftElbow"), ("leftElbow", "leftWrist"),
    ("leftShoulder", "leftHip"), ("leftHip", "leftKnee"),
    ("leftKnee", "leftAnkle"), ("nose", "rightShoulder"),
    ("rightShoulder", "rightElbow"), ("rightElbow", "rightWrist"),
    ("rightShoulder", "rightHip"), ("rightHip", "rightKnee"),
    ("rightKnee", "rightAnkle")
]

# PARENT_CHILD_TUPLES = POSE_CHAIN expressed as pairs of part indices
PARENT_CHILD_TUPLES = [(PART_IDS[parent], PART_IDS[child]) for parent, child in POSE_CHAIN]

# PART_CHANNELS = names of the per-part colour channels of the output mask
PART_CHANNELS = [
  'left_face',
  'right_face',
  'right_upper_leg_front',
  'right_lower_leg_back',
  'right_upper_leg_back',
  'left_lower_leg_front',
  'left_upper_leg_front',
  'left_upper_leg_back',
  'left_lower_leg_back',
  'right_feet',
  'right_lower_leg_front',
  'left_feet',
  'torso_front',
  'torso_back',
  'right_upper_arm_front',
  'right_upper_arm_back',
  'right_lower_arm_back',
  'left_lower_arm_front',
  'left_upper_arm_front',
  'left_upper_arm_back',
  'left_lower_arm_back',
  'right_hand',
  'right_lower_arm_front',
  'left_hand'
]

부위별 좌표를 알아내는 posenet 모델의 특성상 부위의 이름이 문자열 형태로 있어야하고, 부위를 각각 선으로 연결하여 뼈대를 만들기 때문에 부위들 간의 인과 관계를 묶어서 저장한다.

decode_multi.py

from posenet.decode import *
from posenet.constants import *
import time
import scipy.ndimage as ndi

def within_nms_radius_fast(pose_coords, squared_nms_radius, point):
    """Tell whether ``point`` lies within the NMS radius of any given coordinate.

    Args:
        pose_coords: Array with one coordinate per row (may have zero rows).
        squared_nms_radius: Squared radius; the comparison is inclusive.
        point: Coordinate compared against every row of ``pose_coords``.

    Returns:
        ``False`` when ``pose_coords`` has no rows, otherwise the result of
        ``np.any`` over the per-row distance test.
    """
    if pose_coords.shape[0] == 0:
        return False
    squared_distances = ((pose_coords - point) ** 2).sum(axis=1)
    return np.any(squared_distances <= squared_nms_radius)

부위에 찍힌 keypoint 사이의 거리를 계산한다.

def get_instance_score_fast(
        exist_pose_coords,
        squared_nms_radius,
        keypoint_scores, keypoint_coords):
    """Average the keypoint scores that do not overlap already accepted poses.

    A keypoint only contributes if it is farther than the NMS radius from the
    same keypoint of every pose in ``exist_pose_coords``. The sum is always
    divided by the total number of keypoints.

    Args:
        exist_pose_coords: Coordinates of the accepted poses, indexed
            (pose, keypoint, coordinate); may contain zero poses.
        squared_nms_radius: Squared NMS radius.
        keypoint_scores: Scores of the candidate pose's keypoints.
        keypoint_coords: Coordinates of the candidate pose's keypoints.

    Returns:
        The instance score of the candidate pose.
    """
    if not exist_pose_coords.shape[0]:
        # Nothing accepted yet: every keypoint counts.
        score_sum = np.sum(keypoint_scores)
    else:
        squared_dist = np.sum((exist_pose_coords - keypoint_coords) ** 2, axis=2)
        clear_of_all_poses = np.all(squared_dist > squared_nms_radius, axis=0)
        score_sum = np.sum(keypoint_scores[clear_of_all_poses])
    return score_sum / len(keypoint_scores)

사람일 확률을 받아온다. 일정 확률이 넘어가면 사람으로 인정된다.

def build_part_with_score_fast(score_threshold, local_max_radius, scores):
    """Collect the local score maxima of every keypoint channel.

    Scores below ``score_threshold`` are ignored. A cell is reported when it
    equals the maximum of its ``(2 * local_max_radius + 1)`` wide
    neighbourhood and is greater than zero.

    Args:
        score_threshold: Minimum score of a candidate.
        local_max_radius: Radius of the neighbourhood used for the maximum test.
        scores: Score array indexed (row, column, keypoint id).

    Returns:
        List of ``(score, keypoint_id, np.array((row, column)))`` tuples,
        ordered by keypoint id and then row-major position.
    """
    window = 2 * local_max_radius + 1
    candidates = []

    for keypoint_id in range(scores.shape[2]):
        channel = scores[:, :, keypoint_id].copy()
        channel[channel < score_threshold] = 0

        neighbourhood_max = ndi.maximum_filter(channel, size=window, mode='constant')
        is_peak = (channel == neighbourhood_max) & (channel > 0)

        candidates.extend(
            (scores[y, x, keypoint_id], keypoint_id, np.array((y, x)))
            for y, x in zip(*np.nonzero(is_peak))
        )

    return candidates

부위별 확률을 계산하여 리턴한다.
단, score_threshold 미만의 값은 제외되며, 주변 영역에서 가장 큰 값(local maximum)만 리턴시킨다.

# Decode the poses of multiple people from the network outputs.
def decode_multiple_poses(
        scores, offsets, displacements_fwd, displacements_bwd, output_stride,
        max_pose_detections=10, score_threshold=0.5, nms_radius=20, min_pose_score=0.5):
    """Decode up to ``max_pose_detections`` poses from the raw network outputs.

    Candidate root keypoints are processed from highest to lowest score; a
    candidate is skipped when the same keypoint of an already accepted pose
    lies within ``nms_radius`` of it.

    Returns:
        Tuple ``(pose_scores, pose_keypoint_scores, pose_keypoint_coords)``.
        The arrays always have ``max_pose_detections`` rows; rows that were
        not filled stay zero.
    """

    pose_count = 0
    pose_scores = np.zeros(max_pose_detections)
    # Scores of all keypoints of every instance (2-D array)
    pose_keypoint_scores = np.zeros((max_pose_detections, NUM_KEYPOINTS))
    # Coordinates of all keypoints of every instance (3-D array)
    pose_keypoint_coords = np.zeros((max_pose_detections, NUM_KEYPOINTS, 2))

    squared_nms_radius = nms_radius ** 2

    # Collect the candidate parts and sort them by descending score
    scored_parts = build_part_with_score_fast(score_threshold, LOCAL_MAXIMUM_RADIUS, scores)
    scored_parts = sorted(scored_parts, key=lambda x: x[0], reverse=True)
    height = scores.shape[0]
    width = scores.shape[1]
    # Reshape each array and swap its last two axes so it is indexed
    # [row, column, keypoint-or-edge, 2]
    offsets = offsets.reshape(height, width, 2, -1).swapaxes(2, 3)
    displacements_fwd = displacements_fwd.reshape(height, width, 2, -1).swapaxes(2, 3)
    displacements_bwd = displacements_bwd.reshape(height, width, 2, -1).swapaxes(2, 3)


    for root_score, root_id, root_coord in scored_parts:

        # Image coordinates of the root point: grid position * stride + offset
        root_image_coords = root_coord * output_stride + offsets[
            root_coord[0], root_coord[1], root_id]

        # Skip the candidate if it duplicates a pose that was already accepted
        if within_nms_radius_fast(
                pose_keypoint_coords[:pose_count, root_id, :], squared_nms_radius, root_image_coords):
            continue

        # Otherwise decode all parts connected to the root via decode_pose
        keypoint_scores, keypoint_coords = decode_pose(
            root_score, root_id, root_image_coords,
            scores, offsets, output_stride,
            displacements_fwd, displacements_bwd)

        # Score of this instance, counting only keypoints that do not overlap accepted poses
        pose_score = get_instance_score_fast(
            pose_keypoint_coords[:pose_count, :, :], squared_nms_radius, keypoint_scores, keypoint_coords)

        # Repeating these steps gathers every pose present in the frame

        # Store the detected pose; a min_pose_score of 0 accepts every pose
        if min_pose_score == 0. or pose_score >= min_pose_score:
            pose_scores[pose_count] = pose_score
            pose_keypoint_scores[pose_count, :] = keypoint_scores
            pose_keypoint_coords[pose_count, :, :] = keypoint_coords
            pose_count += 1

        # Stop once the output arrays are full
        if pose_count >= max_pose_detections:
            break

    # NumPy arrays with the pose scores, the keypoint scores and the keypoint
    # coordinates (the number of detected poses itself is not returned)
    return pose_scores, pose_keypoint_scores, pose_keypoint_coords

predict하여 도출된 값들을 정리하여 활용가능하게 만들어준다.
자세한 동작은 주석을 참고

decode.py

import numpy as np

from posenet.constants import *



def traverse_to_targ_keypoint(
        edge_id, source_keypoint, target_keypoint_id, scores, offsets, output_stride, displacements):
    """Follow one displacement edge from a source keypoint to its target keypoint.

    Args:
        edge_id: Index of the edge inside ``displacements``.
        source_keypoint: Image coordinates of the source keypoint.
        target_keypoint_id: Keypoint id of the target.
        scores: Score array indexed (row, column, keypoint id).
        offsets: Offset array indexed (row, column, keypoint id).
        output_stride: Ratio between image coordinates and grid cells.
        displacements: Displacement array indexed (row, column, edge id).

    Returns:
        Tuple ``(score, image_coord)`` of the target keypoint.
    """
    last_cell = [scores.shape[0] - 1, scores.shape[1] - 1]

    def _to_grid(point):
        # Nearest grid cell, clamped so the index never leaves the arrays.
        return np.clip(np.round(point / output_stride), a_min=0, a_max=last_cell).astype(np.int32)

    src_row, src_col = _to_grid(source_keypoint)

    # Move from the source keypoint along the edge towards the target.
    displaced_point = source_keypoint + displacements[src_row, src_col, edge_id]

    target_cell = _to_grid(displaced_point)
    tgt_row, tgt_col = target_cell

    score = scores[tgt_row, tgt_col, target_keypoint_id]
    # The final position is rebuilt from the grid cell plus its offset vector.
    image_coord = target_cell * output_stride + offsets[tgt_row, tgt_col, target_keypoint_id]

    return score, image_coord

주어진 edge_id와 source_keypoint를 사용하여 target_keypoint_id와 일치하는 키포인트를 찾는 함수

def decode_pose(
        root_score, root_id, root_image_coord,
        scores,
        offsets,
        output_stride,
        displacements_fwd,
        displacements_bwd
):
    """Decode all keypoints of one pose starting from its root keypoint.

    The pose chain is walked twice: first in reverse order from child to
    parent using the backward displacements, then in forward order from
    parent to child using the forward displacements. A keypoint is only
    filled in when its source already has a positive score and it has no
    score itself yet.

    Returns:
        Tuple ``(instance_keypoint_scores, instance_keypoint_coords)``.
    """
    num_parts = scores.shape[2]

    kp_scores = np.zeros(num_parts)
    kp_coords = np.zeros((num_parts, 2))
    kp_scores[root_id] = root_score
    kp_coords[root_id] = root_image_coord

    def _propagate(edge, source_id, target_id, displacements):
        # Only extend from a known keypoint to one that is still unknown.
        if kp_scores[source_id] > 0.0 and kp_scores[target_id] == 0.0:
            score, coords = traverse_to_targ_keypoint(
                edge, kp_coords[source_id],
                target_id,
                scores, offsets, output_stride, displacements)
            kp_scores[target_id] = score
            kp_coords[target_id] = coords

    edges = list(enumerate(PARENT_CHILD_TUPLES))

    # Backward pass: last edge first, walking from child to parent.
    for edge, (parent_id, child_id) in reversed(edges):
        _propagate(edge, child_id, parent_id, displacements_bwd)

    # Forward pass: first edge first, walking from parent to child.
    for edge, (parent_id, child_id) in edges:
        _propagate(edge, parent_id, child_id, displacements_fwd)

    return kp_scores, kp_coords

예측한 키포인트 관련 정보를 사용하여 이미지 상에서 실제 키포인트 좌표를 디코딩하는 함수
자세한 동작 과정은 주석을 참고

detection.py

import cv2
import posenet

# Body of the former main while-loop, processing a single frame.
def detection(frame, model_cfg, model_outputs, sess, gesture, command, parts):
    """Run PoseNet on one frame and draw the result.

    With ``gesture is True`` the poses go to ``posenet.figure_out_command``
    (which updates ``command`` and ``parts``); otherwise they go to
    ``posenet.draw_part_name`` (which yields the trigger flag).

    Returns:
        Tuple ``(overlay_image, trigger, command, parts)``; the overlay is
        flipped horizontally and ``trigger`` stays ``None`` in gesture mode.
    """
    stride = model_cfg['output_stride']
    input_image, display_image, output_scale = posenet.read_cap(
        frame, scale_factor=posenet.SCALE_FACTOR, output_stride=stride)

    # Inference: evaluate the four output tensors for this frame.
    heatmaps, offsets, disp_fwd, disp_bwd = sess.run(
        model_outputs,
        feed_dict={'image:0': input_image}
    )

    pose_scores, keypoint_scores, keypoint_coords = posenet.decode_multi.decode_multiple_poses(
        heatmaps.squeeze(axis=0),
        offsets.squeeze(axis=0),
        disp_fwd.squeeze(axis=0),
        disp_bwd.squeeze(axis=0),
        output_stride=stride,
        max_pose_detections=10,
        min_pose_score=0.15)

    # Scale the coordinates from network-input size back to the source frame.
    keypoint_coords *= output_scale

    trigger = None
    if gesture is True:
        overlay_image, command, parts = posenet.figure_out_command(
            display_image, pose_scores, keypoint_scores, keypoint_coords, command, parts)
    else:
        overlay_image, trigger = posenet.draw_part_name(
            display_image, pose_scores, keypoint_scores, keypoint_coords,
            min_pose_score=0.15, min_part_score=0.1)

    return cv2.flip(overlay_image, 1), trigger, command, parts

본문의 반복문에 있던 코드를 하나의 함수로 구현했다.
앞 부분은 현재 프레임을 predict 하는 부분이다. sess.run에서 predict가 실행되고, decode_multiple_poses에서 그 결과를 포즈로 디코딩한다.
매개변수로 받는 gesture 라는 변수를 기준으로 figure_out_command 함수로 가거나 draw_part_name을 실행한다.
draw_part_name은 평시 상태에서 사용되며, 부위별 좌표를 받아내는 것 이외 동작은 없다.
figure_out_command는 제스처 모드에 사용되며, 제스처를 구분하는 제스처 모드가 바로 이것이다.

gesture_mode.py

박세빈님이 구현한 내용으로 대체

model.py

import posenet.converter.config

#텐서플로우 버전 1을 사용하도록 
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() #tensorflow V2의 함수를 비활성화

import os
# 경고 수준을 2단계로 설정(경고 비활성화)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Default directory of the converted model files, relative to the current working directory.
MODEL_DIR = './_models'

def model_id_to_ord(model_id):
    """Map a model id to the ordinal of its checkpoint.

    Ids 0..3 are already ordinals and are returned unchanged. 50, 75 and 100
    map to 0, 1 and 2; every other value (e.g. 101) maps to 3.
    """
    if 0 <= model_id < 4:
        return model_id  # id is already ordinal
    ordinal_by_id = {50: 0, 75: 1, 100: 2}
    return ordinal_by_id.get(model_id, 3)

파라미터에 따라 다른 모델을 불러온다.
여기서는 101만 불러오기 때문에 3을 리턴시킨다.

def load_config(model_ord):
    """Build the model configuration for a checkpoint ordinal.

    Reads the converter's config.yaml and picks the checkpoint name at index
    ``model_ord`` of its ``checkpoints`` list.

    Returns:
        Dict with the keys ``output_stride`` and ``checkpoint_name``.
    """
    converter_cfg = posenet.converter.config.load_config()

    # Example from the original notes: checkpoints is
    # ['mobilenet_v1_050', 'mobilenet_v1_075', 'mobilenet_v1_100', 'mobilenet_v1_101']
    # and outputStride is 16, so ordinal 3 selects 'mobilenet_v1_101'.
    return {
        'output_stride': converter_cfg['outputStride'],
        'checkpoint_name': converter_cfg['checkpoints'][model_ord],
    }

레이어와 모델이 들어있는 config.yaml 파일을 읽어오는 부분이다.

def load_model(model_id, sess, model_dir=MODEL_DIR):
    """Load the frozen PoseNet graph into ``sess.graph``.

    If the frozen model file does not exist yet, it is first generated from
    the tfjs weights through the converter.

    Args:
        model_id: Model id or ordinal (see ``model_id_to_ord``).
        sess: TensorFlow session whose graph receives the model.
        model_dir: Directory containing (or receiving) the ``.pb`` file.

    Returns:
        Tuple ``(model_cfg, outputs)`` where ``model_cfg`` holds
        ``output_stride`` and ``checkpoint_name`` and ``outputs`` is the list
        ``[heatmaps, offsets, displacement_fwd, displacement_bwd]``.
    """
    model_ord = model_id_to_ord(model_id)
    model_cfg = load_config(model_ord)
    model_path = os.path.join(model_dir, 'model-%s.pb' % model_cfg['checkpoint_name'])

    # First run: create the frozen model with the converter's convert().
    if not os.path.exists(model_path):
        print('Cannot find model file %s, converting from tfjs...' % model_path)
        from posenet.converter.tfjs2python import convert
        convert(model_ord, model_dir, check=False)
        assert os.path.exists(model_path)

    # The file must be parsed while it is still open; previously the read
    # happened after the with-block had already closed the handle.
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(model_path, 'rb') as f:
        graph_def.ParseFromString(f.read())

    # as_default() only takes effect as a context manager; entering it makes
    # sure the nodes are imported into the session's own graph.
    with sess.graph.as_default():
        tf.import_graph_def(graph_def, name ='')

    offsets = sess.graph.get_tensor_by_name('offset_2:0')
    displacement_fwd = sess.graph.get_tensor_by_name('displacement_fwd_2:0')
    displacement_bwd = sess.graph.get_tensor_by_name('displacement_bwd_2:0')
    heatmaps = sess.graph.get_tensor_by_name('heatmap:0')

    return model_cfg, [heatmaps, offsets, displacement_fwd, displacement_bwd]

converter 모듈로 최초에 _models 를 생성하고 나서 _models 의 내용을 읽어오는 부분으로 구성되어 있다.
초기에 argparse에 의해 model의 특징을 선택 할 수 있도록 되어져 있었는데, 이 부분은 그대로 사용했다.

utils.py

import cv2 
import numpy as np  

import posenet.constants


# Input: image width, image height and the output stride.
def valid_resolution(width, height, output_stride=16):
    """Return the resolution the network is fed with.

    Each dimension is truncated to an int, rounded down to a multiple of
    ``output_stride`` and increased by one.

    Returns:
        Tuple ``(target_width, target_height)``.
    """
    def _snap(size):
        whole = int(size)
        return whole - whole % output_stride + 1

    return _snap(width), _snap(height)

유효한 해상도를 계산하는 함수

def _process_input(source_img, scale_factor=1.0, output_stride=16):
    """Turn a BGR frame into the network input.

    Args:
        source_img: Source image; ``shape[0]`` is read as height and
            ``shape[1]`` as width.
        scale_factor: Factor applied to the source size before it is snapped
            to a valid resolution.
        output_stride: Output stride of the model.

    Returns:
        Tuple ``(input_img, source_img, scale)``: the preprocessed batch of
        shape ``(1, target_height, target_width, 3)``, the untouched source
        image, and the ``[height, width]`` ratios between source and target size.
    """
    src_height, src_width = source_img.shape[0], source_img.shape[1]
    target_width, target_height = valid_resolution(
        src_width * scale_factor, src_height * scale_factor, output_stride=output_stride)

    # Ratios used later to map decoded coordinates back onto the source image.
    scale = np.array([src_height / target_height, src_width / target_width])

    # Bilinear resize to the target size, then BGR -> RGB as float32.
    resized = cv2.resize(source_img, (target_width, target_height), interpolation=cv2.INTER_LINEAR)
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB).astype(np.float32)

    # Rescale pixel values from 0..255 to -1..1.
    normalized = rgb * (2.0 / 255.0) - 1.0

    # Add the batch dimension (a single image per batch).
    input_img = normalized.reshape(1, target_height, target_width, 3)

    return input_img, source_img, scale

입력 이미지를 처리하는 함수이다.
입력 이미지는 source_img로 제공, scale_factor와 output_stride 매개변수를 사용하여 이미지를 전처리한다.

def read_cap(cap, scale_factor=1.0, output_stride=16):
    """Preprocess one frame by delegating to ``_process_input``.

    Despite its name, ``cap`` is forwarded as the image itself; nothing is
    read from a capture device in this function.

    Returns:
        Whatever ``_process_input`` returns for the frame.
    """
    return _process_input(cap, scale_factor=scale_factor, output_stride=output_stride)

웹캠에서 읽은 영상 프레임을 입력으로 받아 함수 호출을 통한 처리 후 반환하는 함수

def get_adjacent_keypoints(keypoint_scores, keypoint_coords, min_confidence=0.1):
    """Return the endpoints of every sufficiently confident connection.

    A connection from ``posenet.CONNECTED_PART_INDICES`` is skipped when
    either of its keypoints scores below ``min_confidence``.

    Returns:
        List of int32 arrays, one per kept connection, holding both endpoints
        with the two coordinate components reversed.
    """
    return [
        np.array([keypoint_coords[first][::-1], keypoint_coords[second][::-1]]).astype(np.int32)
        for first, second in posenet.CONNECTED_PART_INDICES
        if not (keypoint_scores[first] < min_confidence or keypoint_scores[second] < min_confidence)
    ]

점의 좌표를 가지고 오는 함수 (연결된 키포인트들의 좌표를 계산)

# Draw the part names onto the image.
def draw_part_name(
        img, instance_scores, keypoint_scores, keypoint_coords,
        min_pose_score=0.5, min_part_score=0.5):
    """Label the detected keypoints and report whether a wrist is above the nose.

    Args:
        img: Image that is drawn on in place.
        instance_scores: Score of every pose slot; a score of 0 marks an
            unused slot.
        keypoint_scores: Keypoint scores indexed (pose, keypoint).
        keypoint_coords: Keypoint coordinates indexed (pose, keypoint), stored
            as (y, x).
        min_pose_score: Poses scoring below this value are ignored.
        min_part_score: Only keypoints scoring above this value are drawn.

    Returns:
        Tuple ``(out_img, trigger)``. ``trigger`` is True when the nose and at
        least one wrist were drawn and that wrist is higher in the image
        (smaller y) than the nose. With several poses the values of the last
        drawn pose win.
    """
    out_img = img
    font = cv2.FONT_HERSHEY_SIMPLEX

    # Indices into posenet.PART_NAMES.
    nose_id, left_wrist_id, right_wrist_id = 0, 9, 10

    # None means "not detected". The previous 0 sentinel made an undetected
    # wrist look like a wrist at the very top of the image, which fired the
    # trigger whenever only one wrist was visible.
    nose_y = None
    wrist_y = {left_wrist_id: None, right_wrist_id: None}

    # One pass per pose. The former nested loop redrew every pose once per
    # qualifying pose and ignored min_pose_score for the redrawn ones.
    for pose_idx, pose_score in enumerate(instance_scores):
        if pose_score == 0. or pose_score < min_pose_score:
            continue

        pose_keypoints = zip(keypoint_scores[pose_idx, :], keypoint_coords[pose_idx, :, :])
        for part_id, (part_score, coord) in enumerate(pose_keypoints):
            if part_score > min_part_score:
                # Coordinates are stored as (y, x); OpenCV expects (x, y) as plain ints.
                x, y = int(coord[1]), int(coord[0])
                cv2.putText(out_img, posenet.PART_NAMES[part_id], (x, y), font, 1, (0, 0, 0), 1)

                if part_id == nose_id:
                    nose_y = y
                elif part_id in wrist_y:
                    wrist_y[part_id] = y

    # Smaller y means higher up in the image.
    trigger = nose_y is not None and any(
        y is not None and y < nose_y for y in wrist_y.values())
    if trigger:
        print('hand is higher than nose now')

    return out_img, trigger

입력 이미지 위에 PoseNet 알고리즘이 예측한 결과를 시각화하여 출력 이미지를 반환하는 함수

KimMin-Gwan · 2023-03-26T14:02:55Z

KimMin-Gwan
Mar 26, 2023
Maintainer Author

함수별로 분해해서 다시 수정 및 설명하겠습니다.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Posenet.py 보고서 #53

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 1 comment

{{title}}

Select a reply

Posenet.py 보고서 #53

KimMin-Gwan Mar 26, 2023 Maintainer

Posenet 개요

converter 모듈

config.py

tfjs2python.py

wget

Posenet 패키지

argument.py

constants.py

decode_multi.py

decode.py

detection.py

gesture_mode.py

model.py

utils.py

Replies: 1 comment

KimMin-Gwan Mar 26, 2023 Maintainer Author

KimMin-Gwan
Mar 26, 2023
Maintainer

KimMin-Gwan
Mar 26, 2023
Maintainer Author