---
layout: hub_detail
background-class: hub-background
body-class: hub
category: researchers
title: X3D
summary: X3D networks pretrained on the Kinetics 400 dataset
image: x3d.png
author: FAIR PyTorchVideo
tags: [vision]
github-link:
github-id: facebookresearch/pytorchvideo
featured_image_1: no-image
featured_image_2: no-image
accelerator: "cuda-optional"
demo-model-link:
---

### Usage Example

#### Imports

λͺ¨λΈ 뢈러였기:

```python
import torch
# Choose the `x3d_s` model
model_name = 'x3d_s'
model = torch.hub.load('facebookresearch/pytorchvideo', model_name, pretrained=True)
```
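Other X3D variants can be loaded the same way by changing `model_name`; the transform parameters and the accuracy table below cover `x3d_xs`, `x3d_s`, and `x3d_m`. A minimal sketch, assuming those variant names are exposed by the same hub entry point:

```python
# Quick comparison of the variants documented below (assumed to be available on the hub)
for variant in ["x3d_xs", "x3d_s", "x3d_m"]:
    m = torch.hub.load("facebookresearch/pytorchvideo", variant, pretrained=True)
    n_params = sum(p.numel() for p in m.parameters()) / 1e6
    print(f"{variant}: {n_params:.2f}M parameters")
```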

Import the remaining functions:

```python
import json
import urllib.request
from pytorchvideo.data.encoded_video import EncodedVideo

from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)
```

#### Setup

λͺ¨λΈμ„ 평가 λͺ¨λ“œλ‘œ μ„€μ •ν•˜κ³  μ›ν•˜λŠ” λ””λ°”μ΄μŠ€ 방식을 μ„ νƒν•©λ‹ˆλ‹€.

```python
# Set to GPU or CPU
device = "cpu"
model = model.eval()
model = model.to(device)
```
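This page is tagged cuda-optional, so the same pipeline can run on a GPU when one is present. A minimal sketch of that device selection (the rest of this example keeps `device = "cpu"`):

```python
# Use CUDA when available, otherwise stay on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.eval().to(device)
```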

Download the id-to-label mapping for the Kinetics 400 dataset on which the torch hub models were trained. This will be used to get the category label names from the predicted class ids.

```python
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
urllib.request.urlretrieve(json_url, json_filename)
with open(json_filename, "r") as f:
    kinetics_classnames = json.load(f)

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")
```
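As a quick sanity check, the resulting dictionary should contain one entry per Kinetics 400 class; a minimal sketch that inspects it:

```python
# Expect 400 ids, one per Kinetics 400 class
print(len(kinetics_id_to_classname))

# Show a few (id, label) pairs
for class_id in sorted(kinetics_id_to_classname)[:5]:
    print(class_id, kinetics_id_to_classname[class_id])
```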

#### Define the input transform

```python
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
frames_per_second = 30
model_transform_params = {
    "x3d_xs": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 4,
        "sampling_rate": 12,
    },
    "x3d_s": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 13,
        "sampling_rate": 6,
    },
    "x3d_m": {
        "side_size": 256,
        "crop_size": 256,
        "num_frames": 16,
        "sampling_rate": 5,
    }
}

# λͺ¨λΈμ— λ§žλŠ” λ³€ν™˜ λ§€κ°œλ³€μˆ˜ κ°€μ Έμ˜€κΈ°
transform_params = model_transform_params[model_name]

# Note that this transform is specific to the X3D networks; it uses the
# per-model parameters selected above.
transform = ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(transform_params["num_frames"]),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(size=transform_params["side_size"]),
            CenterCropVideo(
                crop_size=(transform_params["crop_size"], transform_params["crop_size"])
            )
        ]
    ),
)

# The duration of the input clip is also specific to the model.
clip_duration = (transform_params["num_frames"] * transform_params["sampling_rate"])/frames_per_second
```
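To make the formula concrete: at 30 frames per second this works out to 4 × 12 / 30 = 1.6 s for `x3d_xs`, 13 × 6 / 30 = 2.6 s for `x3d_s`, and 16 × 5 / 30 ≈ 2.67 s for `x3d_m`. A minimal sketch that prints the duration for every variant:

```python
# Clip duration in seconds = num_frames * sampling_rate / frames_per_second
for name, params in model_transform_params.items():
    duration = params["num_frames"] * params["sampling_rate"] / frames_per_second
    print(f"{name}: {duration:.2f} s")
```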

#### Run Inference

Download an example video.

```python
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'
urllib.request.urlretrieve(url_link, video_path)
```

μ˜μƒμ„ 뢈러였고 λͺ¨λΈμ— ν•„μš”ν•œ μž…λ ₯ ν˜•μ‹μœΌλ‘œ λ³€ν™˜ν•©λ‹ˆλ‹€.

```python
# Select the duration of the clip to load by specifying the start and end duration.
# The start_sec should correspond to where the action occurs in the video.
start_sec = 0
end_sec = start_sec + clip_duration

# Initialize an EncodedVideo helper class and load the video
video = EncodedVideo.from_path(video_path)

# Load the desired clip
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

# Apply a transform to normalize the video input
video_data = transform(video_data)

# Move the inputs to the desired device
inputs = video_data["video"]
inputs = inputs.to(device)
```
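After the transform, `inputs` is a single clip tensor laid out as `(channels, frames, height, width)`, which for `x3d_s` should be `(3, 13, 182, 182)` given the parameters above. A minimal, optional sanity check:

```python
# Expect (C, T, H, W), e.g. torch.Size([3, 13, 182, 182]) for x3d_s
print(inputs.shape)
assert inputs.shape[1] == transform_params["num_frames"]
assert inputs.shape[2] == inputs.shape[3] == transform_params["crop_size"]
```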

#### Get Predictions

# λͺ¨λΈμ„ 톡해 μž…λ ₯클립을 μ „λ‹¬ν•©λ‹ˆλ‹€.
preds = model(inputs[None, ...])

# 예츑된 클래슀λ₯Ό κ°€μ Έμ˜΅λ‹ˆλ‹€.
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices[0]

# 예츑된 클래슀λ₯Ό λ ˆμ΄λΈ” 이름에 λ§€ν•‘ν•©λ‹ˆλ‹€.
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))

λͺ¨λΈ μ„€λͺ…

The X3D model architectures are based on [1] and pretrained on the Kinetics dataset.

| arch | depth | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) |
| ---- | ----- | -------------------------- | ----- | ----- | --------- | ---------- |
| X3D  | XS    | 4x12                       | 69.12 | 88.63 | 0.91      | 3.79       |
| X3D  | S     | 13x6                       | 73.33 | 91.27 | 2.96      | 3.79       |
| X3D  | M     | 16x5                       | 75.94 | 92.72 | 6.72      | 3.79       |

### References

[1] Christoph Feichtenhofer, "X3D: Expanding Architectures for Efficient Video Recognition." https://arxiv.org/abs/2004.04730