Introduction
In the previous article, I explained how to create the input data to pass to 3D-ResNets-PyTorch.
This time, I will cover converting that data to a Tensor and checking the inference results.
Prerequisites
The prerequisites are as follows.
- Python 3.9
- torch == 1.13.0+cu117, torchvision == 0.14.0+cu117
- All work is done on WSL2
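Before running anything, it can help to confirm the environment from inside WSL2. A minimal sketch (the expected values in the comments assume the versions listed above):

# Quick environment check (run inside WSL2).
import torch
import torchvision

print(torch.__version__)          # expected: 1.13.0+cu117
print(torchvision.__version__)    # expected: 0.14.0+cu117
print(torch.cuda.is_available())  # True if the GPU is visible from WSL2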
Creating the inference program
The inference program is shown below.
The main_inference.py created in the earlier article is required.
inference_custom.py
import time
import json
from collections import defaultdict

import torch
import torch.nn.functional as F
from PIL import Image
import cv2
import numpy as np

from utils import AverageMeter
from spatial_transforms import (Compose, Normalize, Resize, CenterCrop,
                                ToTensor, ScaleValue, PickFirstChannels)


def get_video_results(outputs, class_names, output_topk):
    sorted_scores, locs = torch.topk(outputs,
                                     k=min(output_topk, len(class_names)))

    video_results = []
    for i in range(sorted_scores.size(0)):
        video_results.append({
            'label': class_names[locs[i].item()],
            'score': sorted_scores[i].item()
        })

    return video_results


def get_normalize_method(mean, std, no_mean_norm, no_std_norm):
    if no_mean_norm:
        if no_std_norm:
            return Normalize([0, 0, 0], [1, 1, 1])
        else:
            return Normalize([0, 0, 0], std)
    else:
        if no_std_norm:
            return Normalize(mean, [1, 1, 1])
        else:
            return Normalize(mean, std)


def inference(data_loader, model, result_path, class_names, no_average,
              output_topk, opt):
    print('inference')

    # Build the same spatial transform that the inference data loader uses.
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_transform.append(CenterCrop(opt.sample_size))
    spatial_transform.append(ToTensor())
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    model.eval()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    results = {'results': defaultdict(list)}

    end_time = time.time()
    result_list1 = []
    result_list2 = []

    with torch.no_grad():
        for i, (inputs, targets) in enumerate(data_loader):
            data_time.update(time.time() - end_time)

            # Load 16 frames by hand instead of going through the data loader.
            video = []
            for frame_idx in range(1, 17):
                img = Image.open(
                    "../UCF101_images/UCF101/ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01/image_{:05}.jpg".format(frame_idx))
                video.append(img.convert('RGB'))

            # single data
            spatial_transform.randomize_parameters()
            clip = [spatial_transform(img) for img in video]
            clips = torch.unsqueeze(torch.stack(clip, 0).permute(1, 0, 2, 3), 0)

            # multiple data
            clip1 = [spatial_transform(img) for img in video]
            clips_ = torch.stack([torch.stack(clip1, 0).permute(1, 0, 2, 3),
                                  torch.stack(clip1, 0).permute(1, 0, 2, 3),
                                  torch.stack(clip1, 0).permute(1, 0, 2, 3)], dim=0)

            # (Optional) visualize the loader input and the custom clip side by side.
            # for i, j, k, l, m, n in zip(inputs[0][0], inputs[0][1], inputs[0][2],
            #                             clips[0][0], clips[0][1], clips[0][2]):
            #     ii = i.cpu().detach().numpy()
            #     jj = j.cpu().detach().numpy()
            #     kk = k.cpu().detach().numpy()
            #     ll = l.cpu().detach().numpy()
            #     mm = m.cpu().detach().numpy()
            #     nn = n.cpu().detach().numpy()
            #     cv2.imshow("input1", ii)
            #     cv2.imshow("input2", jj)
            #     cv2.imshow("input3", kk)
            #     cv2.imshow("clip1", ll)
            #     cv2.imshow("clip2", mm)
            #     cv2.imshow("clip3", nn)
            #     cv2.waitKey(0)

            # Inference on the data loader's input.
            video_ids, segments = zip(*targets)
            outputs = model(inputs)
            outputs = F.softmax(outputs, dim=1).cpu()

            for j in range(outputs.size(0)):
                results['results'][video_ids[j]].append({
                    'segment': segments[j],
                    'output': outputs[j]
                })

            sorted_scores, locs = torch.topk(torch.mean(outputs, dim=0),
                                             k=min(output_topk, len(class_names)))
            result_list1.append([sorted_scores.item(), locs.item()])
            print(result_list1)

            # Inference on the custom clip built above.
            outputs = model(clips)
            outputs = F.softmax(outputs, dim=1).cpu()
            sorted_scores, locs = torch.topk(torch.mean(outputs, dim=0),
                                             k=min(output_topk, len(class_names)))
            result_list2.append([sorted_scores.item(), locs.item()])
            print(result_list2)

            break  # only the first video is needed for this check

            batch_time.update(time.time() - end_time)
            end_time = time.time()

            print('[{}/{}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format(
                      i + 1,
                      len(data_loader),
                      batch_time=batch_time,
                      data_time=data_time))

    inference_results = {'results': {}}
    if not no_average:
        for video_id, video_results in results['results'].items():
            video_outputs = [
                segment_result['output'] for segment_result in video_results
            ]
            video_outputs = torch.stack(video_outputs)
            average_scores = torch.mean(video_outputs, dim=0)
            inference_results['results'][video_id] = get_video_results(
                average_scores, class_names, output_topk)

    with result_path.open('w') as f:
        json.dump(inference_results, f)
The Tensor for inference is built in the part shown below (the block commented # single data / # multiple data).
# single data
spatial_transform.randomize_parameters()
clip = [spatial_transform(img) for img in video]
clips = torch.unsqueeze(torch.stack(clip, 0).permute(1, 0, 2, 3), 0)

# multiple data
clip1 = [spatial_transform(img) for img in video]
clips_ = torch.stack([torch.stack(clip1, 0).permute(1, 0, 2, 3),
                      torch.stack(clip1, 0).permute(1, 0, 2, 3),
                      torch.stack(clip1, 0).permute(1, 0, 2, 3)], dim=0)
Use the # single data variant to run inference on a single 16-frame clip, and the # multiple data variant to run inference on several 16-frame clips at once.
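For reference, the shapes these two variants produce look like this (a sketch; it assumes the default 112×112 crop size and 16 frames per clip):

# Expected shapes of the tensors built above:
#   clips  -> a batch containing one 16-frame clip
#   clips_ -> a batch containing three 16-frame clips
print(clips.shape)   # torch.Size([1, 3, 16, 112, 112])
print(clips_.shape)  # torch.Size([3, 3, 16, 112, 112])
# Model input layout: (batch, channels, frames, height, width)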
Inference on the custom data is the part shown below.
outputs = model(clips)
outputs = F.softmax(outputs, dim=1).cpu()
sorted_scores, locs = torch.topk(torch.mean(outputs, dim=0),
                                 k=min(output_topk, len(class_names)))
result_list2.append([sorted_scores.item(), locs.item()])
print(result_list2)
result_list2 holds the score and the class index. The output (result_list1 from the data loader, followed by result_list2 from the custom data) is as follows.
[[0.8918734192848206, 1]]
[[0.8490836024284363, 1]]
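If you want the label string rather than the index, the index can be looked up in class_names, which is already passed into inference(). A minimal sketch:

# Turn the stored [score, class index] pairs into readable labels.
for score, class_idx in result_list2:
    print('{}: {:.4f}'.format(class_names[class_idx], score))
# e.g. prints "ApplyEyeMakeup: 0.8491" if index 1 maps to that class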
Verifying the results
Let's check whether the data-loader inference and the custom-data inference actually produce the same result.
Add the following line to inference_custom.py, immediately after the # multiple data block (just before the commented-out visualization code).
inputs = torch.unsqueeze(inputs[0], 0)
With the default sliding-window inference settings, the data loader returns every 16-frame segment of the video stacked along dim 0, so inputs[0] selects the first segment (the same frames 1–16 we loaded manually) and torch.unsqueeze restores the batch dimension so that the shape matches clips.
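A sketch of the dimension alignment (the concrete shapes assume the default sliding-window settings and 112×112 crops):

# inputs   : [N, 3, 16, 112, 112]  - N sliding-window segments of one video
# inputs[0]: [3, 16, 112, 112]     - the first segment (frames 1-16)
inputs = torch.unsqueeze(inputs[0], 0)   # -> [1, 3, 16, 112, 112]
# clips was built the same way, so both now have identical shapes
assert inputs.shape == clips.shape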
The output is as follows.
[[0.8490234613418579, 1]]
[[0.8490234613418579, 1]]
They match!
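Rather than eyeballing the printed scores, the two outputs can also be compared tensor to tensor. A minimal sketch (reusing inputs and clips from the loop above; the tolerance is an arbitrary choice):

# Compare the loader-based and custom-clip predictions element-wise.
out_loader = F.softmax(model(inputs), dim=1).cpu()   # inputs after unsqueeze
out_custom = F.softmax(model(clips), dim=1).cpu()
print(torch.allclose(out_loader, out_custom, atol=1e-6))  # expected: True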
Summary
Here is a recap of the programs and steps so far.
Clone the repository with git
git clone https://github.com/kenshohara/3D-ResNets-PyTorch.git
cd 3D-ResNets-PyTorch
Create main_inference.py and inference_custom.py
main_inference.py
from pathlib import Path
import json
import random
import os

import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import SGD, lr_scheduler
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.backends import cudnn
import torchvision

from opts import parse_opts
from model import (generate_model, load_pretrained_model, make_data_parallel,
                   get_fine_tuning_parameters)
from mean import get_mean_std
from spatial_transforms import (Compose, Normalize, Resize, CenterCrop,
                                CornerCrop, MultiScaleCornerCrop,
                                RandomResizedCrop, RandomHorizontalFlip,
                                ToTensor, ScaleValue, ColorJitter,
                                PickFirstChannels)
from temporal_transforms import (LoopPadding, TemporalRandomCrop,
                                 TemporalCenterCrop, TemporalEvenCrop,
                                 SlidingWindow, TemporalSubsampling)
from temporal_transforms import Compose as TemporalCompose
from dataset import get_training_data, get_validation_data, get_inference_data
from utils import Logger, worker_init_fn, get_lr
from training import train_epoch
from validation import val_epoch

import inference_custom


def json_serial(obj):
    if isinstance(obj, Path):
        return str(obj)


def get_opt():
    opt = parse_opts()

    opt.mean, opt.std = get_mean_std(opt.value_scale, dataset=opt.mean_dataset)
    opt.n_input_channels = 3
    opt.resume_path = "./results/save_200.pth"
    opt.root_path = "./"
    opt.device = torch.device('cpu' if opt.no_cuda else 'cuda')
    if not opt.no_cuda:
        cudnn.benchmark = True
    opt.video_path = Path("../UCF101_images/UCF101")
    opt.annotation_path = Path("../UCF101_json/ucf101_01.json")
    opt.result_path = Path("./results")
    opt.dataset = "ucf101"
    opt.no_train = True
    opt.no_val = True
    opt.inference = True
    opt.output_topk = 1
    opt.inference_batch_size = 1
    opt.n_threads = 1
    opt.n_classes = 10
    opt.model_depth = 50
    opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
    # print(opt)

    with (opt.result_path / 'opts.json').open('w') as opt_file:
        json.dump(vars(opt), opt_file, default=json_serial)

    return opt


def resume_model(resume_path, arch, model):
    print('loading checkpoint {} model'.format(resume_path))
    checkpoint = torch.load(resume_path, map_location='cpu')
    assert arch == checkpoint['arch']

    if hasattr(model, 'module'):
        model.module.load_state_dict(checkpoint['state_dict'])
    else:
        model.load_state_dict(checkpoint['state_dict'])

    return model


def resume_train_utils(resume_path, begin_epoch, optimizer, scheduler):
    print('loading checkpoint {} train utils'.format(resume_path))
    checkpoint = torch.load(resume_path, map_location='cpu')

    begin_epoch = checkpoint['epoch'] + 1
    if optimizer is not None and 'optimizer' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])
    if scheduler is not None and 'scheduler' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler'])

    return begin_epoch, optimizer, scheduler


def get_normalize_method(mean, std, no_mean_norm, no_std_norm):
    if no_mean_norm:
        if no_std_norm:
            return Normalize([0, 0, 0], [1, 1, 1])
        else:
            return Normalize([0, 0, 0], std)
    else:
        if no_std_norm:
            return Normalize(mean, [1, 1, 1])
        else:
            return Normalize(mean, std)


def get_train_utils(opt, model_parameters):
    assert opt.train_crop in ['random', 'corner', 'center']
    spatial_transform = []
    if opt.train_crop == 'random':
        spatial_transform.append(
            RandomResizedCrop(
                opt.sample_size, (opt.train_crop_min_scale, 1.0),
                (opt.train_crop_min_ratio, 1.0 / opt.train_crop_min_ratio)))
    elif opt.train_crop == 'corner':
        scales = [1.0]
        scale_step = 1 / (2**(1 / 4))
        for _ in range(1, 5):
            scales.append(scales[-1] * scale_step)
        spatial_transform.append(MultiScaleCornerCrop(opt.sample_size, scales))
    elif opt.train_crop == 'center':
        spatial_transform.append(Resize(opt.sample_size))
        spatial_transform.append(CenterCrop(opt.sample_size))
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    if not opt.no_hflip:
        spatial_transform.append(RandomHorizontalFlip())
    if opt.colorjitter:
        spatial_transform.append(ColorJitter())
    spatial_transform.append(ToTensor())
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.append(ScaleValue(opt.value_scale))
    spatial_transform.append(normalize)
    spatial_transform = Compose(spatial_transform)

    assert opt.train_t_crop in ['random', 'center']
    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    if opt.train_t_crop == 'random':
        temporal_transform.append(TemporalRandomCrop(opt.sample_duration))
    elif opt.train_t_crop == 'center':
        temporal_transform.append(TemporalCenterCrop(opt.sample_duration))
    temporal_transform = TemporalCompose(temporal_transform)

    train_data = get_training_data(opt.video_path, opt.annotation_path,
                                   opt.dataset, opt.input_type, opt.file_type,
                                   spatial_transform, temporal_transform)
    if opt.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_data)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=opt.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=opt.n_threads,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               worker_init_fn=worker_init_fn)

    if opt.is_master_node:
        train_logger = Logger(opt.result_path / 'train.log',
                              ['epoch', 'loss', 'acc', 'lr'])
        train_batch_logger = Logger(
            opt.result_path / 'train_batch.log',
            ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr'])
    else:
        train_logger = None
        train_batch_logger = None

    if opt.nesterov:
        dampening = 0
    else:
        dampening = opt.dampening
    optimizer = SGD(model_parameters,
                    lr=opt.learning_rate,
                    momentum=opt.momentum,
                    dampening=dampening,
                    weight_decay=opt.weight_decay,
                    nesterov=opt.nesterov)

    assert opt.lr_scheduler in ['plateau', 'multistep']
    assert not (opt.lr_scheduler == 'plateau' and opt.no_val)
    if opt.lr_scheduler == 'plateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(
            optimizer, 'min', patience=opt.plateau_patience)
    else:
        scheduler = lr_scheduler.MultiStepLR(optimizer,
                                             opt.multistep_milestones)

    return (train_loader, train_sampler, train_logger, train_batch_logger,
            optimizer, scheduler)


def get_val_utils(opt):
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = [
        Resize(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor()
    ]
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        TemporalEvenCrop(opt.sample_duration, opt.n_val_samples))
    temporal_transform = TemporalCompose(temporal_transform)

    val_data, collate_fn = get_validation_data(opt.video_path,
                                               opt.annotation_path, opt.dataset,
                                               opt.input_type, opt.file_type,
                                               spatial_transform,
                                               temporal_transform)
    if opt.distributed:
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_data, shuffle=False)
    else:
        val_sampler = None
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=(opt.batch_size //
                                                         opt.n_val_samples),
                                             shuffle=False,
                                             num_workers=opt.n_threads,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             worker_init_fn=worker_init_fn,
                                             collate_fn=collate_fn)

    if opt.is_master_node:
        val_logger = Logger(opt.result_path / 'val.log',
                            ['epoch', 'loss', 'acc'])
    else:
        val_logger = None

    return val_loader, val_logger


def get_inference_utils(opt):
    assert opt.inference_crop in ['center', 'nocrop']

    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)

    spatial_transform = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_transform.append(CenterCrop(opt.sample_size))
    spatial_transform.append(ToTensor())
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        SlidingWindow(opt.sample_duration, opt.inference_stride))
    temporal_transform = TemporalCompose(temporal_transform)

    inference_data, collate_fn = get_inference_data(
        opt.video_path, opt.annotation_path, opt.dataset, opt.input_type,
        opt.file_type, opt.inference_subset, spatial_transform,
        temporal_transform)

    inference_loader = torch.utils.data.DataLoader(
        inference_data,
        batch_size=opt.inference_batch_size,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True,
        worker_init_fn=worker_init_fn,
        collate_fn=collate_fn)

    return inference_loader, inference_data.class_names


def save_checkpoint(save_file_path, epoch, arch, model, optimizer, scheduler):
    if hasattr(model, 'module'):
        model_state_dict = model.module.state_dict()
    else:
        model_state_dict = model.state_dict()
    save_states = {
        'epoch': epoch,
        'arch': arch,
        'state_dict': model_state_dict,
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict()
    }
    torch.save(save_states, save_file_path)


def main_worker(index, opt):
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    model = generate_model(opt)
    model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference_custom.inference(inference_loader, model, inference_result_path,
                                   inference_class_names, opt.inference_no_average,
                                   opt.output_topk, opt)


if __name__ == '__main__':
    opt = get_opt()
    main_worker(-1, opt)
inference_custom.py
import time
import json
from collections import defaultdict

import torch
import torch.nn.functional as F
from PIL import Image
import cv2
import numpy as np

from utils import AverageMeter
from spatial_transforms import (Compose, Normalize, Resize, CenterCrop,
                                ToTensor, ScaleValue, PickFirstChannels)


def get_video_results(outputs, class_names, output_topk):
    sorted_scores, locs = torch.topk(outputs,
                                     k=min(output_topk, len(class_names)))

    video_results = []
    for i in range(sorted_scores.size(0)):
        video_results.append({
            'label': class_names[locs[i].item()],
            'score': sorted_scores[i].item()
        })

    return video_results


def get_normalize_method(mean, std, no_mean_norm, no_std_norm):
    if no_mean_norm:
        if no_std_norm:
            return Normalize([0, 0, 0], [1, 1, 1])
        else:
            return Normalize([0, 0, 0], std)
    else:
        if no_std_norm:
            return Normalize(mean, [1, 1, 1])
        else:
            return Normalize(mean, std)


def inference(data_loader, model, result_path, class_names, no_average,
              output_topk, opt):
    print('inference')

    # Build the same spatial transform that the inference data loader uses.
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_transform.append(CenterCrop(opt.sample_size))
    spatial_transform.append(ToTensor())
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    model.eval()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    results = {'results': defaultdict(list)}

    end_time = time.time()
    result_list1 = []
    result_list2 = []

    with torch.no_grad():
        for i, (inputs, targets) in enumerate(data_loader):
            data_time.update(time.time() - end_time)

            # Load 16 frames by hand instead of going through the data loader.
            video = []
            for frame_idx in range(1, 17):
                img = Image.open(
                    "../UCF101_images/UCF101/ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01/image_{:05}.jpg".format(frame_idx))
                video.append(img.convert('RGB'))

            # single data
            spatial_transform.randomize_parameters()
            clip = [spatial_transform(img) for img in video]
            clips = torch.unsqueeze(torch.stack(clip, 0).permute(1, 0, 2, 3), 0)

            # multiple data
            clip1 = [spatial_transform(img) for img in video]
            clips_ = torch.stack([torch.stack(clip1, 0).permute(1, 0, 2, 3),
                                  torch.stack(clip1, 0).permute(1, 0, 2, 3),
                                  torch.stack(clip1, 0).permute(1, 0, 2, 3)], dim=0)

            # Align the loader input with the custom clip: keep only the first
            # 16-frame segment and restore the batch dimension.
            inputs = torch.unsqueeze(inputs[0], 0)

            # (Optional) visualize the loader input and the custom clip side by side.
            # for i, j, k, l, m, n in zip(inputs[0][0], inputs[0][1], inputs[0][2],
            #                             clips[0][0], clips[0][1], clips[0][2]):
            #     ii = i.cpu().detach().numpy()
            #     jj = j.cpu().detach().numpy()
            #     kk = k.cpu().detach().numpy()
            #     ll = l.cpu().detach().numpy()
            #     mm = m.cpu().detach().numpy()
            #     nn = n.cpu().detach().numpy()
            #     cv2.imshow("input1", ii)
            #     cv2.imshow("input2", jj)
            #     cv2.imshow("input3", kk)
            #     cv2.imshow("clip1", ll)
            #     cv2.imshow("clip2", mm)
            #     cv2.imshow("clip3", nn)
            #     cv2.waitKey(0)

            # Inference on the data loader's input.
            video_ids, segments = zip(*targets)
            outputs = model(inputs)
            outputs = F.softmax(outputs, dim=1).cpu()

            for j in range(outputs.size(0)):
                results['results'][video_ids[j]].append({
                    'segment': segments[j],
                    'output': outputs[j]
                })

            sorted_scores, locs = torch.topk(torch.mean(outputs, dim=0),
                                             k=min(output_topk, len(class_names)))
            result_list1.append([sorted_scores.item(), locs.item()])
            print(result_list1)

            # Inference on the custom clip built above.
            outputs = model(clips)
            outputs = F.softmax(outputs, dim=1).cpu()
            sorted_scores, locs = torch.topk(torch.mean(outputs, dim=0),
                                             k=min(output_topk, len(class_names)))
            result_list2.append([sorted_scores.item(), locs.item()])
            print(result_list2)

            break  # only the first video is needed for this check

            batch_time.update(time.time() - end_time)
            end_time = time.time()

            print('[{}/{}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format(
                      i + 1,
                      len(data_loader),
                      batch_time=batch_time,
                      data_time=data_time))

    inference_results = {'results': {}}
    if not no_average:
        for video_id, video_results in results['results'].items():
            video_outputs = [
                segment_result['output'] for segment_result in video_results
            ]
            video_outputs = torch.stack(video_outputs)
            average_scores = torch.mean(video_outputs, dim=0)
            inference_results['results'][video_id] = get_video_results(
                average_scores, class_names, output_topk)

    with result_path.open('w') as f:
        json.dump(inference_results, f)
Run inference
python3 main_inference.py
Output
[[0.8490234613418579, 1]]
[[0.8490234613418579, 1]]
Closing
This time, we verified the inference program's results and recapped the previous articles.
With this, a specific set of 16 frames can be fed to the model for inference without going through the data loader.
It is still inconvenient in this form, so next time the program will be consolidated into a single file.