物体の三次元姿勢推定 CenterSnap -bop_toolkitでアノテーションデータ作成- 【Python】

はじめに
前提条件
アノテーションデータの作成
プログラム説明
おわりに

はじめに

前回は bop_toolkit を使用して、マスクデータを作成しました。

今回は、アノテーションデータを作成していきます。

前提条件

前提条件は以下の通りです。

Windows11 (三次元モデルの準備にのみ使用)
Ubuntu22 (モデル準備以降に使用)
Python3.10.x
CloudCompare
open3d == 0.16.0
こちらの記事を参考に三次元モデルを作成していること
シーンの作成が完了していること
こちらの記事を参考に bop_toolkit_lib のインストールとプログラムの修正が完了していること
マスクデータの作成が完了していること

アノテーションデータの作成

scene_gt_info.json を作成するためのプログラムを作成していきます。

calc_gt_info.py

# Author: Tomas Hodan (hodantom@cmp.felk.cvut.cz)
# Center for Machine Perception, Czech Technical University in Prague

"""Calculates visibility, 2D bounding boxes etc. for the ground-truth poses.

See docs/bop_datasets_format.md for documentation of the calculated info.

The info is saved in folder "{train,val,test}_gt_info" in the main folder of the
selected dataset.
"""

import os
import numpy as np

from bop_toolkit_lib import config
from bop_toolkit_lib import dataset_params
from bop_toolkit_lib import inout
from bop_toolkit_lib import misc
from bop_toolkit_lib import renderer
from bop_toolkit_lib import visibility


# PARAMETERS.
################################################################################
# Runtime configuration of the script; every option below is read through `p`.
p = {
  # Name of the dataset to process (dataset_params.py lists the options).
  "dataset": "lm",

  # Split of the dataset to process: 'train', 'val' or 'test'.
  "dataset_split": "train",

  # Variant of the split; None selects the default (see dataset_params.py).
  "dataset_split_type": "pbr",

  # Set to True to additionally save visualizations of the visibility masks.
  "vis_visibility_masks": False,

  # Tolerance of the visibility test, in [mm].
  "delta": 15,

  # Renderer backend: 'vispy', 'cpp' or 'python'.
  "renderer_type": "vispy",

  # Root folder that holds the BOP datasets.
  "datasets_path": "/path/to/makeNOCS/output_data/bop_data",

  # Path template of the saved visibility-mask visualizations; it is filled in
  # with the scene, image and ground-truth annotation IDs. The upstream
  # default, which was rooted at config.output_path and also encoded the delta,
  # dataset and split, is replaced here by a fixed folder next to the dataset.
  "vis_mask_visib_tpath": os.path.join(
    "/path/to/makeNOCS/output_data/bop_data",
    "visibility_mask",
    "{scene_id:06d}",
    "{im_id:06d}_{gt_id:06d}.jpg",
  ),
}
################################################################################


# The visualization module is needed only when mask visualizations are saved.
if p['vis_visibility_masks']:
  from bop_toolkit_lib import visualization

# Parameters of the selected dataset split.
dp_split = dataset_params.get_split_params(
  p['datasets_path'],
  p['dataset'],
  p['dataset_split'],
  p['dataset_split_type'],
)

# Parameters of the object models. 'tless' is the only dataset for which a
# model type ('cad') is requested explicitly; all others use the default.
model_type = 'cad' if p['dataset'] == 'tless' else None
dp_model = dataset_params.get_model_params(
  p['datasets_path'],
  p['dataset'],
  model_type,
)

# Set up the renderer.
misc.log('Initializing renderer...')

# The render canvas is three times the image size in each dimension, and the
# principal-point offsets equal one image width/height. The extra margin is
# there so that masks of truncated objects can be generated.
im_width, im_height = dp_split['im_size']
ren_cx_offset, ren_cy_offset = im_width, im_height
ren_width = 3 * im_width
ren_height = 3 * im_height
ren = renderer.create_renderer(
  ren_width, ren_height, p['renderer_type'], mode='depth')

# Register every object model of the dataset with the renderer.
for obj_id in dp_model['obj_ids']:
  model_fpath = dp_model['model_tpath'].format(obj_id=obj_id)
  ren.add_object(obj_id, model_fpath)

# Calculate and save the ground-truth info of every scene present in the split.
scene_ids = dataset_params.get_present_scene_ids(dp_split)
for scene_id in scene_ids:
  # Load scene info and ground-truth poses.
  scene_camera = inout.load_scene_camera(
    dp_split['scene_camera_tpath'].format(scene_id=scene_id))
  scene_gt = inout.load_scene_gt(
    dp_split['scene_gt_tpath'].format(scene_id=scene_id))

  # Maps an image ID to a list with one info dict per GT annotation.
  scene_gt_info = {}
  im_ids = sorted(scene_gt.keys())
  for im_counter, im_id in enumerate(im_ids):
    # Report progress every 100 images.
    if im_counter % 100 == 0:
      misc.log(
        'Calculating GT info - dataset: {} ({}, {}), scene: {}, im: {}'.format(
          p['dataset'], p['dataset_split'], p['dataset_split_type'], scene_id,
          im_id))

    # Load depth image (fall back to a .png file if the .tif one is missing).
    depth_fpath = dp_split['depth_tpath'].format(scene_id=scene_id, im_id=im_id)
    if not os.path.exists(depth_fpath):
      depth_fpath = depth_fpath.replace('.tif', '.png')
    depth = inout.load_depth(depth_fpath)
    depth *= scene_camera[im_id]['depth_scale']  # Convert to [mm].

    K = scene_camera[im_id]['cam_K']
    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
    im_size = (depth.shape[1], depth.shape[0])

    # Distance image of the scene. It depends only on the depth image and the
    # intrinsics, so it is computed once per image instead of once per
    # GT annotation.
    dist_im = misc.depth_im_to_dist_im_fast(depth, K)

    scene_gt_info[im_id] = []
    for gt_id, gt in enumerate(scene_gt[im_id]):
      # Render depth image of the object model in the ground-truth pose. The
      # principal point is shifted so that the image sits inside the larger
      # canvas, and the image-sized window is cropped out afterwards.
      depth_gt_large = ren.render_object(
        gt['obj_id'], gt['cam_R_m2c'], gt['cam_t_m2c'],
        fx, fy, cx + ren_cx_offset, cy + ren_cy_offset)['depth']
      depth_gt = depth_gt_large[
                   ren_cy_offset:(ren_cy_offset + im_height),
                   ren_cx_offset:(ren_cx_offset + im_width)]

      # Convert the rendered depth image to a distance image.
      dist_gt = misc.depth_im_to_dist_im_fast(depth_gt, K)

      # Estimation of the visibility mask.
      visib_gt = visibility.estimate_visib_mask_gt(
        dist_im, dist_gt, p['delta'], visib_mode='bop19')

      # Mask of the object in the GT pose (on the large canvas and cropped to
      # the image).
      obj_mask_gt_large = depth_gt_large > 0
      obj_mask_gt = dist_gt > 0

      # Number of pixels in the whole object silhouette
      # (even in the truncated part).
      px_count_all = np.sum(obj_mask_gt_large)

      # Number of pixels in the object silhouette with a valid depth measurement
      # (i.e. with a non-zero value in the depth image).
      px_count_valid = np.sum(dist_im[obj_mask_gt] > 0)

      # Number of pixels in the visible part of the object silhouette.
      px_count_visib = visib_gt.sum()

      # Visible surface fraction.
      if px_count_all > 0:
        visib_fract = px_count_visib / float(px_count_all)
      else:
        visib_fract = 0.0

      # Bounding box of the whole object silhouette
      # (including the truncated part).
      # NOTE: this is gated on the visible pixel count, so an object without
      # any visible pixel keeps the [-1, -1, -1, -1] placeholder.
      bbox = [-1, -1, -1, -1]
      if px_count_visib > 0:
        ys, xs = obj_mask_gt_large.nonzero()
        # Shift from canvas coordinates back to image coordinates.
        ys -= ren_cy_offset
        xs -= ren_cx_offset
        bbox = misc.calc_2d_bbox(xs, ys, im_size)

      # Bounding box of the visible surface part.
      bbox_visib = [-1, -1, -1, -1]
      if px_count_visib > 0:
        ys, xs = visib_gt.nonzero()
        bbox_visib = misc.calc_2d_bbox(xs, ys, im_size)

      # Store the calculated info (cast to built-in types before saving as
      # JSON).
      scene_gt_info[im_id].append({
        'px_count_all': int(px_count_all),
        'px_count_valid': int(px_count_valid),
        'px_count_visib': int(px_count_visib),
        'visib_fract': float(visib_fract),
        'bbox_obj': [int(e) for e in bbox],
        'bbox_visib': [int(e) for e in bbox_visib]
      })

      # Visualization of the visibility mask.
      if p['vis_visibility_masks']:

        depth_im_vis = visualization.depth_for_vis(depth, 0.2, 1.0)
        depth_im_vis = np.dstack([depth_im_vis] * 3)

        # np.float64 replaces the np.float alias, which no longer exists in
        # NumPy >= 1.24. The mask is placed in the green channel.
        visib_gt_vis = visib_gt.astype(np.float64)
        zero_ch = np.zeros(visib_gt_vis.shape)
        visib_gt_vis = np.dstack([zero_ch, visib_gt_vis, zero_ch])

        # Blend the depth image and the mask, clipping the result at 1.
        vis = 0.5 * depth_im_vis + 0.5 * visib_gt_vis
        vis[vis > 1] = 1

        vis_path = p['vis_mask_visib_tpath'].format(
          delta=p['delta'], dataset=p['dataset'], split=p['dataset_split'],
          scene_id=scene_id, im_id=im_id, gt_id=gt_id)
        misc.ensure_dir(os.path.dirname(vis_path))
        inout.save_im(vis_path, vis)

  # Save the info for the current scene.
  scene_gt_info_path = dp_split['scene_gt_info_tpath'].format(scene_id=scene_id)
  misc.ensure_dir(os.path.dirname(scene_gt_info_path))
  inout.save_json(scene_gt_info_path, scene_gt_info)

上記を実行すると、scene_gt_info.json が作成されます。

cd makeNOCS/bop_toolkit
python3 scripts/calc_gt_info.py

scene_gt_info.json

{
  "0": [{
         "bbox_obj": [185, 227, 85, 54], 
         "bbox_visib": [185, 227, 85, 54], 
         "px_count_all": 3182, 
         "px_count_valid": 3182, 
         "px_count_visib": 3151, 
         "visib_fract": 0.9902576995600252}, 
        {
         "bbox_obj": [189, 179, 65, 48], 
         "bbox_visib": [189, 179, 65, 48], 
         "px_count_all": 2014, 
         "px_count_valid": 2014, 
         "px_count_visib": 2004, 
         "visib_fract": 0.9950347567030785}, 
        {
         "bbox_obj": [265, 231, 58, 59], 
         "bbox_visib": [266, 231, 57, 59], 
         "px_count_all": 2367, 
         "px_count_valid": 2367, 
         "px_count_visib": 2317, 
         "visib_fract": 0.9788762146176595}, 
        {
         "bbox_obj": [232, 170, 50, 39], "bbox_visib": [235, 170, 47, 39], "px_count_all": 1193, "px_count_valid": 1193, "px_count_visib": 1083, "visib_fract": 0.9077954735959766}, {"bbox_obj": [292, 222, 64, 40], "bbox_visib": [294, 222, 62, 39], "px_count_all": 1908, "px_count_valid": 1908, "px_count_visib": 1591, "visib_fract": 0.8338574423480084}, {"bbox_obj": [377, 227, 42, 42], "bbox_visib": [377, 227, 42, 42], "px_count_all": 1057, "px_count_valid": 1057, "px_count_visib": 1041, "visib_fract": 0.9848628192999054}, {"bbox_obj": [261, 295, 90, 72], "bbox_visib": [261, 295, 90, 72], "px_count_all": 4071, "px_count_valid": 4071, "px_count_visib": 4048, "visib_fract": 0.9943502824858758}, {"bbox_obj": [445, 258, 53, 34], "bbox_visib": [446, 258, 52, 34], "px_count_all": 1279, "px_count_valid": 1279, "px_count_visib": 1153, "visib_fract": 0.9014855355746677}, {"bbox_obj": [404, 268, 63, 38], "bbox_visib": [404, 268, 63, 38], "px_count_all": 1664, "px_count_valid": 1664, "px_count_visib": 1647, "visib_fract": 0.9897836538461539}, {"bbox_obj": [373, 301, 74, 73], "bbox_visib": [373, 301, 74, 73], "px_count_all": 3288, "px_count_valid": 3288, "px_count_visib": 3277, "visib_fract": 0.996654501216545}],

0 は画像の番号 (im_id) です。今回は 75 枚出力するので、番号が 0 から始まる場合は 74 番まであるはずです。

bbox_obj … オブジェクト全体 (画像からはみ出した部分も含む) のバウンディングボックス (x, y, width, height)。(x, y) は左上の座標です。
bbox_visib … 視認できる部分のバウンディングボックス (x, y, width, height)
px_count_all … オブジェクトの総ピクセル数
px_count_valid … オブジェクトのシルエットのうち、深度画像に有効な値 (0 でない値) があるピクセル数
px_count_visib … 視認できるオブジェクトのピクセル数
visib_fract … 視認できるピクセル数 / 総ピクセル数

プログラム説明

前回と異なる点を説明していきます。

import os
import numpy as np

from bop_toolkit_lib import config
from bop_toolkit_lib import dataset_params
from bop_toolkit_lib import inout
from bop_toolkit_lib import misc
from bop_toolkit_lib import renderer
from bop_toolkit_lib import visibility


# PARAMETERS.
################################################################################
p = {
  # See dataset_params.py for options.
  'dataset': 'lm',

  # Dataset split. Options: 'train', 'val', 'test'.
  'dataset_split': 'train',

  # Dataset split type. None = default. See dataset_params.py for options.
  'dataset_split_type': "pbr",

  # Whether to save visualizations of visibility masks.
  'vis_visibility_masks': False,

  # Tolerance used in the visibility test [mm].
  'delta': 15,

  # Type of the renderer.
  'renderer_type': 'vispy',  # Options: 'vispy', 'cpp', 'python'.

  # Folder containing the BOP datasets.
  'datasets_path': "/path/to/makeNOCS/output_data/bop_data",

  # Path template for output images with object masks.
  # 'vis_mask_visib_tpath': os.path.join(
  #   config.output_path, 'vis_gt_visib_delta={delta}',
  #   'vis_gt_visib_delta={delta}', '{dataset}', '{split}', '{scene_id:06d}',
  #   '{im_id:06d}_{gt_id:06d}.jpg'),
  'vis_mask_visib_tpath': os.path.join(
    "/path/to/makeNOCS/output_data/bop_data","visibility_mask" ,'{scene_id:06d}',
    '{im_id:06d}_{gt_id:06d}.jpg'),
}
################################################################################

前回と異なる点は、vis_visibility_masks です。これは、視認できる部分のマスクを深度画像に重ねた可視化画像を保存するかどうかを選択します。

CenterSnap では使用しません。

# Estimation of the visibility mask.
visib_gt = visibility.estimate_visib_mask_gt(
dist_im, dist_gt, p['delta'], visib_mode='bop19')

# Mask of the object in the GT pose.
obj_mask_gt_large = depth_gt_large > 0
obj_mask_gt = dist_gt > 0

estimate_visib_mask_gt で、視認できる部分のマスク画像を作成します。

depth_gt_large > 0 及び dist_gt > 0 → 画素値がある部分を True にします。

ここで、depth_gt_large は正解姿勢のオブジェクト単体を、画像の 3 倍サイズのキャンバスにレンダリングした深度画像です。dist_gt は、その深度画像から元の画像サイズ分を切り出したもの (depth_gt) を距離画像に変換したものです。一方、dist_im はシーンの深度画像 (depth) を距離画像に変換したものです。

# Number of pixels in the whole object silhouette
# (even in the truncated part).
px_count_all = np.sum(obj_mask_gt_large)

対象のオブジェクトの総ピクセル数を算出します。

# Number of pixels in the object silhouette with a valid depth measurement
# (i.e. with a non-zero value in the depth image).
px_count_valid = np.sum(dist_im[obj_mask_gt] > 0)

オブジェクトのシルエット (画像内に収まっている部分) のうち、シーンの深度画像に有効な値 (0 より大きい値) があるピクセル数です。オブジェクトが画像からはみ出しておらず、深度画像に欠損がなければ、px_count_all = px_count_valid になります。

# Number of pixels in the visible part of the object silhouette.
px_count_visib = visib_gt.sum()

視認できるオブジェクトのシルエット部分のピクセル数を算出します。

# Visible surface fraction.
if px_count_all > 0:
  visib_fract = px_count_visib / float(px_count_all)
else:
  visib_fract = 0.0

視認できるピクセル数 / 総ピクセル数を計算します。

# Bounding box of the whole object silhouette
# (including the truncated part).
bbox = [-1, -1, -1, -1]
if px_count_visib > 0:
  ys, xs = obj_mask_gt_large.nonzero()
  ys -= ren_cy_offset
  xs -= ren_cx_offset
  bbox = misc.calc_2d_bbox(xs, ys, im_size)

オブジェクトのバウンディングボックスを算出します。

# Bounding box of the visible surface part.
bbox_visib = [-1, -1, -1, -1]
if px_count_visib > 0:
  ys, xs = visib_gt.nonzero()
  bbox_visib = misc.calc_2d_bbox(xs, ys, im_size)

オブジェクトの視認できる部分のバウンディングボックスを算出します。

# Store the calculated info.
scene_gt_info[im_id].append({
  'px_count_all': int(px_count_all),
  'px_count_valid': int(px_count_valid),
  'px_count_visib': int(px_count_visib),
  'visib_fract': float(visib_fract),
  'bbox_obj': [int(e) for e in bbox],
  'bbox_visib': [int(e) for e in bbox_visib]
})

算出した項目を json 形式でまとめていきます。

残りのプログラムは作成した json を保存するだけなので、割愛します。