Introduction
In the previous article, we explained how to train the AutoEncoder.
In this article, we explain how to create the intermediate files needed to train CenterSnap.
Prerequisites
The prerequisites are as follows.
- Windows 11 (used only to prepare the 3D model)
- Ubuntu 22 (used for everything after model preparation)
- Python 3.10.x
- CloudCompare
- open3d == 0.16.0
- You have created the 3D model by following this article
- You have finished creating the scene
- You have installed bop_toolkit_lib and applied the program fixes by following this article
- You have finished creating the mask data
- You have finished creating the annotation data
- You have finished creating the object model information
- You have finished creating the ShapeNetCore HDF5 file
- You have finished creating the intermediate files for object-deformnet
Installing CenterSnap
Now let's install CenterSnap.
cd makeNOCS
git clone https://github.com/zubair-irshad/CenterSnap.git
cd CenterSnap
Edit requirements.txt so that it matches the following.
open3d
absl-py==0.13.0
backcall==0.2.0
boto3==1.18.1
botocore==1.21.1
cachetools==4.2.2
certifi==2021.5.30
charset-normalizer==2.0.3
click==8.0.1
colour==0.1.5
configparser==5.0.2
cycler==0.10.0
decorator==4.4.2
docker-pycreds==0.4.0
future==0.18.2
gitdb==4.0.7
GitPython==3.1.18
google-auth==1.33.0
google-auth-oauthlib==0.4.4
greenlet==1.1.0
grpcio==1.38.1
idna==3.2
#imageio==2.9.0
ipython==7.25.0
ipython-genutils==0.2.0
jedi==0.18.0
jmespath==0.10.0
kiwisolver==1.3.1
Markdown==3.3.4
matplotlib==3.1.3
matplotlib-inline==0.1.2
msgpack==1.0.2
networkx==2.5.1
numpy==1.23.5
oauthlib==3.1.1
#opencv-python==4.5.3.56
parso==0.8.2
pathtools==0.1.2
pexpect==4.8.0
pickleshare==0.7.5
#Pillow==8.3.1
promise==2.3
prompt-toolkit==3.0.19
protobuf==3.17.3
psutil==5.8.0
ptyprocess==0.7.0
pyasn1==0.4.8
pyasn1-modules==0.2.8
Pygments==2.9.0
pynvim==0.4.3
pyparsing==2.4.7
python-dateutil==2.8.2
pytorch-lightning==0.7.5
PyWavelets==1.1.1
PyYAML==5.4.1
requests==2.26.0
requests-oauthlib==1.3.0
rsa==4.7.2
s3transfer==0.5.0
scikit-image==0.18.2
#scipy==1.7.0
sentry-sdk==1.3.0
shortuuid==1.0.1
six==1.16.0
smmap==4.0.0
subprocess32==3.5.4
tensorboard==2.5.0
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.0
tifffile==2021.7.2
fvcore
#torch==1.7.1
#torchvision==0.8.2
tqdm==4.61.2
#traitlets==5.0.5
urllib3==1.26.6
wandb==0.11.0
wcwidth==0.2.5
#Werkzeug==2.0.1
ydiff==1.2
zstandard==0.15.2
plotly==5.6.0
argparse
h5py
python3 -m pip install -r requirements.txt
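Once the install finishes, you can optionally confirm which open3d build was resolved (this check is only a suggestion; the version string depends on what pip installed):
python3 -c "import open3d; print(open3d.__version__)"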
Also, create a symbolic link for CUDA.
sudo ln -s /usr/lib/wsl/lib/libcuda.so.1 /usr/local/cuda/lib64/libcuda.so
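To make sure PyTorch can actually see the GPU through this link, a quick check (assuming torch was already installed in the earlier articles) is:
python3 -c "import torch; print(torch.cuda.is_available())"
If this prints False, re-check the symlink path and your WSL CUDA setup.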
Creating the Intermediate Files
Next, let's create the intermediate files.
touch prepare_data/generate_data_nocs_custom.py
generate_data_nocs_custom.py
import os
import pathlib
import math
import glob
import cv2
import numpy as np
import torch
import torchvision.transforms as transforms
import _pickle as cPickle
from tqdm import tqdm
from utils.nocs_utils import load_depth, align_nocs_to_depth
from simnet.lib import camera
from simnet.lib import transform
from simnet.lib import datapoint
from simnet.lib.net.pre_processing import obb_inputs
from simnet.lib.depth_noise import DepthManager
from simnet.lib.net.models.auto_encoder import PointCloudAE
from PIL import Image
import argparse
import h5py
import json
from matplotlib import pyplot as plt
def create_img_list(data_dir):
""" Create train/val/test data list for CAMERA and Real. """
# CAMERA dataset
for subset in ['train', 'val']:
img_list = []
img_dir = os.path.join(data_dir, 'CAMERA', subset)
file_list = glob.glob(img_dir+"/000000/color/*.jpg")
for f in file_list:
basename = os.path.basename(f)
img_path = os.path.join(subset, "000000/color", basename)
img_list.append(img_path)
with open(os.path.join(data_dir, 'CAMERA', subset+'_list_all.txt'), 'w') as f:
for img_path in img_list:
f.write("%s\n" % img_path)
# Real dataset
for subset in ['train', 'test']:
img_list = []
img_dir = os.path.join(data_dir, 'Real', subset)
file_list = glob.glob(img_dir+"/000000/color/*.jpg")
for f in file_list:
basename = os.path.basename(f)
img_path = os.path.join(subset, "000000/color", basename)
img_list.append(img_path)
with open(os.path.join(data_dir, 'Real', subset+'_list_all.txt'), 'w') as f:
for img_path in img_list:
f.write("%s\n" % img_path)
print('Write all data paths to file done!')
def align_rotation(R):
""" Align rotations for symmetric objects.
Args:
sRT: 4 x 4
"""
theta_x = R[0, 0] + R[2, 2]
theta_y = R[0, 2] - R[2, 0]
r_norm = math.sqrt(theta_x**2 + theta_y**2)
s_map = np.array([[theta_x/r_norm, 0.0, -theta_y/r_norm],
[0.0, 1.0, 0.0 ],
[theta_y/r_norm, 0.0, theta_x/r_norm]])
rotation = R @ s_map
return rotation
def process_data(img_path, depth, filepath_dict):
""" Load instance masks for the objects in the image. """
# mask_path = img_path + '_mask.png'
mask_path = filepath_dict["mask"]
mask = cv2.imread(mask_path)[:, :, 2]
mask = np.array(mask, dtype=np.int32)
all_inst_ids = sorted(list(np.unique(mask)))
assert all_inst_ids[-1] == 255
del all_inst_ids[-1] # remove background
num_all_inst = len(all_inst_ids)
meta_path = filepath_dict["meta"]
with open(meta_path, 'r') as f:
i = 0
for line in f:
i += 1
num_all_inst = i
h, w = mask.shape
# coord_path = img_path + '_coord.png'
coord_path = filepath_dict["coord"]
with h5py.File(coord_path, "r") as h5:
coord_map = h5["nocs"]
coord_map = coord_map[:, :, :3]
# print(coord_map.shape)
# coord_map = cv2.imread(coord_path)[:, :, :3]
coord_map = coord_map[:, :, (2, 1, 0)]
# flip z axis of coord map
coord_map = np.array(coord_map, dtype=np.float32)# / 255
coord_map[:, :, 2] = 1 - coord_map[:, :, 2]
class_ids = []
instance_ids = []
model_list = []
masks = np.zeros([h, w, num_all_inst], dtype=np.uint8)
coords = np.zeros((h, w, num_all_inst, 3), dtype=np.float32)
bboxes = np.zeros((num_all_inst, 4), dtype=np.int32)
# meta_path = img_path + '_meta.txt'
meta_path = filepath_dict["meta"]
with open(meta_path, 'r') as f:
i = 0
j = 0
lock = False
for line in f:
line_info = line.strip().split(' ')
inst_id = int(line_info[0])
cls_id = int(line_info[1])
# # background objects and non-existing objects
# if cls_id == 0 or (inst_id not in all_inst_ids):
# continue
if len(line_info) == 3:
model_id = line_info[2] # Real scanned objs
else:
model_id = line_info[3] # CAMERA objs
temp_mask_path = filepath_dict["temp_mask"] + "_{:06}".format(j)+".png"
temp_mask = cv2.imread(temp_mask_path, 0)
_, temp_mask = cv2.threshold(temp_mask, 127, 255, cv2.THRESH_BINARY)
j += 1
# process foreground objects
# inst_mask = np.equal(mask, inst_id)
# inst_mask = np.equal(temp_mask, 1)
# plt.imshow(temp_mask)
# plt.show()
# bounding box
# horizontal_indicies = np.where(np.any(inst_mask, axis=0))[0]
# vertical_indicies = np.where(np.any(inst_mask, axis=1))[0]
horizontal_indicies = np.where(np.any(temp_mask, axis=0))[0]
vertical_indicies = np.where(np.any(temp_mask, axis=1))[0]
# assert horizontal_indicies.shape[0], print(temp_mask_path)
# if horizontal_indicies.shape[0] and lock == False:
# lock = True
# else:
# lock = False
# continue
#assert horizontal_indicies.shape[0], print(img_path)
# print(horizontal_indicies.shape[0])
if horizontal_indicies.shape[0] == 0:
continue
x1, x2 = horizontal_indicies[[0, -1]]
y1, y2 = vertical_indicies[[0, -1]]
# x2 and y2 should not be part of the box. Increment by 1.
x2 += 1
y2 += 1
# object occupies full image, rendering error, happens in CAMERA dataset
if np.any(np.logical_or((x2-x1) > 600, (y2-y1) > 440)):
print("occupies")
return None, None, None, None, None, None
# not enough valid depth observation
# final_mask = np.logical_and(inst_mask, depth > 0)
final_mask = np.logical_and(temp_mask, depth > 0)
if np.sum(final_mask) < 64:
print("summmmmmmmmmmmmmmmmmm")
continue
class_ids.append(cls_id)
instance_ids.append(inst_id)
model_list.append(model_id)
# masks[:, :, i] = inst_mask
# coords[:, :, i, :] = np.multiply(coord_map, np.expand_dims(inst_mask, axis=-1))
masks[:, :, i] = temp_mask
coords[:, :, i, :] = np.multiply(coord_map, np.expand_dims(temp_mask, axis=-1))
bboxes[i] = np.array([y1, x1, y2, x2])
i += 1
# no valid foreground objects
if i == 0:
return None, None, None, None, None, None
masks = masks[:, :, :i]
coords = np.clip(coords[:, :, :i, :], 0, 1)
bboxes = bboxes[:i, :]
return masks, coords, class_ids, instance_ids, model_list, bboxes
def annotate_camera_train(data_dir, estimator):
DATASET_NAME = 'NOCS_Data'
DATASET_DIR = pathlib.Path(f'data/{DATASET_NAME}')
DATASET_DIR.mkdir(parents=True, exist_ok=True)
_DATASET = datapoint.make_dataset(f'file://{DATASET_DIR}/CAMERA/train')
_camera = camera.NOCS_Camera()
""" Generate gt labels for CAMERA train data. """
camera_train = open(os.path.join(data_dir, 'CAMERA', 'train_list_all.txt')).read().splitlines()
# intrinsics = np.array([[577.5, 0, 319.5], [0, 577.5, 239.5], [0, 0, 1]])
intrinsics = np.array([[572.411363389757, 0.0, 325.2611083984375], [0.0, 573.5704328585578, 242.04899588216654], [0.0, 0.0, 1.0]])
#TEST MODELS
obj_model_dir = os.path.join(data_dir, 'models_obj')
with open(os.path.join(obj_model_dir, 'camera_train.pkl'), 'rb') as f:
obj_models = cPickle.load(f)
model_sizes = {}
for key in obj_models.keys():
model_sizes[key] = 2 * np.amax(np.abs(obj_models[key]), axis=0)
# i = len(camera_train)//20
# num = 11
# camera_train = camera_train[(num)*i:(num+1)*i]
# get scene info
gt_info_path = os.path.join(data_dir, 'CAMERA', "train/000000", "scene_gt.json")
with open(gt_info_path, "r") as f:
gt_info_data = json.load(f)
valid_img_list = []
for img_path in tqdm(camera_train):
# img_full_path = os.path.join(data_dir, 'CAMERA', img_path)
img_full_path = os.path.join(data_dir, 'CAMERA', "train/000000")
basename = os.path.basename(img_path)[:-4]
# get gt info data
gt_info = gt_info_data[str(int(basename))]
# depth_composed_path = img_path+'_composed.png'
depth_full_path = os.path.join(data_dir,'camera_full_depths', basename+".png")
#print("depth_full_path", depth_full_path)
all_exist = os.path.exists(img_full_path + '/color/'+ basename +'.jpg') and \
os.path.exists(img_full_path + '/coord/'+ str(int(basename)) +'.hdf5') and \
os.path.exists(img_full_path + '/depth/'+ basename +'.png') and \
os.path.exists(img_full_path + '/mask/'+ basename +'_mask.png') and \
os.path.exists(img_full_path + '/meta/'+ basename[2:] +'_meta.txt')
filepath_dict = {}
filepath_dict["color"] = img_full_path + '/color/'+ basename +'.jpg'
filepath_dict["coord"] = img_full_path + '/coord/'+ str(int(basename)) +'.hdf5'
filepath_dict["depth"] = img_full_path + '/depth/'+ basename +'.png'
filepath_dict["mask"] = img_full_path + '/mask/'+ basename +'_mask.png'
filepath_dict["temp_mask"] = img_full_path + '/mask_no_use/mask/' + basename
filepath_dict["meta"] = img_full_path + '/meta/'+ basename[2:] +'_meta.txt'
if not all_exist:
print("no exist", filepath_dict)
continue
depth = load_depth(depth_full_path)
masks, coords, class_ids, instance_ids, model_list, bboxes = process_data(img_full_path, depth, filepath_dict)
# print(masks.shape, coords.shape, class_ids, instance_ids, model_list, bboxes)
if instance_ids is None:
print("instance_ids", instance_ids)
continue
# Umeyama alignment of GT NOCS map with depth image
scales, rotations, translations, error_messages, _ = \
align_nocs_to_depth(masks, coords, depth, intrinsics, instance_ids, img_path, gt_info)
if error_messages:
print("error_messages")
continue
### GET CENTERPOINT DATAPOINTS
#get latent embeddings
model_points = [obj_models["00"+model_list[i]].astype(np.float32) for i in range(len(class_ids))]
latent_embeddings = get_latent_embeddings(model_points, estimator)
#get poses
abs_poses=[]
seg_mask = np.zeros([_camera.height, _camera.width])
masks_list = []
nocs_dir = os.path.join(data_dir)
nocs_path = os.path.join(nocs_dir, 'CAMERA/val/000000/gts', 'results_000000_{}.pkl'.format(img_path.split('/')[-1][2:-4]))
with open(nocs_path, 'rb') as f:
nocs = cPickle.load(f)
gt_class_ids = nocs['gt_class_ids']
gt_bboxes = nocs['gt_bboxes']
gt_sRT = nocs['gt_RTs'][1:]
gt_handle_visibility = nocs['gt_handle_visibility']
map_to_nocs = []
num_insts = len(instance_ids)
for i in range(num_insts):
for j in range(len(gt_class_ids)):
if gt_class_ids[j] != class_ids[i]:
continue
# if np.sum(np.abs(bboxes[i] - gt_bboxes[j])) > 5:
# continue
# match found
gt_match = j
break
# check match validity
assert gt_match > -1, print(img_path, instance_ids[i], 'no match for instance')
# assert gt_match not in map_to_nocs, print(img_path, instance_ids[i], 'duplicate match')
# map_to_nocs.append(gt_match)
map_to_nocs.append(i)
sizes = np.zeros((num_insts, 3))
poses = np.zeros((num_insts, 4, 4))
scales = np.zeros(num_insts)
rotations = np.zeros((num_insts, 3, 3))
translations = np.zeros((num_insts, 3))
for i in range(num_insts):
gt_idx = map_to_nocs[i]
sizes[i] = model_sizes["00"+model_list[i]]
sRT = gt_sRT[gt_idx]
s = np.cbrt(np.linalg.det(sRT[:3, :3]))
R = sRT[:3, :3] / s
T = sRT[:3, 3]
# # re-label mug category
# if class_ids[i] == 6:
# T0 = mug_meta[model_list[i]][0]
# s0 = mug_meta[model_list[i]][1]
# T = T - s * R @ T0
# s = s / s0
# used for test during training
scales[i] = s
rotations[i] = R
translations[i] = T
# used for evaluation
sRT = np.identity(4, dtype=np.float32)
sRT[:3, :3] = s * R
sRT[:3, 3] = T
poses[i] = sRT
for i in range(len(class_ids)):
R = rotations[i]
T = translations[i]
s = scales[i]
sym_ids = [0, 1, 3]
cat_id = np.array(class_ids)[i] - 1
if cat_id in sym_ids:
R = align_rotation(R)
scale_matrix = np.eye(4)
scale_mat = s*np.eye(3, dtype=float)
scale_matrix[0:3, 0:3] = scale_mat
camera_T_object = np.eye(4)
camera_T_object[:3,:3] = R
camera_T_object[:3,3] = T
seg_mask[masks[:,:,i] > 0] = np.array(class_ids)[i]
masks_list.append(masks[:,:,i])
abs_poses.append(transform.Pose(camera_T_object=camera_T_object, scale_matrix=scale_matrix))
obb_datapoint = obb_inputs.compute_nocs_network_targets(masks_list, latent_embeddings, abs_poses,_camera.height, _camera.width)
# color_img = cv2.imread(img_full_path + '_color.png')
color_img = cv2.imread(filepath_dict["color"])
colorjitter = transforms.ColorJitter(0.5, 0.3, 0.5, 0.05)
rgb_img = colorjitter(Image.fromarray(color_img))
jitter_img = colorjitter(rgb_img)
color_img = np.asarray(jitter_img)
depth_array = np.array(depth, dtype=np.float32)/255.0
DM = DepthManager()
noisy_depth = DM.prepare_depth_data(depth_array)
stereo_datapoint = datapoint.Stereo(left_color=color_img, right_color=noisy_depth)
panoptic_datapoint = datapoint.Panoptic(
stereo=stereo_datapoint,
depth=noisy_depth,
segmentation=seg_mask,
object_poses=[obb_datapoint],
boxes=[],
detections=[]
)
_DATASET.write(panoptic_datapoint)
### Finish writing datapoint
def annotate_real_train(data_dir, estimator):
DATASET_NAME = 'NOCS_Data'
DATASET_DIR = pathlib.Path(f'data/{DATASET_NAME}')
DATASET_DIR.mkdir(parents=True, exist_ok=True)
_DATASET = datapoint.make_dataset(f'file://{DATASET_DIR}/Real/train')
""" Generate gt labels for Real train data through PnP. """
_camera = camera.NOCS_Real()
real_train = open(os.path.join(data_dir, 'Real/train_list_all.txt')).read().splitlines()
# intrinsics = np.array([[591.0125, 0, 322.525], [0, 590.16775, 244.11084], [0, 0, 1]])
intrinsics = np.array([[572.411363389757, 0.0, 325.2611083984375], [0.0, 573.5704328585578, 242.04899588216654], [0.0, 0.0, 1.0]])
# scale factors for all instances
scale_factors = {}
path_to_size = glob.glob(os.path.join(data_dir, 'models_obj', '*_norm.txt'))
for inst_path in sorted(path_to_size):
instance = os.path.basename(inst_path).split('.')[0]
bbox_dims = np.loadtxt(inst_path)
scale_factors[instance[4:10]] = np.linalg.norm(bbox_dims)
#TEST MODELS
obj_model_dir = os.path.join(data_dir, 'models_obj')
with open(os.path.join(obj_model_dir, 'real_train.pkl'), 'rb') as f:
obj_models = cPickle.load(f)
# get scene info
# gt_info_path = os.path.join(data_dir, 'Real', "train/000000", "scene_gt.json")
# with open(gt_info_path, "r") as f:
# gt_info_data = json.load(f)
valid_img_list = []
# real_train = np.array(real_train)[np.array([2, 500, 1000, 1500, 1700, 1300, 2000, 2300, 2350, 2750])].tolist()
for img_path in tqdm(real_train):
# img_full_path = os.path.join(data_dir, 'Real', img_path)
img_full_path = os.path.join(data_dir, 'Real', "train/000000")
basename = os.path.basename(img_path)[:-4]
# # get gt info data
# gt_info = gt_info_data[str(int(basename))]
# depth_composed_path = img_path+'_composed.png'
depth_full_path = os.path.join(data_dir,'camera_full_depths', basename+".png")
#print("depth_full_path", depth_full_path)
all_exist = os.path.exists(img_full_path + '/color/'+ basename +'.jpg') and \
os.path.exists(img_full_path + '/coord/'+ str(int(basename)) +'.hdf5') and \
os.path.exists(img_full_path + '/depth/'+ basename +'.png') and \
os.path.exists(img_full_path + '/mask/'+ basename +'_mask.png') and \
os.path.exists(img_full_path + '/meta/'+ basename[2:] +'_meta.txt')
filepath_dict = {}
filepath_dict["color"] = img_full_path + '/color/'+ basename +'.jpg'
filepath_dict["coord"] = img_full_path + '/coord/'+ str(int(basename)) +'.hdf5'
filepath_dict["depth"] = img_full_path + '/depth/'+ basename +'.png'
filepath_dict["mask"] = img_full_path + '/mask/'+ basename +'_mask.png'
filepath_dict["temp_mask"] = img_full_path + '/mask_no_use/mask/' + basename
filepath_dict["meta"] = img_full_path + '/meta/'+ basename[2:] +'_meta.txt'
if not all_exist:
print("no exist", filepath_dict)
continue
depth = load_depth(depth_full_path)
masks, coords, class_ids, instance_ids, model_list, bboxes = process_data(img_full_path, depth, filepath_dict)
if instance_ids is None:
continue
# compute pose
num_insts = len(class_ids)
scales = np.zeros(num_insts)
rotations = np.zeros((num_insts, 3, 3))
translations = np.zeros((num_insts, 3))
for i in range(num_insts):
s = scale_factors["00"+model_list[i]]
mask = masks[:, :, i]
idxs = np.where(mask)
coord = coords[:, :, i, :]
coord_pts = s * (coord[idxs[0], idxs[1], :] - 0.5)
coord_pts = coord_pts[:, :, None]
img_pts = np.array([idxs[1], idxs[0]]).transpose()
img_pts = img_pts[:, :, None].astype(float)
distCoeffs = np.zeros((4, 1)) # no distoration
retval, rvec, tvec = cv2.solvePnP(coord_pts, img_pts, intrinsics, distCoeffs)
assert retval
R, _ = cv2.Rodrigues(rvec)
T = np.squeeze(tvec)
# # re-label for mug category
# if class_ids[i] == 6:
# T0 = mug_meta[model_list[i]][0]
# s0 = mug_meta[model_list[i]][1]
# T = T - s * R @ T0
# s = s / s0
scales[i] = s
rotations[i] = R
translations[i] = T
### GET CENTERPOINT DATAPOINTS
#get latent embeddings
model_points = [obj_models["obj_00"+model_list[i]].astype(np.float32) for i in range(len(class_ids))]
latent_embeddings = get_latent_embeddings(model_points, estimator)
#get poses
abs_poses=[]
class_num=1
seg_mask = np.zeros([_camera.height, _camera.width])
masks_list = []
for i in range(len(class_ids)):
R = rotations[i]
T = translations[i]
s = scales[i]
sym_ids = [0, 1, 3]
cat_id = np.array(class_ids)[i] - 1
if cat_id in sym_ids:
R = align_rotation(R)
scale_matrix = np.eye(4)
scale_mat = s*np.eye(3, dtype=float)
scale_matrix[0:3, 0:3] = scale_mat
camera_T_object = np.eye(4)
camera_T_object[:3,:3] = R
camera_T_object[:3,3] = T
seg_mask[masks[:,:,i] > 0] = np.array(class_ids)[i]
class_num += 1
masks_list.append(masks[:,:,i])
abs_poses.append(transform.Pose(camera_T_object=camera_T_object, scale_matrix=scale_matrix))
obb_datapoint = obb_inputs.compute_nocs_network_targets(masks_list, latent_embeddings, abs_poses,_camera.height, _camera.width)
# color_img = cv2.imread(img_full_path + '_color.png')
color_img = cv2.imread(filepath_dict["color"])
colorjitter = transforms.ColorJitter(0.8, 0.5, 0.5, 0.05)
rgb_img = colorjitter(Image.fromarray(color_img))
jitter_img = colorjitter(rgb_img)
color_img = np.asarray(jitter_img)
depth_array = np.array(depth, dtype=np.float32)/255.0
DM = DepthManager()
noisy_depth = DM.prepare_depth_data(depth_array)
stereo_datapoint = datapoint.Stereo(left_color=color_img, right_color=noisy_depth)
panoptic_datapoint = datapoint.Panoptic(
stereo=stereo_datapoint,
depth=depth_array,
segmentation=seg_mask,
object_poses=[obb_datapoint],
boxes=[],
detections=[]
)
_DATASET.write(panoptic_datapoint)
### Finish writing datapoint
def annotate_test_data(data_dir, estimator, source, subset):
""" Generate gt labels for test data.
Properly copy handle_visibility provided by NOCS gts.
"""
DATASET_NAME = 'NOCS_Data'
DATASET_DIR = pathlib.Path(f'data/{DATASET_NAME}')
DATASET_DIR.mkdir(parents=True, exist_ok=True)
_DATASET = datapoint.make_dataset(f'file://{DATASET_DIR}/{source}/{subset}')
_camera = camera.NOCS_Real()
camera_val = open(os.path.join(data_dir, 'CAMERA', 'val_list_all.txt')).read().splitlines()
real_test = open(os.path.join(data_dir, 'Real', 'test_list_all.txt')).read().splitlines()
# camera_intrinsics = np.array([[577.5, 0, 319.5], [0, 577.5, 239.5], [0, 0, 1]])
# real_intrinsics = np.array([[591.0125, 0, 322.525], [0, 590.16775, 244.11084], [0, 0, 1]])
camera_intrinsics = np.array([[572.411363389757, 0.0, 325.2611083984375], [0.0, 573.5704328585578, 242.04899588216654], [0.0, 0.0, 1.0]])
real_intrinsics = np.array([[572.411363389757, 0.0, 325.2611083984375], [0.0, 573.5704328585578, 242.04899588216654], [0.0, 0.0, 1.0]])
# compute model size
model_file_path = ['models_obj/camera_val.pkl', 'models_obj/real_test.pkl']
models = {}
for path in model_file_path:
with open(os.path.join(data_dir, path), 'rb') as f:
models.update(cPickle.load(f))
model_sizes = {}
for key in models.keys():
model_sizes[key] = 2 * np.amax(np.abs(models[key]), axis=0)
#TEST MODELS
obj_model_dir = os.path.join(data_dir, 'models_obj')
with open(os.path.join(obj_model_dir, 'real_test.pkl'), 'rb') as f:
obj_models = cPickle.load(f)
if source == 'CAMERA':
subset_meta = [('CAMERA', camera_val, camera_intrinsics, 'val')]
subset_temp = "val"
else:
subset_meta = [('Real', real_test, real_intrinsics, 'test')]
subset_temp = "test"
# subset_meta = [('CAMERA', camera_val, camera_intrinsics, 'val'), ('Real', real_test, real_intrinsics, 'test')]
for source, img_list, intrinsics, subset in subset_meta:
valid_img_list = []
# img_list = np.array(img_list)[np.array([2, 500, 1000, 1500, 1700, 1300, 2000, 2300, 2350, 2750])].tolist()
for img_path in tqdm(img_list):
# img_full_path = os.path.join(data_dir, source, img_path)
img_full_path = os.path.join(data_dir, source, subset_temp, "000000")
basename = os.path.basename(img_path)[:-4]
# # get gt info data
# gt_info = gt_info_data[str(int(basename))]
# depth_composed_path = img_path+'_composed.png'
depth_full_path_real = os.path.join(data_dir,'camera_full_depths', basename+".png")
# print("depth_full_path", depth_full_path)
all_exist = os.path.exists(img_full_path + '/color/'+ basename +'.jpg') and \
os.path.exists(img_full_path + '/coord/'+ str(int(basename)) +'.hdf5') and \
os.path.exists(img_full_path + '/depth/'+ basename +'.png') and \
os.path.exists(img_full_path + '/mask/'+ basename +'_mask.png') and \
os.path.exists(img_full_path + '/meta/'+ basename[2:] +'_meta.txt')
filepath_dict = {}
filepath_dict["color"] = img_full_path + '/color/'+ basename +'.jpg'
filepath_dict["coord"] = img_full_path + '/coord/'+ str(int(basename)) +'.hdf5'
filepath_dict["depth"] = img_full_path + '/depth/'+ basename +'.png'
filepath_dict["mask"] = img_full_path + '/mask/'+ basename +'_mask.png'
filepath_dict["temp_mask"] = img_full_path + '/mask_no_use/mask/' + basename
filepath_dict["meta"] = img_full_path + '/meta/'+ basename[2:] +'_meta.txt'
if not all_exist:
print("no exist", filepath_dict)
continue
# depth_full_path_real = img_full_path + '_depth.png'
depth = load_depth(depth_full_path_real)
masks, coords, class_ids, instance_ids, model_list, bboxes = process_data(img_full_path, depth, filepath_dict)
if instance_ids is None:
continue
num_insts = len(instance_ids)
# match each instance with NOCS ground truth to properly assign gt_handle_visibility
# nocs_dir = os.path.join(os.path.dirname(data_dir), 'results/nocs_results')
nocs_dir = os.path.join(data_dir)
if source == 'CAMERA':
nocs_path = os.path.join(nocs_dir, 'CAMERA/val/000000/gts', 'results_000000_{}.pkl'.format(img_path.split('/')[-1][2:-4]))
else:
nocs_path = os.path.join(nocs_dir, 'Real/test/000000/gts', 'results_000000_{}.pkl'.format(img_path.split('/')[-1][2:-4]))
with open(nocs_path, 'rb') as f:
nocs = cPickle.load(f)
gt_class_ids = nocs['gt_class_ids']
gt_bboxes = nocs['gt_bboxes']
gt_sRT = nocs['gt_RTs'][1:]
gt_handle_visibility = nocs['gt_handle_visibility']
map_to_nocs = []
for i in range(num_insts):
for j in range(len(gt_class_ids)):
if gt_class_ids[j] != class_ids[i]:
continue
# if np.sum(np.abs(bboxes[i] - gt_bboxes[j])) > 5:
# continue
# match found
gt_match = j
break
# check match validity
assert gt_match > -1, print(img_path, instance_ids[i], 'no match for instance')
# assert gt_match not in map_to_nocs, print(img_path, instance_ids[i], 'duplicate match')
# map_to_nocs.append(gt_match)
map_to_nocs.append(i)
# copy from ground truth, re-label for mug category
handle_visibility = gt_handle_visibility[map_to_nocs]
sizes = np.zeros((num_insts, 3))
poses = np.zeros((num_insts, 4, 4))
scales = np.zeros(num_insts)
rotations = np.zeros((num_insts, 3, 3))
translations = np.zeros((num_insts, 3))
for i in range(num_insts):
gt_idx = map_to_nocs[i]
sizes[i] = model_sizes["00"+model_list[i]]
sRT = gt_sRT[gt_idx]
s = np.cbrt(np.linalg.det(sRT[:3, :3]))
R = sRT[:3, :3] / s
T = sRT[:3, 3]
# # re-label mug category
# if class_ids[i] == 6:
# T0 = mug_meta[model_list[i]][0]
# s0 = mug_meta[model_list[i]][1]
# T = T - s * R @ T0
# s = s / s0
# used for test during training
scales[i] = s
rotations[i] = R
translations[i] = T
# used for evaluation
sRT = np.identity(4, dtype=np.float32)
sRT[:3, :3] = s * R
sRT[:3, 3] = T
poses[i] = sRT
### GET CENTERPOINT DATAPOINTS
#get latent embeddings
model_points = [models["00"+model_list[i]].astype(np.float32) for i in range(len(class_ids))]
latent_embeddings = get_latent_embeddings(model_points, estimator)
#get poses
abs_poses=[]
class_num=1
seg_mask = np.zeros([_camera.height, _camera.width])
masks_list = []
for i in range(len(class_ids)):
R = rotations[i]
T = translations[i]
s = scales[i]
sym_ids = [0, 1, 3]
cat_id = np.array(class_ids)[i] - 1
if cat_id in sym_ids:
R = align_rotation(R)
scale_matrix = np.eye(4)
scale_mat = s*np.eye(3, dtype=float)
scale_matrix[0:3, 0:3] = scale_mat
camera_T_object = np.eye(4)
camera_T_object[:3,:3] = R
camera_T_object[:3,3] = T
seg_mask[masks[:,:,i] > 0] = np.array(class_ids)[i]
class_num += 1
masks_list.append(masks[:,:,i])
abs_poses.append(transform.Pose(camera_T_object=camera_T_object, scale_matrix=scale_matrix))
obb_datapoint = obb_inputs.compute_nocs_network_targets(masks_list, latent_embeddings, abs_poses,_camera.height, _camera.width)
# color_img = cv2.imread(img_full_path + '_color.png')
color_img = cv2.imread(filepath_dict["color"])
depth_array = np.array(depth, dtype=np.float32)/255.0
DM = DepthManager()
noisy_depth = DM.prepare_depth_data(depth_array)
stereo_datapoint = datapoint.Stereo(left_color=color_img, right_color=noisy_depth)
panoptic_datapoint = datapoint.Panoptic(
stereo=stereo_datapoint,
depth=noisy_depth,
segmentation=seg_mask,
object_poses=[obb_datapoint],
boxes=[],
detections=[]
)
_DATASET.write(panoptic_datapoint)
### Finish writing datapoint
def get_latent_embeddings(point_clouds, estimator):
latent_embeddings =[]
for i in range(len(point_clouds)):
batch_xyz = torch.from_numpy(point_clouds[i]).to(device="cuda", dtype=torch.float)
batch_xyz = batch_xyz.unsqueeze(0)
emb, _ = estimator(batch_xyz)
emb = emb.squeeze(0).cpu().detach().numpy()
latent_embeddings.append(emb)
return latent_embeddings
if __name__ == '__main__':
data_dir = "/path/to/makeNOCS-raspi/makeNOCS/output_data/bop_data/lm"
emb_dim = 128
# emb_dim = 512
n_pts = 2048
# n_pts = 1024
n_cat = 1
model_path = os.path.join(data_dir, 'auto_encoder_model', 'model_100.pth')
estimator = PointCloudAE(emb_dim, n_pts)
estimator.cuda()
estimator.load_state_dict(torch.load(model_path))
estimator.eval()
print("Generating image lists")
create_img_list(data_dir)
print("Image lists generated...\n")
print("Generating Camera Train data...")
annotate_camera_train(data_dir, estimator)
print("Generating Real Train data...")
annotate_real_train(data_dir, estimator)
print("Generating Camera Val data...")
annotate_test_data(data_dir, estimator, 'CAMERA', 'val')
print("Generating Real Test data...")
annotate_test_data(data_dir, estimator, 'Real', 'test')
Run the script above.
cd makeNOCS/CenterSnap
python3 -m prepare_data.generate_data_nocs_custom
The binary data (zstd format) will be stored under CenterSnap/data/NOCS_Data.
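To quickly confirm that datapoints were actually written, a minimal check like the following can be used (the directory layout follows the DATASET_NAME and make_dataset paths in the script above; adjust it if yours differs):
import glob
import os

out_dir = "data/NOCS_Data"  # relative to the CenterSnap directory, as in the script
for split in ["CAMERA/train", "CAMERA/val", "Real/train", "Real/test"]:
    files = glob.glob(os.path.join(out_dir, split, "*"))
    print(split, ":", len(files), "files")
Each split should contain roughly one compressed datapoint file per successfully processed image.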
Conclusion
In this article, we explained how to create the intermediate files for training CenterSnap.
The generation script is long, and note that many parts of it have been modified to work with BlenderProc-generated data.
In the next article, we will explain how to actually train CenterSnap.