Introduction
In the previous article, we explained how to train the AutoEncoder.
In this article, we explain how to create the intermediate files needed to train CenterSnap.
Prerequisites
The prerequisites are as follows.
- Windows 11 (used only to prepare the 3D model)
- Ubuntu 22 (used for everything after model preparation)
- Python 3.10.x
- CloudCompare
- open3d == 0.16.0
- You have created the 3D model by following this article
- You have finished creating the scene
- You have installed bop_toolkit_lib and applied the program fixes by following this article
- You have finished creating the mask data
- You have finished creating the annotation data
- You have finished creating the object model information
- You have finished creating the ShapeNetCore HDF5 file
- You have finished creating the intermediate files for object-deformnet
Installing CenterSnap
Now let's install CenterSnap.
cd makeNOCS
git clone https://github.com/zubair-irshad/CenterSnap.git
cd CenterSnap
Edit requirements.txt so that it matches the following.
open3d
absl-py==0.13.0
backcall==0.2.0
boto3==1.18.1
botocore==1.21.1
cachetools==4.2.2
certifi==2021.5.30
charset-normalizer==2.0.3
click==8.0.1
colour==0.1.5
configparser==5.0.2
cycler==0.10.0
decorator==4.4.2
docker-pycreds==0.4.0
future==0.18.2
gitdb==4.0.7
GitPython==3.1.18
google-auth==1.33.0
google-auth-oauthlib==0.4.4
greenlet==1.1.0
grpcio==1.38.1
idna==3.2
#imageio==2.9.0
ipython==7.25.0
ipython-genutils==0.2.0
jedi==0.18.0
jmespath==0.10.0
kiwisolver==1.3.1
Markdown==3.3.4
matplotlib==3.1.3
matplotlib-inline==0.1.2
msgpack==1.0.2
networkx==2.5.1
numpy==1.23.5
oauthlib==3.1.1
#opencv-python==4.5.3.56
parso==0.8.2
pathtools==0.1.2
pexpect==4.8.0
pickleshare==0.7.5
#Pillow==8.3.1
promise==2.3
prompt-toolkit==3.0.19
protobuf==3.17.3
psutil==5.8.0
ptyprocess==0.7.0
pyasn1==0.4.8
pyasn1-modules==0.2.8
Pygments==2.9.0
pynvim==0.4.3
pyparsing==2.4.7
python-dateutil==2.8.2
pytorch-lightning==0.7.5
PyWavelets==1.1.1
PyYAML==5.4.1
requests==2.26.0
requests-oauthlib==1.3.0
rsa==4.7.2
s3transfer==0.5.0
scikit-image==0.18.2
#scipy==1.7.0
sentry-sdk==1.3.0
shortuuid==1.0.1
six==1.16.0
smmap==4.0.0
subprocess32==3.5.4
tensorboard==2.5.0
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.0
tifffile==2021.7.2
fvcore
#torch==1.7.1
#torchvision==0.8.2
tqdm==4.61.2
#traitlets==5.0.5
urllib3==1.26.6
wandb==0.11.0
wcwidth==0.2.5
#Werkzeug==2.0.1
ydiff==1.2
zstandard==0.15.2
plotly==5.6.0
argparse
h5py
python3 -m pip install -r requirements.txt
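Once the install finishes, you can optionally confirm which open3d build was resolved (this check is only a suggestion; the version string depends on what pip installed):
python3 -c "import open3d; print(open3d.__version__)"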
Also, create a symbolic link for CUDA.
sudo ln -s /usr/lib/wsl/lib/libcuda.so.1 /usr/local/cuda/lib64/libcuda.so
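To make sure PyTorch can actually see the GPU through this link, a quick check (assuming torch was already installed in the earlier articles) is:
python3 -c "import torch; print(torch.cuda.is_available())"
If this prints False, re-check the symlink path and your WSL CUDA setup.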
Creating the Intermediate Files
Next, let's create the intermediate files.
touch prepare_data/generate_data_nocs_custom.py
generate_data_nocs_custom.py
import os
import pathlib
import math
import glob
import cv2
import numpy as np
import torch
import torchvision.transforms as transforms
import _pickle as cPickle
from tqdm import tqdm
from utils.nocs_utils import load_depth, align_nocs_to_depth
from simnet.lib import camera
from simnet.lib import transform
from simnet.lib import datapoint
from simnet.lib.net.pre_processing import obb_inputs
from simnet.lib.depth_noise import DepthManager
from simnet.lib.net.models.auto_encoder import PointCloudAE
from PIL import Image
import argparse
import h5py
import json
from matplotlib import pyplot as plt
def create_img_list(data_dir):
""" Create train/val/test data list for CAMERA and Real. """
# CAMERA dataset
for subset in ['train', 'val']:
img_list = []
img_dir = os.path.join(data_dir, 'CAMERA', subset)
file_list = glob.glob(img_dir+"/000000/color/*.jpg")
for f in file_list:
basename = os.path.basename(f)
img_path = os.path.join(subset, "000000/color", basename)
img_list.append(img_path)
with open(os.path.join(data_dir, 'CAMERA', subset+'_list_all.txt'), 'w') as f:
for img_path in img_list:
f.write("%s\n" % img_path)
# Real dataset
for subset in ['train', 'test']:
img_list = []
img_dir = os.path.join(data_dir, 'Real', subset)
file_list = glob.glob(img_dir+"/000000/color/*.jpg")
for f in file_list:
basename = os.path.basename(f)
img_path = os.path.join(subset, "000000/color", basename)
img_list.append(img_path)
with open(os.path.join(data_dir, 'Real', subset+'_list_all.txt'), 'w') as f:
for img_path in img_list:
f.write("%s\n" % img_path)
print('Write all data paths to file done!')
def align_rotation(R):
""" Align rotations for symmetric objects.
Args:
sRT: 4 x 4
"""
theta_x = R[0, 0] + R[2, 2]
theta_y = R[0, 2] - R[2, 0]
r_norm = math.sqrt(theta_x**2 + theta_y**2)
s_map = np.array([[theta_x/r_norm, 0.0, -theta_y/r_norm],
[0.0, 1.0, 0.0 ],
[theta_y/r_norm, 0.0, theta_x/r_norm]])
rotation = R @ s_map
return rotation
def process_data(img_path, depth, filepath_dict):
""" Load instance masks for the objects in the image. """
# mask_path = img_path + '_mask.png'
mask_path = filepath_dict["mask"]
mask = cv2.imread(mask_path)[:, :, 2]
mask = np.array(mask, dtype=np.int32)
all_inst_ids = sorted(list(np.unique(mask)))
assert all_inst_ids[-1] == 255
del all_inst_ids[-1] # remove background
num_all_inst = len(all_inst_ids)
meta_path = filepath_dict["meta"]
with open(meta_path, 'r') as f:
i = 0
for line in f:
i += 1
num_all_inst = i
h, w = mask.shape
# coord_path = img_path + '_coord.png'
coord_path = filepath_dict["coord"]
with h5py.File(coord_path, "r") as h5:
coord_map = h5["nocs"]
coord_map = coord_map[:, :, :3]
# print(coord_map.shape)
# coord_map = cv2.imread(coord_path)[:, :, :3]
coord_map = coord_map[:, :, (2, 1, 0)]
# flip z axis of coord map
coord_map = np.array(coord_map, dtype=np.float32)# / 255
coord_map[:, :, 2] = 1 - coord_map[:, :, 2]
class_ids = []
instance_ids = []
model_list = []
masks = np.zeros([h, w, num_all_inst], dtype=np.uint8)
coords = np.zeros((h, w, num_all_inst, 3), dtype=np.float32)
bboxes = np.zeros((num_all_inst, 4), dtype=np.int32)
# meta_path = img_path + '_meta.txt'
meta_path = filepath_dict["meta"]
with open(meta_path, 'r') as f:
i = 0
j = 0
lock = False
for line in f:
line_info = line.strip().split(' ')
inst_id = int(line_info[0])
cls_id = int(line_info[1])
# # background objects and non-existing objects
# if cls_id == 0 or (inst_id not in all_inst_ids):
# continue
if len(line_info) == 3:
model_id = line_info[2] # Real scanned objs
else:
model_id = line_info[3] # CAMERA objs
temp_mask_path = filepath_dict["temp_mask"] + "_{:06}".format(j)+".png"
temp_mask = cv2.imread(temp_mask_path, 0)
_, temp_mask = cv2.threshold(temp_mask, 127, 255, cv2.THRESH_BINARY)
j += 1
# process foreground objects
# inst_mask = np.equal(mask, inst_id)
# inst_mask = np.equal(temp_mask, 1)
# plt.imshow(temp_mask)
# plt.show()
# bounding box
# horizontal_indicies = np.where(np.any(inst_mask, axis=0))[0]
# vertical_indicies = np.where(np.any(inst_mask, axis=1))[0]
horizontal_indicies = np.where(np.any(temp_mask, axis=0))[0]
vertical_indicies = np.where(np.any(temp_mask, axis=1))[0]
# assert horizontal_indicies.shape[0], print(temp_mask_path)
# if horizontal_indicies.shape[0] and lock == False:
# lock = True
# else:
# lock = False
# continue
#assert horizontal_indicies.shape[0], print(img_path)
# print(horizontal_indicies.shape[0])
if horizontal_indicies.shape[0] == 0:
continue
x1, x2 = horizontal_indicies[[0, -1]]
y1, y2 = vertical_indicies[[0, -1]]
# x2 and y2 should not be part of the box. Increment by 1.
x2 += 1
y2 += 1
# object occupies full image, rendering error, happens in CAMERA dataset
if np.any(np.logical_or((x2-x1) > 600, (y2-y1) > 440)):
print("occupies")
return None, None, None, None, None, None
# not enough valid depth observation
# final_mask = np.logical_and(inst_mask, depth > 0)
final_mask = np.logical_and(temp_mask, depth > 0)
if np.sum(final_mask) < 64:
print("summmmmmmmmmmmmmmmmmm")
continue
class_ids.append(cls_id)
instance_ids.append(inst_id)
model_list.append(model_id)
# masks[:, :, i] = inst_mask
# coords[:, :, i, :] = np.multiply(coord_map, np.expand_dims(inst_mask, axis=-1))
masks[:, :, i] = temp_mask
coords[:, :, i, :] = np.multiply(coord_map, np.expand_dims(temp_mask, axis=-1))
bboxes[i] = np.array([y1, x1, y2, x2])
i += 1
# no valid foreground objects
if i == 0:
return None, None, None, None, None, None
masks = masks[:, :, :i]
coords = np.clip(coords[:, :, :i, :], 0, 1)
bboxes = bboxes[:i, :]
return masks, coords, class_ids, instance_ids, model_list, bboxes
def annotate_camera_train(data_dir, estimator):
DATASET_NAME = 'NOCS_Data'
DATASET_DIR = pathlib.Path(f'data/{DATASET_NAME}')
DATASET_DIR.mkdir(parents=True, exist_ok=True)
_DATASET = datapoint.make_dataset(f'file://{DATASET_DIR}/CAMERA/train')
_camera = camera.NOCS_Camera()
""" Generate gt labels for CAMERA train data. """
camera_train = open(os.path.join(data_dir, 'CAMERA', 'train_list_all.txt')).read().splitlines()
# intrinsics = np.array([[577.5, 0, 319.5], [0, 577.5, 239.5], [0, 0, 1]])
intrinsics = np.array([[572.411363389757, 0.0, 325.2611083984375], [0.0, 573.5704328585578, 242.04899588216654], [0.0, 0.0, 1.0]])
#TEST MODELS
obj_model_dir = os.path.join(data_dir, 'models_obj')
with open(os.path.join(obj_model_dir, 'camera_train.pkl'), 'rb') as f:
obj_models = cPickle.load(f)
model_sizes = {}
for key in obj_models.keys():
model_sizes[key] = 2 * np.amax(np.abs(obj_models[key]), axis=0)
# i = len(camera_train)//20
# num = 11
# camera_train = camera_train[(num)*i:(num+1)*i]
# get scene info
gt_info_path = os.path.join(data_dir, 'CAMERA', "train/000000", "scene_gt.json")
with open(gt_info_path, "r") as f:
gt_info_data = json.load(f)
valid_img_list = []
for img_path in tqdm(camera_train):
# img_full_path = os.path.join(data_dir, 'CAMERA', img_path)
img_full_path = os.path.join(data_dir, 'CAMERA', "train/000000")
basename = os.path.basename(img_path)[:-4]
# get gt info data
gt_info = gt_info_data[str(int(basename))]
# depth_composed_path = img_path+'_composed.png'
depth_full_path = os.path.join(data_dir,'camera_full_depths', basename+".png")
#print("depth_full_path", depth_full_path)
all_exist = os.path.exists(img_full_path + '/color/'+ basename +'.jpg') and \
os.path.exists(img_full_path + '/coord/'+ str(int(basename)) +'.hdf5') and \
os.path.exists(img_full_path + '/depth/'+ basename +'.png') and \
os.path.exists(img_full_path + '/mask/'+ basename +'_mask.png') and \
os.path.exists(img_full_path + '/meta/'+ basename[2:] +'_meta.txt')
filepath_dict = {}
filepath_dict["color"] = img_full_path + '/color/'+ basename +'.jpg'
filepath_dict["coord"] = img_full_path + '/coord/'+ str(int(basename)) +'.hdf5'
filepath_dict["depth"] = img_full_path + '/depth/'+ basename +'.png'
filepath_dict["mask"] = img_full_path + '/mask/'+ basename +'_mask.png'
filepath_dict["temp_mask"] = img_full_path + '/mask_no_use/mask/' + basename
filepath_dict["meta"] = img_full_path + '/meta/'+ basename[2:] +'_meta.txt'
if not all_exist:
print("no exist", filepath_dict)
continue
depth = load_depth(depth_full_path)
masks, coords, class_ids, instance_ids, model_list, bboxes = process_data(img_full_path, depth, filepath_dict)
# print(masks.shape, coords.shape, class_ids, instance_ids, model_list, bboxes)
if instance_ids is None:
print("instance_ids", instance_ids)
continue
# Umeyama alignment of GT NOCS map with depth image
scales, rotations, translations, error_messages, _ = \
align_nocs_to_depth(masks, coords, depth, intrinsics, instance_ids, img_path, gt_info)
if error_messages:
print("error_messages")
continue
### GET CENTERPOINT DATAPOINTS
#get latent embeddings
model_points = [obj_models["00"+model_list[i]].astype(np.float32) for i in range(len(class_ids))]
latent_embeddings = get_latent_embeddings(model_points, estimator)
#get poses
abs_poses=[]
seg_mask = np.zeros([_camera.height, _camera.width])
masks_list = []
nocs_dir = os.path.join(data_dir)
nocs_path = os.path.join(nocs_dir, 'CAMERA/val/000000/gts', 'results_000000_{}.pkl'.format(img_path.split('/')[-1][2:-4]))
with open(nocs_path, 'rb') as f:
nocs = cPickle.load(f)
gt_class_ids = nocs['gt_class_ids']
gt_bboxes = nocs['gt_bboxes']
gt_sRT = nocs['gt_RTs'][1:]
gt_handle_visibility = nocs['gt_handle_visibility']
map_to_nocs = []
num_insts = len(instance_ids)
for i in range(num_insts):
for j in range(len(gt_class_ids)):
if gt_class_ids[j] != class_ids[i]:
continue
# if np.sum(np.abs(bboxes[i] - gt_bboxes[j])) > 5:
# continue
# match found
gt_match = j
break
# check match validity
assert gt_match > -1, print(img_path, instance_ids[i], 'no match for instance')
# assert gt_match not in map_to_nocs, print(img_path, instance_ids[i], 'duplicate match')
# map_to_nocs.append(gt_match)
map_to_nocs.append(i)
sizes = np.zeros((num_insts, 3))
poses = np.zeros((num_insts, 4, 4))
scales = np.zeros(num_insts)
rotations = np.zeros((num_insts, 3, 3))
translations = np.zeros((num_insts, 3))
for i in range(num_insts):
gt_idx = map_to_nocs[i]
sizes[i] = model_sizes["00"+model_list[i]]
sRT = gt_sRT[gt_idx]
s = np.cbrt(np.linalg.det(sRT[:3, :3]))
R = sRT[:3, :3] / s
T = sRT[:3, 3]
# # re-label mug category
# if class_ids[i] == 6:
# T0 = mug_meta[model_list[i]][0]
# s0 = mug_meta[model_list[i]][1]
# T = T - s * R @ T0
# s = s / s0
# used for test during training
scales[i] = s
rotations[i] = R
translations[i] = T
# used for evaluation
sRT = np.identity(4, dtype=np.float32)
sRT[:3, :3] = s * R
sRT[:3, 3] = T
poses[i] = sRT
for i in range(len(class_ids)):
R = rotations[i]
T = translations[i]
s = scales[i]
sym_ids = [0, 1, 3]
cat_id = np.array(class_ids)[i] - 1
if cat_id in sym_ids:
R = align_rotation(R)
scale_matrix = np.eye(4)
scale_mat = s*np.eye(3, dtype=float)
scale_matrix[0:3, 0:3] = scale_mat
camera_T_object = np.eye(4)
camera_T_object[:3,:3] = R
camera_T_object[:3,3] = T
seg_mask[masks[:,:,i] > 0] = np.array(class_ids)[i]
masks_list.append(masks[:,:,i])
abs_poses.append(transform.Pose(camera_T_object=camera_T_object, scale_matrix=scale_matrix))
obb_datapoint = obb_inputs.compute_nocs_network_targets(masks_list, latent_embeddings, abs_poses,_camera.height, _camera.width)
# color_img = cv2.imread(img_full_path + '_color.png')
color_img = cv2.imread(filepath_dict["color"])
colorjitter = transforms.ColorJitter(0.5, 0.3, 0.5, 0.05)
rgb_img = colorjitter(Image.fromarray(color_img))
jitter_img = colorjitter(rgb_img)
color_img = np.asarray(jitter_img)
depth_array = np.array(depth, dtype=np.float32)/255.0
DM = DepthManager()
noisy_depth = DM.prepare_depth_data(depth_array)
stereo_datapoint = datapoint.Stereo(left_color=color_img, right_color=noisy_depth)
panoptic_datapoint = datapoint.Panoptic(
stereo=stereo_datapoint,
depth=noisy_depth,
segmentation=seg_mask,
object_poses=[obb_datapoint],
boxes=[],
detections=[]
)
_DATASET.write(panoptic_datapoint)
### Finish writing datapoint
def annotate_real_train(data_dir, estimator):
DATASET_NAME = 'NOCS_Data'
DATASET_DIR = pathlib.Path(f'data/{DATASET_NAME}')
DATASET_DIR.mkdir(parents=True, exist_ok=True)
_DATASET = datapoint.make_dataset(f'file://{DATASET_DIR}/Real/train')
""" Generate gt labels for Real train data through PnP. """
_camera = camera.NOCS_Real()
real_train = open(os.path.join(data_dir, 'Real/train_list_all.txt')).read().splitlines()
# intrinsics = np.array([[591.0125, 0, 322.525], [0, 590.16775, 244.11084], [0, 0, 1]])
intrinsics = np.array([[572.411363389757, 0.0, 325.2611083984375], [0.0, 573.5704328585578, 242.04899588216654], [0.0, 0.0, 1.0]])
# scale factors for all instances
scale_factors = {}
path_to_size = glob.glob(os.path.join(data_dir, 'models_obj', '*_norm.txt'))
for inst_path in sorted(path_to_size):
instance = os.path.basename(inst_path).split('.')[0]
bbox_dims = np.loadtxt(inst_path)
scale_factors[instance[4:10]] = np.linalg.norm(bbox_dims)
#TEST MODELS
obj_model_dir = os.path.join(data_dir, 'models_obj')
with open(os.path.join(obj_model_dir, 'real_train.pkl'), 'rb') as f:
obj_models = cPickle.load(f)
# get scene info
# gt_info_path = os.path.join(data_dir, 'Real', "train/000000", "scene_gt.json")
# with open(gt_info_path, "r") as f:
# gt_info_data = json.load(f)
valid_img_list = []
# real_train = np.array(real_train)[np.array([2, 500, 1000, 1500, 1700, 1300, 2000, 2300, 2350, 2750])].tolist()
for img_path in tqdm(real_train):
# img_full_path = os.path.join(data_dir, 'Real', img_path)
img_full_path = os.path.join(data_dir, 'Real', "train/000000")
basename = os.path.basename(img_path)[:-4]
# # get gt info data
# gt_info = gt_info_data[str(int(basename))]
# depth_composed_path = img_path+'_composed.png'
depth_full_path = os.path.join(data_dir,'camera_full_depths', basename+".png")
#print("depth_full_path", depth_full_path)
all_exist = os.path.exists(img_full_path + '/color/'+ basename +'.jpg') and \
os.path.exists(img_full_path + '/coord/'+ str(int(basename)) +'.hdf5') and \
os.path.exists(img_full_path + '/depth/'+ basename +'.png') and \
os.path.exists(img_full_path + '/mask/'+ basename +'_mask.png') and \
os.path.exists(img_full_path + '/meta/'+ basename[2:] +'_meta.txt')
filepath_dict = {}
filepath_dict["color"] = img_full_path + '/color/'+ basename +'.jpg'
filepath_dict["coord"] = img_full_path + '/coord/'+ str(int(basename)) +'.hdf5'
filepath_dict["depth"] = img_full_path + '/depth/'+ basename +'.png'
filepath_dict["mask"] = img_full_path + '/mask/'+ basename +'_mask.png'
filepath_dict["temp_mask"] = img_full_path + '/mask_no_use/mask/' + basename
filepath_dict["meta"] = img_full_path + '/meta/'+ basename[2:] +'_meta.txt'
if not all_exist:
print("no exist", filepath_dict)
continue
depth = load_depth(depth_full_path)
masks, coords, class_ids, instance_ids, model_list, bboxes = process_data(img_full_path, depth, filepath_dict)
if instance_ids is None:
continue
# compute pose
num_insts = len(class_ids)
scales = np.zeros(num_insts)
rotations = np.zeros((num_insts, 3, 3))
translations = np.zeros((num_insts, 3))
for i in range(num_insts):
s = scale_factors["00"+model_list[i]]
mask = masks[:, :, i]
idxs = np.where(mask)
coord = coords[:, :, i, :]
coord_pts = s * (coord[idxs[0], idxs[1], :] - 0.5)
coord_pts = coord_pts[:, :, None]
img_pts = np.array([idxs[1], idxs[0]]).transpose()
img_pts = img_pts[:, :, None].astype(float)
distCoeffs = np.zeros((4, 1)) # no distoration
retval, rvec, tvec = cv2.solvePnP(coord_pts, img_pts, intrinsics, distCoeffs)
assert retval
R, _ = cv2.Rodrigues(rvec)
T = np.squeeze(tvec)
# # re-label for mug category
# if class_ids[i] == 6:
# T0 = mug_meta[model_list[i]][0]
# s0 = mug_meta[model_list[i]][1]
# T = T - s * R @ T0
# s = s / s0
scales[i] = s
rotations[i] = R
translations[i] = T
### GET CENTERPOINT DATAPOINTS
#get latent embeddings
model_points = [obj_models["obj_00"+model_list[i]].astype(np.float32) for i in range(len(class_ids))]
latent_embeddings = get_latent_embeddings(model_points, estimator)
#get poses
abs_poses=[]
class_num=1
seg_mask = np.zeros([_camera.height, _camera.width])
masks_list = []
for i in range(len(class_ids)):
R = rotations[i]
T = translations[i]
s = scales[i]
sym_ids = [0, 1, 3]
cat_id = np.array(class_ids)[i] - 1
if cat_id in sym_ids:
R = align_rotation(R)
scale_matrix = np.eye(4)
scale_mat = s*np.eye(3, dtype=float)
scale_matrix[0:3, 0:3] = scale_mat
camera_T_object = np.eye(4)
camera_T_object[:3,:3] = R
camera_T_object[:3,3] = T
seg_mask[masks[:,:,i] > 0] = np.array(class_ids)[i]
class_num += 1
masks_list.append(masks[:,:,i])
abs_poses.append(transform.Pose(camera_T_object=camera_T_object, scale_matrix=scale_matrix))
obb_datapoint = obb_inputs.compute_nocs_network_targets(masks_list, latent_embeddings, abs_poses,_camera.height, _camera.width)
# color_img = cv2.imread(img_full_path + '_color.png')
color_img = cv2.imread(filepath_dict["color"])
colorjitter = transforms.ColorJitter(0.8, 0.5, 0.5, 0.05)
rgb_img = colorjitter(Image.fromarray(color_img))
jitter_img = colorjitter(rgb_img)
color_img = np.asarray(jitter_img)
depth_array = np.array(depth, dtype=np.float32)/255.0
DM = DepthManager()
noisy_depth = DM.prepare_depth_data(depth_array)
stereo_datapoint = datapoint.Stereo(left_color=color_img, right_color=noisy_depth)
panoptic_datapoint = datapoint.Panoptic(
stereo=stereo_datapoint,
depth=depth_array,
segmentation=seg_mask,
object_poses=[obb_datapoint],
boxes=[],
detections=[]
)
_DATASET.write(panoptic_datapoint)
### Finish writing datapoint
def annotate_test_data(data_dir, estimator, source, subset):
""" Generate gt labels for test data.
Properly copy handle_visibility provided by NOCS gts.
"""
DATASET_NAME = 'NOCS_Data'
DATASET_DIR = pathlib.Path(f'data/{DATASET_NAME}')
DATASET_DIR.mkdir(parents=True, exist_ok=True)
_DATASET = datapoint.make_dataset(f'file://{DATASET_DIR}/{source}/{subset}')
_camera = camera.NOCS_Real()
camera_val = open(os.path.join(data_dir, 'CAMERA', 'val_list_all.txt')).read().splitlines()
real_test = open(os.path.join(data_dir, 'Real', 'test_list_all.txt')).read().splitlines()
# camera_intrinsics = np.array([[577.5, 0, 319.5], [0, 577.5, 239.5], [0, 0, 1]])
# real_intrinsics = np.array([[591.0125, 0, 322.525], [0, 590.16775, 244.11084], [0, 0, 1]])
camera_intrinsics = np.array([[572.411363389757, 0.0, 325.2611083984375], [0.0, 573.5704328585578, 242.04899588216654], [0.0, 0.0, 1.0]])
real_intrinsics = np.array([[572.411363389757, 0.0, 325.2611083984375], [0.0, 573.5704328585578, 242.04899588216654], [0.0, 0.0, 1.0]])
# compute model size
model_file_path = ['models_obj/camera_val.pkl', 'models_obj/real_test.pkl']
models = {}
for path in model_file_path:
with open(os.path.join(data_dir, path), 'rb') as f:
models.update(cPickle.load(f))
model_sizes = {}
for key in models.keys():
model_sizes[key] = 2 * np.amax(np.abs(models[key]), axis=0)
#TEST MODELS
obj_model_dir = os.path.join(data_dir, 'models_obj')
with open(os.path.join(obj_model_dir, 'real_test.pkl'), 'rb') as f:
obj_models = cPickle.load(f)
if source == 'CAMERA':
subset_meta = [('CAMERA', camera_val, camera_intrinsics, 'val')]
subset_temp = "val"
else:
subset_meta = [('Real', real_test, real_intrinsics, 'test')]
subset_temp = "test"
# subset_meta = [('CAMERA', camera_val, camera_intrinsics, 'val'), ('Real', real_test, real_intrinsics, 'test')]
for source, img_list, intrinsics, subset in subset_meta:
valid_img_list = []
# img_list = np.array(img_list)[np.array([2, 500, 1000, 1500, 1700, 1300, 2000, 2300, 2350, 2750])].tolist()
for img_path in tqdm(img_list):
# img_full_path = os.path.join(data_dir, source, img_path)
img_full_path = os.path.join(data_dir, source, subset_temp, "000000")
basename = os.path.basename(img_path)[:-4]
# # get gt info data
# gt_info = gt_info_data[str(int(basename))]
# depth_composed_path = img_path+'_composed.png'
depth_full_path_real = os.path.join(data_dir,'camera_full_depths', basename+".png")
# print("depth_full_path", depth_full_path)
all_exist = os.path.exists(img_full_path + '/color/'+ basename +'.jpg') and \
os.path.exists(img_full_path + '/coord/'+ str(int(basename)) +'.hdf5') and \
os.path.exists(img_full_path + '/depth/'+ basename +'.png') and \
os.path.exists(img_full_path + '/mask/'+ basename +'_mask.png') and \
os.path.exists(img_full_path + '/meta/'+ basename[2:] +'_meta.txt')
filepath_dict = {}
filepath_dict["color"] = img_full_path + '/color/'+ basename +'.jpg'
filepath_dict["coord"] = img_full_path + '/coord/'+ str(int(basename)) +'.hdf5'
filepath_dict["depth"] = img_full_path + '/depth/'+ basename +'.png'
filepath_dict["mask"] = img_full_path + '/mask/'+ basename +'_mask.png'
filepath_dict["temp_mask"] = img_full_path + '/mask_no_use/mask/' + basename
filepath_dict["meta"] = img_full_path + '/meta/'+ basename[2:] +'_meta.txt'
if not all_exist:
print("no exist", filepath_dict)
continue
# depth_full_path_real = img_full_path + '_depth.png'
depth = load_depth(depth_full_path_real)
masks, coords, class_ids, instance_ids, model_list, bboxes = process_data(img_full_path, depth, filepath_dict)
if instance_ids is None:
continue
num_insts = len(instance_ids)
# match each instance with NOCS ground truth to properly assign gt_handle_visibility
# nocs_dir = os.path.join(os.path.dirname(data_dir), 'results/nocs_results')
nocs_dir = os.path.join(data_dir)
if source == 'CAMERA':
nocs_path = os.path.join(nocs_dir, 'CAMERA/val/000000/gts', 'results_000000_{}.pkl'.format(img_path.split('/')[-1][2:-4]))
else:
nocs_path = os.path.join(nocs_dir, 'Real/test/000000/gts', 'results_000000_{}.pkl'.format(img_path.split('/')[-1][2:-4]))
with open(nocs_path, 'rb') as f:
nocs = cPickle.load(f)
gt_class_ids = nocs['gt_class_ids']
gt_bboxes = nocs['gt_bboxes']
gt_sRT = nocs['gt_RTs'][1:]
gt_handle_visibility = nocs['gt_handle_visibility']
map_to_nocs = []
for i in range(num_insts):
for j in range(len(gt_class_ids)):
if gt_class_ids[j] != class_ids[i]:
continue
# if np.sum(np.abs(bboxes[i] - gt_bboxes[j])) > 5:
# continue
# match found
gt_match = j
break
# check match validity
assert gt_match > -1, print(img_path, instance_ids[i], 'no match for instance')
# assert gt_match not in map_to_nocs, print(img_path, instance_ids[i], 'duplicate match')
# map_to_nocs.append(gt_match)
map_to_nocs.append(i)
# copy from ground truth, re-label for mug category
handle_visibility = gt_handle_visibility[map_to_nocs]
sizes = np.zeros((num_insts, 3))
poses = np.zeros((num_insts, 4, 4))
scales = np.zeros(num_insts)
rotations = np.zeros((num_insts, 3, 3))
translations = np.zeros((num_insts, 3))
for i in range(num_insts):
gt_idx = map_to_nocs[i]
sizes[i] = model_sizes["00"+model_list[i]]
sRT = gt_sRT[gt_idx]
s = np.cbrt(np.linalg.det(sRT[:3, :3]))
R = sRT[:3, :3] / s
T = sRT[:3, 3]
# # re-label mug category
# if class_ids[i] == 6:
# T0 = mug_meta[model_list[i]][0]
# s0 = mug_meta[model_list[i]][1]
# T = T - s * R @ T0
# s = s / s0
# used for test during training
scales[i] = s
rotations[i] = R
translations[i] = T
# used for evaluation
sRT = np.identity(4, dtype=np.float32)
sRT[:3, :3] = s * R
sRT[:3, 3] = T
poses[i] = sRT
### GET CENTERPOINT DATAPOINTS
#get latent embeddings
model_points = [models["00"+model_list[i]].astype(np.float32) for i in range(len(class_ids))]
latent_embeddings = get_latent_embeddings(model_points, estimator)
#get poses
abs_poses=[]
class_num=1
seg_mask = np.zeros([_camera.height, _camera.width])
masks_list = []
for i in range(len(class_ids)):
R = rotations[i]
T = translations[i]
s = scales[i]
sym_ids = [0, 1, 3]
cat_id = np.array(class_ids)[i] - 1
if cat_id in sym_ids:
R = align_rotation(R)
scale_matrix = np.eye(4)
scale_mat = s*np.eye(3, dtype=float)
scale_matrix[0:3, 0:3] = scale_mat
camera_T_object = np.eye(4)
camera_T_object[:3,:3] = R
camera_T_object[:3,3] = T
seg_mask[masks[:,:,i] > 0] = np.array(class_ids)[i]
class_num += 1
masks_list.append(masks[:,:,i])
abs_poses.append(transform.Pose(camera_T_object=camera_T_object, scale_matrix=scale_matrix))
obb_datapoint = obb_inputs.compute_nocs_network_targets(masks_list, latent_embeddings, abs_poses,_camera.height, _camera.width)
# color_img = cv2.imread(img_full_path + '_color.png')
color_img = cv2.imread(filepath_dict["color"])
depth_array = np.array(depth, dtype=np.float32)/255.0
DM = DepthManager()
noisy_depth = DM.prepare_depth_data(depth_array)
stereo_datapoint = datapoint.Stereo(left_color=color_img, right_color=noisy_depth)
panoptic_datapoint = datapoint.Panoptic(
stereo=stereo_datapoint,
depth=noisy_depth,
segmentation=seg_mask,
object_poses=[obb_datapoint],
boxes=[],
detections=[]
)
_DATASET.write(panoptic_datapoint)
### Finish writing datapoint
def get_latent_embeddings(point_clouds, estimator):
latent_embeddings =[]
for i in range(len(point_clouds)):
batch_xyz = torch.from_numpy(point_clouds[i]).to(device="cuda", dtype=torch.float)
batch_xyz = batch_xyz.unsqueeze(0)
emb, _ = estimator(batch_xyz)
emb = emb.squeeze(0).cpu().detach().numpy()
latent_embeddings.append(emb)
return latent_embeddings
if __name__ == '__main__':
data_dir = "/path/to/makeNOCS-raspi/makeNOCS/output_data/bop_data/lm"
emb_dim = 128
# emb_dim = 512
n_pts = 2048
# n_pts = 1024
n_cat = 1
model_path = os.path.join(data_dir, 'auto_encoder_model', 'model_100.pth')
estimator = PointCloudAE(emb_dim, n_pts)
estimator.cuda()
estimator.load_state_dict(torch.load(model_path))
estimator.eval()
print("Generating image lists")
create_img_list(data_dir)
print("Image lists generated...\n")
print("Generating Camera Train data...")
annotate_camera_train(data_dir, estimator)
print("Generating Real Train data...")
annotate_real_train(data_dir, estimator)
print("Generating Camera Val data...")
annotate_test_data(data_dir, estimator, 'CAMERA', 'val')
print("Generating Real Test data...")
annotate_test_data(data_dir, estimator, 'Real', 'test')
Run the script above.
cd makeNOCS/CenterSnap
python3 -m prepare_data.generate_data_nocs_custom
The binary data (zstd format) will be stored under CenterSnap/data/NOCS_Data.
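To quickly confirm that datapoints were actually written, a minimal check like the following can be used (the directory layout follows the DATASET_NAME and make_dataset paths in the script above; adjust it if yours differs):
import glob
import os

out_dir = "data/NOCS_Data"  # relative to the CenterSnap directory, as in the script
for split in ["CAMERA/train", "CAMERA/val", "Real/train", "Real/test"]:
    files = glob.glob(os.path.join(out_dir, split, "*"))
    print(split, ":", len(files), "files")
Each split should contain roughly one compressed datapoint file per successfully processed image.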
Conclusion
In this article, we explained how to create the intermediate files for training CenterSnap.
The generation script is long, and note that many parts of it have been modified to work with BlenderProc-generated data.
In the next article, we will explain how to actually train CenterSnap.