import numpy as np


class ActionFeature(object):
    """Action feature indices for human and robot hand parameters.

    Defines the start and end indices for different hand feature components
    in the concatenated action feature vector.
    """

    ALL_FEATURES = (0, 192)

    # Per-hand layout: 3 translation + 3 rotation + 45 joint angles = 51 values.
    HUMAN_LEFT_HAND = (0, 51)
    HUMAN_RIGHT_HAND = (51, 102)
    HUMAN_LEFT_TRANS = (0, 3)
    HUMAN_LEFT_ROT = (3, 6)
    HUMAN_LEFT_6D = (0, 6)
    HUMAN_LEFT_JOINTS = (6, 51)
    HUMAN_RIGHT_TRANS = (51, 54)
    HUMAN_RIGHT_ROT = (54, 57)
    HUMAN_RIGHT_6D = (51, 57)
    HUMAN_RIGHT_JOINTS = (57, 102)
    PADDING_FEATURES = (102, 192)
    @classmethod
    def get_concatenated_action_feature_from_dict(cls, action_feature_dict):
        """Concatenate action features from a dictionary into a single feature vector.

        Args:
            action_feature_dict: Dictionary mapping feature names (the index
                constants above) to arrays of shape (batch_size, dim)

        Returns:
            Tuple of (features, feature_mask) where features is the concatenated
            array and feature_mask indicates which features are present
        """
        batch_size = next(iter(action_feature_dict.values())).shape[0]
        features = np.zeros((batch_size, cls.ALL_FEATURES[1]), dtype=np.float32)
        feature_mask = np.zeros((batch_size, cls.ALL_FEATURES[1]), dtype=bool)

        for key, value in action_feature_dict.items():
            assert len(value.shape) == 2
            start, end = getattr(cls, key)
            # A value may fill only a prefix of its slot; the mask records how much.
            k = value.shape[1]
            assert k <= end - start
            features[:, start:start + k] = value
            feature_mask[:, start:start + k] = True
        return features, feature_mask

    @classmethod
    def get_dict_from_concatenated_action_feature(cls, feature, feature_mask):
        """Extract action features from a concatenated vector into a dictionary.

        Args:
            feature: Concatenated feature array
            feature_mask: Boolean mask indicating which features are present

        Returns:
            Dictionary mapping feature names to their extracted values
        """
        action_feature_dict = {}
        # Gather every index-range constant except the ALL_FEATURES aggregate.
        consts = {
            name: getattr(cls, name)
            for name in dir(cls)
            if name.isupper() and "ALL" not in name
        }
        for key, (start, end) in consts.items():
            # The mask is assumed uniform across the batch, so row 0 suffices.
            k = np.sum(feature_mask[0, start:end])
            if k == 0:
                continue
            action_feature_dict[key] = feature[:, start:start + k]
        return action_feature_dict
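
    # Round-trip sketch (hypothetical values): packing a single sub-feature and
    # unpacking it recovers that key plus any overlapping ranges the mask covers.
    #
    #   feats, mask = ActionFeature.get_concatenated_action_feature_from_dict(
    #       {"HUMAN_LEFT_6D": np.zeros((4, 6), dtype=np.float32)})
    #   out = ActionFeature.get_dict_from_concatenated_action_feature(feats, mask)
    #   # out contains "HUMAN_LEFT_6D" along with sub-ranges the mask also
    #   # marks, e.g. "HUMAN_LEFT_TRANS" and "HUMAN_LEFT_ROT".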

    @classmethod
    def get_loss_components(cls, action_type='angle'):
        """Get loss component definitions for different action types.

        Uses the existing feature index constants to avoid hardcoding numbers.

        Args:
            action_type: 'angle' or 'keypoints'

        Returns:
            dict: Dictionary mapping component names to (start, end, weight) tuples
        """
        if action_type == 'angle':
            return {
                'left_hand_6d': (*cls.HUMAN_LEFT_6D, 1.0),
                'left_hand_joints': (*cls.HUMAN_LEFT_JOINTS, 1.0),
                'right_hand_6d': (*cls.HUMAN_RIGHT_6D, 1.0),
                'right_hand_joints': (*cls.HUMAN_RIGHT_JOINTS, 1.0),
            }
        elif action_type == 'keypoints':
            # Keypoint joints take 63 values per hand (presumably 21 keypoints
            # x 3 coordinates), starting where the 6D pose block ends.
            left_joints_start = cls.HUMAN_LEFT_6D[1]
            left_joints_end = left_joints_start + 63
            right_joints_start = cls.HUMAN_RIGHT_6D[1]
            right_joints_end = right_joints_start + 63

            return {
                'left_6d': (*cls.HUMAN_LEFT_6D, 1.0),
                'left_joints': (left_joints_start, left_joints_end, 1.0),
                'right_6d': (*cls.HUMAN_RIGHT_6D, 1.0),
                'right_joints': (right_joints_start, right_joints_end, 1.0),
            }
        else:
            raise ValueError(f"Unknown action type: {action_type}")
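
    # Usage sketch (hypothetical pred/target arrays): each (start, end, weight)
    # tuple slices the action vector for a per-component weighted loss.
    #
    #   components = ActionFeature.get_loss_components('angle')
    #   loss = sum(
    #       w * np.mean((pred[:, s:e] - target[:, s:e]) ** 2)
    #       for s, e, w in components.values()
    #   )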

    @classmethod
    def get_hand_group_mapping(cls, action_type='angle'):
        """Get mapping from hand groups to loss components for weighted averaging.

        Returns:
            dict: Dictionary mapping hand group names to lists of component
                names, matching the keys returned by get_loss_components
        """
        if action_type == 'angle':
            return {
                'left_hand': ['left_hand_6d', 'left_hand_joints'],
                'right_hand': ['right_hand_6d', 'right_hand_joints'],
            }
        elif action_type == 'keypoints':
            return {
                'left_hand': ['left_6d', 'left_joints'],
                'right_hand': ['right_6d', 'right_joints'],
            }
        else:
            raise ValueError(f"Unknown action type: {action_type}")

    @classmethod
    def get_xhand_loss_components(cls):
        """Get loss components specific to the XHand dataset."""
        return {
            'left_hand_6d': (*cls.HUMAN_LEFT_6D, 1.6),
            'left_hand_joints': (*cls.HUMAN_LEFT_JOINTS, 0.4),
            'right_hand_6d': (*cls.HUMAN_RIGHT_6D, 1.6),
            'right_hand_joints': (*cls.HUMAN_RIGHT_JOINTS, 0.4),
        }


class StateFeature(ActionFeature):
    """Extended feature indices including state features like hand shape parameters (beta).

    Inherits from ActionFeature and adds additional state-specific features.
    """

    ALL_FEATURES = (0, 212)
    HUMAN_LEFT_BETA = (192, 202)
    HUMAN_RIGHT_BETA = (202, 212)


def calculate_fov(h, w, intrinsics):
    """Calculate horizontal and vertical field of view (FOV) from camera intrinsics.

    Args:
        h: Image height in pixels
        w: Image width in pixels
        intrinsics: 3x3 camera intrinsic matrix

    Returns:
        fov: np.array of shape (2,) containing horizontal and vertical FOV in radians
    """
    hfov = 2 * np.arctan(w / (2 * intrinsics[0][0]))
    vfov = 2 * np.arctan(h / (2 * intrinsics[1][1]))
    return np.array([hfov, vfov], dtype=np.float32)
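
# Example (hypothetical camera): with w = 640, h = 480 and fx = fy = 500,
#   hfov = 2 * arctan(640 / 1000) ~= 1.14 rad (~65 deg)
#   vfov = 2 * arctan(480 / 1000) ~= 0.90 rad (~51 deg)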


def compute_new_intrinsics_crop(original_intrinsics, original_size, crop_size, resize_size):
    """Compute new camera intrinsics after a center square crop and resize.

    Args:
        original_intrinsics: Original 3x3 camera intrinsic matrix
        original_size: Original image size (single dimension; assumes a square image)
        crop_size: Size of the square center crop
        resize_size: Target size after resizing

    Returns:
        Updated 3x3 intrinsic matrix accounting for the crop and resize
    """
    original_fx = original_intrinsics[0][0]
    original_fy = original_intrinsics[1][1]
    original_cx = original_intrinsics[0][2]
    original_cy = original_intrinsics[1][2]

    # A center crop shifts the principal point by the crop offset.
    crop_offset = (original_size - crop_size) / 2
    cropped_cx = original_cx - crop_offset
    cropped_cy = original_cy - crop_offset

    # Resizing scales the focal lengths and principal point uniformly.
    scale = resize_size / crop_size
    new_fx = original_fx * scale
    new_fy = original_fy * scale
    new_cx = cropped_cx * scale
    new_cy = cropped_cy * scale

    return np.array([
        [new_fx, 0, new_cx],
        [0, new_fy, new_cy],
        [0, 0, 1]
    ])
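
# Example (hypothetical values): a 640x640 image center-cropped to 480 and
# resized to 224 gives crop_offset = 80 and scale = 224 / 480 ~= 0.467, so a
# principal point at (320, 320) maps to (240, 240), then to (112, 112).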


def compute_new_intrinsics_resize(original_intrinsics, resize_size):
    """Compute new camera intrinsics after a resize operation.

    Args:
        original_intrinsics: Original 3x3 camera intrinsic matrix
        resize_size: Target size as an (H, W) tuple

    Returns:
        Updated 3x3 intrinsic matrix accounting for the resize
    """
    original_fx = original_intrinsics[0][0]
    original_fy = original_intrinsics[1][1]
    original_cx = original_intrinsics[0][2]
    original_cy = original_intrinsics[1][2]

    H, W = resize_size

    # The original image size is not passed in, so it is recovered from the
    # principal point under the assumption that it sits at the image center
    # (original width = 2 * cx, original height = 2 * cy).
    scale_x = W / (2 * original_cx)
    scale_y = H / (2 * original_cy)

    new_fx = original_fx * scale_x
    new_fy = original_fy * scale_y
    new_cx = original_cx * scale_x
    new_cy = original_cy * scale_y

    return np.array([
        [new_fx, 0, new_cx],
        [0, new_fy, new_cy],
        [0, 0, 1]
    ])
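

if __name__ == "__main__":
    # Minimal smoke test; a sketch with illustrative values only.
    feats, mask = ActionFeature.get_concatenated_action_feature_from_dict(
        {"HUMAN_LEFT_6D": np.zeros((2, 6), dtype=np.float32)}
    )
    assert feats.shape == (2, ActionFeature.ALL_FEATURES[1])

    # Hypothetical 640x640 camera: f = 500 px, principal point at the center.
    K = np.array([
        [500.0, 0.0, 320.0],
        [0.0, 500.0, 320.0],
        [0.0, 0.0, 1.0]
    ])
    print("fov (rad):", calculate_fov(640, 640, K))
    print("crop+resize K:\n", compute_new_intrinsics_crop(K, 640, 480, 224))
    print("resize K:\n", compute_new_intrinsics_resize(K, (224, 224)))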