openpangu-vl support video input (#34134)
Signed-off-by: hujiaxin <524446785@qq.com> Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> Co-authored-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -747,3 +747,90 @@ class Molmo2VideoBackend(VideoLoader):
|
||||
**kwargs,
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
@VIDEO_LOADER_REGISTRY.register("openpangu")
class OpenCVDynamicOpenPanguVideoBackend(OpenCVVideoBackend):
    """OpenCV-based video loader registered under the ``"openpangu"`` key.

    Frames are picked at evenly spaced timestamps between the first frame
    and a (possibly shortened) end timestamp, so the number of sampled
    frames adapts to the video duration and the requested sampling FPS.
    """

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = 32,
        fps: int = 1,
        max_duration: int = 300,
        frame_recovery: bool = False,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Load video frames with dynamic sampling based on duration.

        Timestamps are measured from frame 0. For a video with
        ``total_frames_num = 10`` and a native FPS of 1, frame 0 is at
        0.0 s, frame 1 at 1.0 s, and the last frame (frame 9) at 9.0 s,
        i.e. ``(total_frames_num - 1) / original_fps``.

        Args:
            data: Raw video bytes.
            num_frames: Upper bound on the number of frames to sample.
                When ``fps > 0`` it is lowered to
                ``int(duration * fps) + 1`` if the video is too short to
                yield that many frames at the requested rate.
            fps: Target sampling rate. ``-1`` disables the rate limit, so
                exactly ``num_frames`` timestamps are spread over the
                whole video.
            max_duration: Not read by this implementation.
            frame_recovery: Not read by this implementation; frames are
                always read through ``_read_frames_with_recovery``.
            **kwargs: Ignored.

        Returns:
            Tuple of ``(frames, metadata)`` where ``frames`` is whatever
            ``_read_frames_with_recovery`` produced and ``metadata``
            describes the source video and the frame indices actually
            read.

        Raises:
            ValueError: If the stream cannot be opened, or if ``fps`` is
                neither ``-1`` nor greater than 0.
        """
        import cv2

        backend = cls().get_cv2_video_api()
        cap = cv2.VideoCapture(BytesIO(data), backend, [])
        try:
            if not cap.isOpened():
                raise ValueError("Could not open video stream")

            total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            original_fps = float(cap.get(cv2.CAP_PROP_FPS))

            # Timestamp of the last frame. Frame 0 sits at t = 0, hence the
            # "- 1": N frames span only N - 1 frame intervals.
            if total_frames_num >= 1 and original_fps > 0:
                total_duration = (total_frames_num - 1) / original_fps
            else:
                total_duration = 0

            # `fps` is the requested sampling rate; -1 means "no rate limit".
            if fps > 0:
                # `num_frames` is only a maximum: never sample more frames
                # than the video can provide at the requested rate.
                max_frames_at_fps = int(total_duration * fps) + 1
                if num_frames >= max_frames_at_fps:
                    num_frames = max_frames_at_fps
                # If the frame budget is the limiting factor, shorten the
                # sampled span so consecutive samples stay 1 / fps apart.
                total_duration = min(total_duration, (num_frames - 1) / fps)
            elif fps != -1:
                raise ValueError(
                    f"requires dataset fps is -1 or greater than 0 but got {fps}"
                )

            sample_frame_timestamps = np.linspace(
                0, total_duration, num_frames, dtype=float
            )
            # Map each timestamp to the nearest frame index. The upper clamp
            # guards against rounding past the last frame; the lower clamp
            # avoids a negative index when the container reports a frame
            # count of zero (or less).
            last_frame_index = max(0, total_frames_num - 1)
            frames_indices = [
                max(0, min(last_frame_index, round(float(t) * original_fps)))
                for t in sample_frame_timestamps
            ]

            frames, valid_frame_indices, recovered_map = (
                cls._read_frames_with_recovery(
                    cap, frames_indices, total_frames_num
                )
            )
        finally:
            # Always free the decoder, including on the error paths above.
            cap.release()

        if recovered_map:
            logger.info(
                "Frame recovery: %d frames recovered using forward scan.",
                len(recovered_map),
            )

        metadata = {
            "total_num_frames": total_frames_num,
            "fps": original_fps,
            "duration": total_duration,
            "video_backend": "opencv_dynamic_openpangu",
            "frames_indices": valid_frame_indices,
            # Sampling already happened here; flag it in the metadata.
            "do_sample_frames": False,
        }
        return frames, metadata
|
||||
|
||||
Reference in New Issue
Block a user