ladybird/Libraries/LibWeb/MediaSourceExtensions/SourceBufferProcessor.cpp

613 lines
27 KiB
C++
Raw Permalink Normal View History

/*
* Copyright (c) 2026-present, the Ladybird developers.
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Format.h>
#include <AK/Math.h>
#include <AK/NonnullOwnPtr.h>
#include <LibMedia/DecoderError.h>
#include <LibMedia/ReadonlyBytesCursor.h>
#include <LibWeb/MediaSourceExtensions/ByteStreamParser.h>
#include <LibWeb/MediaSourceExtensions/SourceBufferProcessor.h>
#include <LibWeb/MediaSourceExtensions/TrackBuffer.h>
#include <LibWeb/MediaSourceExtensions/TrackBufferDemuxer.h>
namespace Web::MediaSourceExtensions {
SourceBufferProcessor::SourceBufferProcessor()
: m_cursor(adopt_ref(*new Media::ReadonlyBytesCursor({})))
{
}
SourceBufferProcessor::~SourceBufferProcessor() = default;
void SourceBufferProcessor::set_parser(NonnullOwnPtr<ByteStreamParser>&& parser)
{
m_parser = move(parser);
}
AppendMode SourceBufferProcessor::mode() const
{
return m_mode;
}
bool SourceBufferProcessor::is_parsing_media_segment() const
{
return m_append_state == AppendState::ParsingMediaSegment;
}
bool SourceBufferProcessor::generate_timestamps_flag() const
{
return m_generate_timestamps_flag;
}
AK::Duration SourceBufferProcessor::group_end_timestamp() const
{
return m_group_end_timestamp;
}
bool SourceBufferProcessor::is_buffer_full() const
{
return m_buffer_full_flag;
}
void SourceBufferProcessor::set_mode(AppendMode mode)
{
m_mode = mode;
}
void SourceBufferProcessor::set_generate_timestamps_flag(bool flag)
{
m_generate_timestamps_flag = flag;
}
void SourceBufferProcessor::set_group_start_timestamp(Optional<AK::Duration> timestamp)
{
m_group_start_timestamp = timestamp;
}
bool SourceBufferProcessor::first_initialization_segment_received_flag() const
{
return m_first_initialization_segment_received_flag;
}
void SourceBufferProcessor::set_first_initialization_segment_received_flag(bool flag)
{
m_first_initialization_segment_received_flag = flag;
}
void SourceBufferProcessor::set_pending_initialization_segment_for_change_type_flag(bool flag)
{
m_pending_initialization_segment_for_change_type_flag = flag;
}
void SourceBufferProcessor::set_duration_change_callback(DurationChangeCallback callback)
{
m_duration_change_callback = move(callback);
}
void SourceBufferProcessor::set_first_initialization_segment_callback(InitializationSegmentCallback callback)
{
m_first_initialization_segment_callback = move(callback);
}
void SourceBufferProcessor::set_append_error_callback(AppendErrorCallback callback)
{
m_append_error_callback = move(callback);
}
void SourceBufferProcessor::set_coded_frame_processing_done_callback(CodedFrameProcessingDoneCallback callback)
{
m_coded_frame_processing_done_callback = move(callback);
}
void SourceBufferProcessor::set_append_done_callback(AppendDoneCallback callback)
{
m_append_done_callback = move(callback);
}
void SourceBufferProcessor::append_to_input_buffer(ReadonlyBytes bytes)
{
m_input_buffer.append(bytes);
m_cursor->set_data(m_input_buffer.bytes());
}
// https://w3c.github.io/media-source/#sourcebuffer-segment-parser-loop
void SourceBufferProcessor::run_segment_parser_loop()
{
VERIFY(m_parser);
while (true) {
// 1. Loop Top: If the [[input buffer]] is empty, then jump to the need more data step below.
if (m_cursor->position() >= m_cursor->size())
goto need_more_data;
// 2. If the [[input buffer]] contains bytes that violate the SourceBuffer byte stream format specification,
// then run the append error algorithm and abort this algorithm.
// AD-HOC: We'll react to this below when actually parsing the segments.
// 3. Remove any bytes that the byte stream format specifications say MUST be ignored from the start of
// the [[input buffer]].
{
auto skip_result = m_parser->skip_ignored_bytes(*m_cursor);
if (skip_result.is_error()) {
if (skip_result.error().category() == Media::DecoderErrorCategory::EndOfStream)
goto need_more_data;
m_append_error_callback();
return;
}
drop_consumed_bytes_from_input_buffer();
}
// 4. If the [[append state]] equals WAITING_FOR_SEGMENT, then run the following steps:
if (m_append_state == AppendState::WaitingForSegment) {
auto sniff_result = m_parser->sniff_segment_type(*m_cursor);
if (sniff_result.is_error()) {
m_append_error_callback();
return;
}
auto segment_type = sniff_result.value();
// 1. If the beginning of the [[input buffer]] indicates the start of an initialization segment, set the
// [[append state]] to PARSING_INIT_SEGMENT.
if (segment_type == SegmentType::InitializationSegment) {
m_append_state = AppendState::ParsingInitSegment;
// 2. If the beginning of the [[input buffer]] indicates the start of a media segment, set [[append
// state]] to PARSING_MEDIA_SEGMENT.
} else if (segment_type == SegmentType::MediaSegment) {
m_append_state = AppendState::ParsingMediaSegment;
} else if (segment_type == SegmentType::Incomplete) {
// NB: If we cannot determine the type due to an incomplete segment, this is equivalent to if we were
// parsing an initialization segment and didn't have enough data, which would result in jumping to
// the need more data step.
goto need_more_data;
} else {
VERIFY(segment_type == SegmentType::Unknown);
m_append_error_callback();
return;
}
// 3. Jump to the loop top step above.
continue;
}
// 5. If the [[append state]] equals PARSING_INIT_SEGMENT, then run the following steps:
if (m_append_state == AppendState::ParsingInitSegment) {
// 1. If the [[input buffer]] does not contain a complete initialization segment yet, then jump to the need
// more data step below.
auto parse_result = m_parser->parse_initialization_segment(*m_cursor);
if (parse_result.is_error()) {
if (parse_result.error().category() == Media::DecoderErrorCategory::EndOfStream)
goto need_more_data;
// AD-HOC: Handle bytes that violate the byte stream format specification as specified above.
m_append_error_callback();
return;
}
// 2. Run the initialization segment received algorithm.
initialization_segment_received();
// 3. Remove the initialization segment bytes from the beginning of the [[input buffer]].
drop_consumed_bytes_from_input_buffer();
// 4. Set [[append state]] to WAITING_FOR_SEGMENT.
m_append_state = AppendState::WaitingForSegment;
// 5. Jump to the loop top step above.
continue;
}
// 6. If the [[append state]] equals PARSING_MEDIA_SEGMENT, then run the following steps:
if (m_append_state == AppendState::ParsingMediaSegment) {
// 1. If the [[first initialization segment received flag]] is false or the [[pending initialization
// segment for changeType flag]] is true, then run the append error algorithm and abort this algorithm.
if (!m_first_initialization_segment_received_flag || m_pending_initialization_segment_for_change_type_flag) {
m_append_error_callback();
return;
}
{
// 2. If the [[input buffer]] contains one or more complete coded frames, then run the coded frame
// processing algorithm.
auto parse_result = m_parser->parse_media_segment(*m_cursor);
if (parse_result.is_error()) {
// AD-HOC: Handle bytes that violate the byte stream format specification as specified above.
m_append_error_callback();
return;
}
run_coded_frame_processing(parse_result.value().coded_frames);
// FIXME: 3. If this SourceBuffer is full and cannot accept more media data, then set the [[buffer full flag]]
// to true.
// 4. If the [[input buffer]] does not contain a complete media segment, then jump to the need more
// data step below.
if (!parse_result.value().completed_segment)
goto need_more_data;
}
// 5. Remove the media segment bytes from the beginning of the [[input buffer]].
drop_consumed_bytes_from_input_buffer();
// 6. Set [[append state]] to WAITING_FOR_SEGMENT.
m_append_state = AppendState::WaitingForSegment;
// 7. Jump to the loop top step above.
continue;
}
// 7. Need more data: Return control to the calling algorithm.
need_more_data:
drop_consumed_bytes_from_input_buffer();
m_append_done_callback();
return;
}
}
// https://w3c.github.io/media-source/#sourcebuffer-reset-parser-state
void SourceBufferProcessor::reset_parser_state()
{
// 1. If the [[append state]] equals PARSING_MEDIA_SEGMENT and the [[input buffer]] contains some
// complete coded frames, then run the coded frame processing algorithm until all of these
// complete coded frames have been processed.
if (m_append_state == AppendState::ParsingMediaSegment) {
// FIXME: Process any complete coded frames
}
// 2. Unset the last decode timestamp on all track buffers.
// 3. Unset the last frame duration on all track buffers.
// 4. Unset the highest end timestamp on all track buffers.
unset_all_track_buffer_timestamps();
// 5. Set the need random access point flag on all track buffers to true.
set_need_random_access_point_flag_on_all_track_buffers(true);
// 6. If the mode attribute equals "sequence", then
if (m_mode == AppendMode::Sequence) {
// set the [[group start timestamp]] to the [[group end timestamp]]
m_group_start_timestamp = m_group_end_timestamp;
}
// 7. Remove all bytes from the [[input buffer]].
m_input_buffer.clear();
m_cursor->set_data({});
MUST(m_cursor->seek(0, SeekMode::SetPosition));
// 8. Set [[append state]] to WAITING_FOR_SEGMENT.
m_append_state = AppendState::WaitingForSegment;
}
// https://w3c.github.io/media-source/#sourcebuffer-init-segment-received
void SourceBufferProcessor::initialization_segment_received()
{
// 1. Update the duration attribute if it currently equals NaN:
// AD-HOC: Pass off the duration to the callback, and allow it to check for NaN.
{
// If the initialization segment contains a duration:
if (m_parser->duration().has_value()) {
// Run the duration change algorithm with new duration set to the duration in the initialization segment.
m_duration_change_callback(m_parser->duration().value().to_seconds_f64());
}
// Otherwise:
else {
// Run the duration change algorithm with new duration set to positive Infinity.
m_duration_change_callback(AK::Infinity<double>);
}
}
// 2. If the initialization segment has no audio, video, or text tracks, then run the append error algorithm
// and abort these steps.
if (m_parser->video_tracks().is_empty() && m_parser->audio_tracks().is_empty() && m_parser->text_tracks().is_empty()) {
m_append_error_callback();
return;
}
// 3. If the [[first initialization segment received flag]] is true, then run the following steps:
if (m_first_initialization_segment_received_flag) {
// FIXME: 1. Verify the following properties. If any of the checks fail then run the append error algorithm
// and abort these steps.
// - The number of audio, video, and text tracks match what was in the first initialization segment.
// - If more than one track for a single type are present (e.g., 2 audio tracks), then the Track IDs
// match the ones in the first initialization segment.
// - The codecs for each track are supported by the user agent.
// FIXME: 2. Add the appropriate track descriptions from this initialization segment to each of the track buffers.
// 3. Set the need random access point flag on all track buffers to true.
set_need_random_access_point_flag_on_all_track_buffers(true);
}
// 4. Let active track flag equal false.
// NB: active track flag is never true unless [[first initialization segment received flag]] is true, and it is
// used only by the synchronous code, so we handle this in the callback invoked below.
// 5. If the [[first initialization segment received flag]] is false, then run the following steps:
if (!m_first_initialization_segment_received_flag) {
// FIXME: 1. If the initialization segment contains tracks with codecs the user agent does not support,
// then run the append error algorithm and abort these steps.
auto build_tracks = [&](Vector<Media::Track> const& tracks) {
Vector<InitializationSegmentTrack> result;
// 2. For each audio track in the initialization segment, run following steps:
// 3. For each video track in the initialization segment, run following steps:
// 4. For each text track in the initialization segment, run following steps:
for (auto const& track : tracks) {
// AD-HOC: Steps 1-6 are handled in the callback invoked below.
// 7. Create a new track buffer to store coded frames for this track.
// 8. Add the track description for this track to the track buffer.
auto codec_id = m_parser->codec_id_for_track(track.identifier());
auto codec_init_data = MUST(ByteBuffer::copy(m_parser->codec_initialization_data_for_track(track.identifier())));
auto demuxer = make_ref_counted<TrackBufferDemuxer>(track, codec_id, move(codec_init_data));
auto track_buffer = make<TrackBuffer>(demuxer);
m_track_buffers.set(track.identifier(), move(track_buffer));
// AD-HOC: Pass off the track information to the callback so that it can initialize the DOM objects.
result.append({ .track = track, .demuxer = demuxer });
}
return result;
};
m_first_initialization_segment_callback({
.audio_tracks = build_tracks(m_parser->audio_tracks()),
.video_tracks = build_tracks(m_parser->video_tracks()),
.text_tracks = build_tracks(m_parser->text_tracks()),
});
// 6. Set [[first initialization segment received flag]] to true.
m_first_initialization_segment_received_flag = true;
}
// 6. Set [[pending initialization segment for changeType flag]] to false.
m_pending_initialization_segment_for_change_type_flag = false;
// 7. If the active track flag equals true, then run the following steps:
// NB: Steps 8-9 (updating the element's readyState) are handled by the initialization segment callback invoked
// above. Since active track flag is only true if the first initialization segment was being received, this
// will only need to happen when that callback is invoked, so we don't need separate one.
}
// https://w3c.github.io/media-source/#sourcebuffer-coded-frame-processing
void SourceBufferProcessor::run_coded_frame_processing(Vector<DemuxedCodedFrame>& coded_frames)
{
// 1. For each coded frame in the media segment run the following steps:
for (auto& demuxed_frame : coded_frames) {
auto& frame = demuxed_frame.coded_frame;
// 1. Loop Top:
loop_top:
// FIXME: If generate timestamps flag equals true:
// 1. Let presentation timestamp equal 0.
// 2. Let decode timestamp equal 0.
// Otherwise:
// 1. Let presentation timestamp be a double precision floating point representation
// of the coded frame's presentation timestamp in seconds.
// 2. Let decode timestamp be a double precision floating point representation
// of the coded frame's decode timestamp in seconds.
auto presentation_timestamp = frame.timestamp();
// FIXME: For VP9, decode timestamp equals presentation timestamp. This will need to differ when H.264 is
// supported by MSE.
auto decode_timestamp = frame.timestamp();
// 2. Let frame duration be a double precision floating point representation of the coded
// frame's duration in seconds.
auto frame_duration = frame.duration();
// FIXME: 3. If mode equals "sequence" and group start timestamp is set, then run the following steps:
// FIXME: 4. If timestampOffset is not 0, then run the following steps:
// 5. Let track buffer equal the track buffer that the coded frame will be added to.
auto maybe_track_buffer = m_track_buffers.get(demuxed_frame.track_number);
// AD-HOC: If we're passed a media segment containing coded frames from a track we don't know about, don't
// crash on it.
if (!maybe_track_buffer.has_value())
continue;
auto& track_buffer = *maybe_track_buffer.release_value();
auto& demuxer = track_buffer.demuxer();
auto last_decode_timestamp = track_buffer.last_decode_timestamp();
auto last_frame_duration = track_buffer.last_frame_duration();
// 6.
if (
// -> If last decode timestamp for track buffer is set
(last_decode_timestamp.has_value()
// and decode timestamp is less than last decode timestamp:
&& decode_timestamp < last_decode_timestamp.value())
// OR
||
// -> If last decode timestamp for track buffer is set
(last_decode_timestamp.has_value()
// and the difference between decode timestamp and last decode timestamp is greater than 2 times last frame duration:
&& decode_timestamp - last_decode_timestamp.value() > (last_frame_duration.value() + last_frame_duration.value()))) {
// 1. -> If mode equals "segments":
if (m_mode == AppendMode::Segments) {
// Set [[group end timestamp]] to presentation timestamp.
m_group_end_timestamp = presentation_timestamp;
}
// -> If mode equals "sequence":
if (m_mode == AppendMode::Sequence) {
// -> Set [[group start timestamp]] equal to the [[group end timestamp]].
m_group_start_timestamp = m_group_end_timestamp;
}
for (auto& [id, track_buffer] : m_track_buffers) {
// 2. Unset the last decode timestamp on all track buffers.
track_buffer->unset_last_decode_timestamp();
// 3. Unset the last frame duration on all track buffers.
track_buffer->unset_last_frame_duration();
// 4. Unset the highest end timestamp on all track buffers.
track_buffer->unset_highest_end_timestamp();
// 5. Set the need random access point flag on all track buffers to true.
track_buffer->set_need_random_access_point_flag(true);
}
// 6. Jump to the Loop Top step above to restart processing of the current coded frame.
goto loop_top;
}
// 7. Let frame end timestamp equal the sum of presentation timestamp and frame duration.
auto frame_end_timestamp = presentation_timestamp + frame_duration;
// FIXME: 8. If presentation timestamp is less than appendWindowStart, then set the need random access
// point flag to true, drop the coded frame, and jump to the top of the loop.
// FIXME: 9. If frame end timestamp is greater than appendWindowEnd, then set the need random access
// point flag to true, drop the coded frame, and jump to the top of the loop.
// 10. If the need random access point flag on track buffer equals true, then run the following steps:
if (track_buffer.need_random_access_point_flag()) {
// 1. If the coded frame is not a random access point, then drop the coded frame and jump to
// the top of the loop.
if (!frame.is_keyframe())
continue;
// 2. Set the need random access point flag on track buffer to false.
track_buffer.set_need_random_access_point_flag(false);
}
// FIXME: 11. Let spliced audio frame be an unset variable for holding audio splice information
// FIXME: 12. Let spliced timed text frame be an unset variable for holding timed text splice information
// FIXME: 13. If last decode timestamp for track buffer is unset and presentation timestamp falls within
// the presentation interval of a coded frame in track buffer, then run the following steps:
// 14. Remove all coded frames from track buffer that have a presentation timestamp greater than
// or equal to presentation timestamp and less than frame end timestamp.
// 15. Remove all possible decoding dependencies on the coded frames removed in the previous step
// by removing all coded frames from track buffer between those frames removed in the previous
// step and the next random access point after those removed frames.
demuxer.remove_coded_frames_and_dependants_in_range(presentation_timestamp, frame_end_timestamp);
// 16. If spliced audio frame is set:
// Add spliced audio frame to the track buffer.
// If spliced timed text frame is set:
// Add spliced timed text frame to the track buffer.
// Otherwise:
// Add the coded frame with the presentation timestamp, decode timestamp, and frame
// duration to the track buffer.
demuxer.add_coded_frame(move(frame));
// 17. Set last decode timestamp for track buffer to decode timestamp.
track_buffer.set_last_decode_timestamp(decode_timestamp);
// 18. Set last frame duration for track buffer to frame duration.
track_buffer.set_last_frame_duration(frame_duration);
// 19. If highest end timestamp for track buffer is unset or frame end timestamp is greater
// than highest end timestamp, then
if (!track_buffer.highest_end_timestamp().has_value()
|| frame_end_timestamp > track_buffer.highest_end_timestamp().value()) {
// set highest end timestamp for track buffer to frame end timestamp.
track_buffer.set_highest_end_timestamp(frame_end_timestamp);
}
// 20. If frame end timestamp is greater than group end timestamp, then set group end timestamp
// equal to frame end timestamp.
if (frame_end_timestamp > m_group_end_timestamp)
m_group_end_timestamp = frame_end_timestamp;
// FIXME: 21. If generate timestamps flag equals true, then set timestampOffset equal to
// frame end timestamp.
}
// AD-HOC: Steps 2-5 are handled by the callback, as they mutate the DOM.
m_coded_frame_processing_done_callback();
}
// https://w3c.github.io/media-source/#sourcebuffer-coded-frame-eviction
void SourceBufferProcessor::run_coded_frame_eviction()
{
// FIXME: 1. Let new data equal the data that is about to be appended to this SourceBuffer.
// 2. If the [[buffer full flag]] equals false, then abort these steps.
// 3. Let removal ranges equal a list of presentation time ranges that can be evicted from the presentation
// to make room for the new data.
// 4. For each range in removal ranges, run the coded frame removal algorithm with start and end equal to
// the removal range start and end timestamp respectively.
}
void SourceBufferProcessor::drop_consumed_bytes_from_input_buffer()
{
auto consumed = m_cursor->position();
if (consumed == 0)
return;
VERIFY(consumed <= m_input_buffer.size());
auto remaining_bytes = m_input_buffer.bytes().slice(consumed);
AK::TypedTransfer<u8>::move(m_input_buffer.data(), remaining_bytes.data(), remaining_bytes.size());
m_input_buffer.trim(remaining_bytes.size(), false);
m_cursor->set_data(m_input_buffer.bytes());
MUST(m_cursor->seek(0, SeekMode::SetPosition));
}
void SourceBufferProcessor::unset_all_track_buffer_timestamps()
{
for (auto& [track_id, track_buffer] : m_track_buffers) {
track_buffer->unset_last_decode_timestamp();
track_buffer->unset_last_frame_duration();
track_buffer->unset_highest_end_timestamp();
}
}
void SourceBufferProcessor::set_need_random_access_point_flag_on_all_track_buffers(bool flag)
{
for (auto& [track_id, track_buffer] : m_track_buffers) {
track_buffer->set_need_random_access_point_flag(flag);
}
}
void SourceBufferProcessor::set_reached_end_of_stream()
{
for (auto& [track_id, track_buffer] : m_track_buffers)
track_buffer->demuxer().set_reached_end_of_stream();
}
void SourceBufferProcessor::clear_reached_end_of_stream()
{
for (auto& [track_id, track_buffer] : m_track_buffers)
track_buffer->demuxer().clear_reached_end_of_stream();
}
// https://w3c.github.io/media-source/#dom-sourcebuffer-buffered
Media::TimeRanges SourceBufferProcessor::buffered_ranges() const
{
// 2. Let highest end time be the largest track buffer ranges end time across all the track buffers
// managed by this SourceBuffer object.
AK::Duration highest_end_time;
for (auto const& [track_id, track_buffer] : m_track_buffers) {
auto end_time = track_buffer->demuxer().track_buffer_ranges().highest_end_time();
highest_end_time = max(highest_end_time, end_time);
}
// 3. Let intersection ranges equal a TimeRanges object containing a single range from 0 to highest end time.
Media::TimeRanges intersection;
if (highest_end_time > AK::Duration::zero())
intersection.add_range(AK::Duration::zero(), highest_end_time);
// 4. For each audio and video track buffer managed by this SourceBuffer, run the following steps:
for (auto const& [track_id, track_buffer] : m_track_buffers) {
// 1. Let track ranges equal the track buffer ranges for the current track buffer.
auto track_ranges = track_buffer->demuxer().track_buffer_ranges();
// 2. If readyState is "ended", then set the end time on the last range in track ranges to
// highest end time.
// FIXME: Check readyState from the parent MediaSource.
// 3. Let new intersection ranges equal the intersection between the intersection ranges and
// the track ranges.
// 4. Replace the ranges in intersection ranges with the new intersection ranges.
intersection = intersection.intersection(track_ranges);
}
return intersection;
}
}