Full Theora video support in VideoStreamPlayer

- Implement `set_stream_position` and `get_stream_length`.
- Don't show a blank frame when stopping the video (smooth loops).
- Fix audio for videos with up to 8 channels.
- Improve internal audio handling.
2025-10-19 16:03:29 +00:00 · 2025-01-28 17:39:46 +01:00 · 2025-01-28 17:39:46 +01:00 · b9bebf7081
commit b9bebf7081
parent 4a44078451
6 changed files with 541 additions and 218 deletions
--- a/doc/classes/VideoStreamPlayer.xml
+++ b/doc/classes/VideoStreamPlayer.xml
@ -16,7 +16,6 @@
 			<return type="float" />
 			<description>
 				The length of the current stream, in seconds.
-				[b]Note:[/b] For [VideoStreamTheora] streams (the built-in format supported by Godot), this value will always be zero, as getting the stream length is not implemented yet. The feature may be supported by video formats implemented by a GDExtension add-on.
 			</description>
 		</method>
 		<method name="get_stream_name" qualifiers="const">
@ -79,7 +78,6 @@
 		</member>
 		<member name="stream_position" type="float" setter="set_stream_position" getter="get_stream_position">
 			The current position of the stream, in seconds.
-			[b]Note:[/b] Changing this value won't have any effect as seeking is not implemented yet, except in video formats implemented by a GDExtension add-on.
 		</member>
 		<member name="volume" type="float" setter="set_volume" getter="get_volume">
 			Audio volume as a linear value.
--- a/modules/theora/video_stream_theora.cpp
+++ b/modules/theora/video_stream_theora.cpp
@ -41,17 +41,15 @@ int VideoStreamPlaybackTheora::buffer_data() {

 	uint64_t bytes = file->get_buffer((uint8_t *)buffer, 4096);
 	ogg_sync_wrote(&oy, bytes);
-	return (bytes);
+	return bytes;
 }

 int VideoStreamPlaybackTheora::queue_page(ogg_page *page) {
-	if (theora_p) {
-		ogg_stream_pagein(&to, page);
-		if (to.e_o_s) {
-			theora_eos = true;
-		}
+	ogg_stream_pagein(&to, page);
+	if (to.e_o_s) {
+		theora_eos = true;
 	}
-	if (vorbis_p) {
+	if (has_audio) {
 		ogg_stream_pagein(&vo, page);
 		if (vo.e_o_s) {
 			vorbis_eos = true;
@ -60,6 +58,179 @@ int VideoStreamPlaybackTheora::queue_page(ogg_page *page) {
 	return 0;
 }

+int VideoStreamPlaybackTheora::read_page(ogg_page *page) {
+	int ret = 0;
+
+	while (ret <= 0) {
+		ret = ogg_sync_pageout(&oy, page);
+		if (ret <= 0) {
+			int bytes = buffer_data();
+			if (bytes == 0) {
+				return 0;
+			}
+		}
+	}
+
+	return ret;
+}
+
+double VideoStreamPlaybackTheora::get_page_time(ogg_page *page) {
+	uint64_t granulepos = ogg_page_granulepos(page);
+	int page_serialno = ogg_page_serialno(page);
+	double page_time = -1;
+
+	if (page_serialno == to.serialno) {
+		page_time = th_granule_time(td, granulepos);
+	}
+	if (has_audio && page_serialno == vo.serialno) {
+		page_time = vorbis_granule_time(&vd, granulepos);
+	}
+
+	return page_time;
+}
+
+// Read one buffer worth of pages and feed them to the streams.
+int VideoStreamPlaybackTheora::feed_pages() {
+	int pages = 0;
+	ogg_page og;
+
+	while (pages == 0) {
+		while (ogg_sync_pageout(&oy, &og) > 0) {
+			queue_page(&og);
+			pages++;
+		}
+		if (pages == 0) {
+			int bytes = buffer_data();
+			if (bytes == 0) {
+				break;
+			}
+		}
+	}
+
+	return pages;
+}
+
+// Seek the video and audio streams simultaneously to find the granulepos where we should start decoding.
+// It will return the position where we should start reading pages, and the video and audio granulepos.
+int64_t VideoStreamPlaybackTheora::seek_streams(double p_time, int64_t &cur_video_granulepos, int64_t &cur_audio_granulepos) {
+	// Backtracking less than this is probably a waste of time.
+	const int64_t min_seek = 512 * 1024;
+	int64_t target_video_granulepos;
+	int64_t target_audio_granulepos;
+	double target_time = 0;
+	int64_t seek_pos;
+
+	// Make a guess where we should start reading in the file, and scan from there.
+	// We base the guess on the mean bitrate of the streams. It would be theoretically faster to use the bisect method but
+	// in practice there's a lot of linear scanning to do to find the right pages.
+	// We want to catch the previous keyframe to the seek time. Since we only know the max GOP, we use that.
+	if (p_time == -1) { // This is a special case to find the last packets and calculate the video length.
+		seek_pos = MAX(stream_data_size - min_seek, stream_data_offset);
+		target_video_granulepos = INT64_MAX;
+		target_audio_granulepos = INT64_MAX;
+	} else {
+		int64_t video_frame = (int64_t)(p_time / frame_duration);
+		target_video_granulepos = MAX(1LL, video_frame - (1LL << ti.keyframe_granule_shift)) << ti.keyframe_granule_shift;
+		target_audio_granulepos = 0;
+		seek_pos = MAX(((target_video_granulepos >> ti.keyframe_granule_shift) - 1) * frame_duration * stream_data_size / stream_length, stream_data_offset);
+		target_time = th_granule_time(td, target_video_granulepos);
+		if (has_audio) {
+			target_audio_granulepos = video_frame * frame_duration * vi.rate;
+			target_time = MIN(target_time, vorbis_granule_time(&vd, target_audio_granulepos));
+		}
+	}
+
+	int64_t video_seek_pos = seek_pos;
+	int64_t audio_seek_pos = seek_pos;
+	double backtrack_time = 0;
+	bool video_catch = false;
+	bool audio_catch = false;
+	int64_t last_video_granule_seek_pos = seek_pos;
+	int64_t last_audio_granule_seek_pos = seek_pos;
+
+	cur_video_granulepos = -1;
+	cur_audio_granulepos = -1;
+
+	while (!video_catch || (has_audio && !audio_catch)) { // Backtracking loop
+		if (seek_pos < stream_data_offset) {
+			seek_pos = stream_data_offset;
+		}
+		file->seek(seek_pos);
+		ogg_sync_reset(&oy);
+
+		backtrack_time = 0;
+		last_video_granule_seek_pos = seek_pos;
+		last_audio_granule_seek_pos = seek_pos;
+		while (!video_catch || (has_audio && !audio_catch)) { // Page scanning loop
+			ogg_page page;
+			uint64_t last_seek_pos = file->get_position() - oy.fill + oy.returned;
+			int ret = read_page(&page);
+			if (ret <= 0) { // End of file.
+				if (seek_pos < stream_data_offset) { // We've already searched the whole file
+					return -1;
+				}
+				seek_pos -= min_seek;
+				break;
+			}
+			int64_t cur_granulepos = ogg_page_granulepos(&page);
+			if (cur_granulepos >= 0) {
+				int page_serialno = ogg_page_serialno(&page);
+				if (!video_catch && page_serialno == to.serialno) {
+					if (cur_granulepos >= target_video_granulepos) {
+						video_catch = true;
+						if (cur_video_granulepos < 0) {
+							// Adding 1s helps catching the start of the page and avoids backtrack_time = 0.
+							backtrack_time = MAX(backtrack_time, 1 + th_granule_time(td, cur_granulepos) - target_time);
+						}
+					} else {
+						video_seek_pos = last_video_granule_seek_pos;
+						cur_video_granulepos = cur_granulepos;
+					}
+					last_video_granule_seek_pos = last_seek_pos;
+				}
+				if ((has_audio && !audio_catch) && page_serialno == vo.serialno) {
+					if (cur_granulepos >= target_audio_granulepos) {
+						audio_catch = true;
+						if (cur_audio_granulepos < 0) {
+							// Adding 1s helps catching the start of the page and avoids backtrack_time = 0.
+							backtrack_time = MAX(backtrack_time, 1 + vorbis_granule_time(&vd, cur_granulepos) - target_time);
+						}
+					} else {
+						audio_seek_pos = last_audio_granule_seek_pos;
+						cur_audio_granulepos = cur_granulepos;
+					}
+					last_audio_granule_seek_pos = last_seek_pos;
+				}
+			}
+		}
+		if (backtrack_time > 0) {
+			if (seek_pos <= stream_data_offset) {
+				break;
+			}
+			int64_t delta_seek = MAX(backtrack_time * stream_data_size / stream_length, min_seek);
+			seek_pos -= delta_seek;
+		}
+		video_catch = cur_video_granulepos != -1;
+		audio_catch = cur_audio_granulepos != -1;
+	}
+
+	if (cur_video_granulepos < (1LL << ti.keyframe_granule_shift)) {
+		video_seek_pos = stream_data_offset;
+		cur_video_granulepos = 1LL << ti.keyframe_granule_shift;
+	}
+	if (has_audio) {
+		if (cur_audio_granulepos == -1) {
+			audio_seek_pos = stream_data_offset;
+			cur_audio_granulepos = 0;
+		}
+		seek_pos = MIN(video_seek_pos, audio_seek_pos);
+	} else {
+		seek_pos = video_seek_pos;
+	}
+
+	return seek_pos;
+}
+
 void VideoStreamPlaybackTheora::video_write(th_ycbcr_buffer yuv) {
 	uint8_t *w = frame_data.ptrw();
 	char *dst = (char *)w;
@ -77,83 +248,53 @@ void VideoStreamPlaybackTheora::video_write(th_ycbcr_buffer yuv) {
 	Ref<Image> img;
 	img.instantiate(region.size.x, region.size.y, false, Image::FORMAT_RGBA8, frame_data); //zero copy image creation

-	texture->update(img); //zero copy send to rendering server
+	texture->update(img); // Zero-copy send to rendering server.
 }

 void VideoStreamPlaybackTheora::clear() {
-	if (file.is_null()) {
-		return;
+	if (!file.is_null()) {
+		file.unref();
 	}
-
-	if (vorbis_p) {
-		ogg_stream_clear(&vo);
-		if (vorbis_p >= 3) {
-			vorbis_block_clear(&vb);
-			vorbis_dsp_clear(&vd);
-		}
+	if (has_audio) {
+		vorbis_block_clear(&vb);
+		vorbis_dsp_clear(&vd);
 		vorbis_comment_clear(&vc);
 		vorbis_info_clear(&vi);
-		vorbis_p = 0;
+		ogg_stream_clear(&vo);
+		if (audio_buffer_size) {
+			memdelete_arr(audio_buffer);
+		}
 	}
-	if (theora_p) {
-		ogg_stream_clear(&to);
+	if (has_video) {
 		th_decode_free(td);
 		th_comment_clear(&tc);
 		th_info_clear(&ti);
-		theora_p = 0;
+		ogg_stream_clear(&to);
+		ogg_sync_clear(&oy);
 	}
-	ogg_sync_clear(&oy);

-	theora_p = 0;
-	vorbis_p = 0;
-	next_frame_time = 0;
-	current_frame_time = 0;
+	audio_buffer = nullptr;
+	playing = false;
+	has_video = false;
+	has_audio = false;
 	theora_eos = false;
 	vorbis_eos = false;
-	video_ready = false;
-	video_done = false;
-	audio_done = false;
-
-	file.unref();
-	playing = false;
 }

-void VideoStreamPlaybackTheora::set_file(const String &p_file) {
-	ERR_FAIL_COND(playing);
+void VideoStreamPlaybackTheora::find_streams(th_setup_info *&ts) {
+	ogg_stream_state test;
 	ogg_packet op;
-	th_setup_info *ts = nullptr;
-
-	file_name = p_file;
-	file = FileAccess::open(p_file, FileAccess::READ);
-	ERR_FAIL_COND_MSG(file.is_null(), "Cannot open file '" + p_file + "'.");
-
-	ogg_sync_init(&oy);
-
-	/* init supporting Vorbis structures needed in header parsing */
-	vorbis_info_init(&vi);
-	vorbis_comment_init(&vc);
-
-	/* init supporting Theora structures needed in header parsing */
-	th_comment_init(&tc);
-	th_info_init(&ti);
-
-	theora_eos = false;
-	vorbis_eos = false;
-
-	/* Ogg file open; parse the headers */
-	/* Only interested in Vorbis/Theora streams */
+	ogg_page og;
 	int stateflag = 0;
-
 	int audio_track_skip = audio_track;

+	/* Only interested in Vorbis/Theora streams */
 	while (!stateflag) {
 		int ret = buffer_data();
-		if (ret == 0) {
+		if (!ret) {
 			break;
 		}
 		while (ogg_sync_pageout(&oy, &og) > 0) {
-			ogg_stream_state test;
-
 			/* is this a mandated initial header? If not, stop parsing */
 			if (!ogg_page_bos(&og)) {
 				/* don't leak the page; get it into the appropriate stream */
@ -167,11 +308,11 @@ void VideoStreamPlaybackTheora::set_file(const String &p_file) {
 			ogg_stream_packetout(&test, &op);

 			/* identify the codec: try theora */
-			if (!theora_p && th_decode_headerin(&ti, &tc, &ts, &op) >= 0) {
+			if (!has_video && th_decode_headerin(&ti, &tc, &ts, &op) >= 0) {
 				/* it is theora */
 				memcpy(&to, &test, sizeof(test));
-				theora_p = 1;
-			} else if (!vorbis_p && vorbis_synthesis_headerin(&vi, &vc, &op) >= 0) {
+				has_video = true;
+			} else if (!has_audio && vorbis_synthesis_headerin(&vi, &vc, &op) >= 0) {
 				/* it is vorbis */
 				if (audio_track_skip) {
 					vorbis_info_clear(&vi);
@ -179,141 +320,165 @@ void VideoStreamPlaybackTheora::set_file(const String &p_file) {
 					ogg_stream_clear(&test);
 					vorbis_info_init(&vi);
 					vorbis_comment_init(&vc);
-
 					audio_track_skip--;
 				} else {
 					memcpy(&vo, &test, sizeof(test));
-					vorbis_p = 1;
+					has_audio = true;
 				}
 			} else {
 				/* whatever it is, we don't care about it */
 				ogg_stream_clear(&test);
 			}
 		}
-		/* fall through to non-bos page parsing */
 	}
+}
+
+void VideoStreamPlaybackTheora::read_headers(th_setup_info *&ts) {
+	ogg_packet op;
+	int theora_header_packets = 1;
+	int vorbis_header_packets = 1;

 	/* we're expecting more header packets. */
-	while ((theora_p && theora_p < 3) || (vorbis_p && vorbis_p < 3)) {
-		int ret = 0;
-
+	while (theora_header_packets < 3 || (has_audio && vorbis_header_packets < 3)) {
 		/* look for further theora headers */
-		if (theora_p && theora_p < 3) {
-			ret = ogg_stream_packetout(&to, &op);
-		}
-		while (theora_p && theora_p < 3 && ret) {
-			if (ret < 0) {
-				fprintf(stderr, "Error parsing Theora stream headers; corrupt stream?\n");
-				clear();
-				return;
+		// The API says there can be more than three but only three are mandatory.
+		while (theora_header_packets < 3 && ogg_stream_packetout(&to, &op) > 0) {
+			if (th_decode_headerin(&ti, &tc, &ts, &op) > 0) {
+				theora_header_packets++;
 			}
-			if (!th_decode_headerin(&ti, &tc, &ts, &op)) {
-				fprintf(stderr, "Error parsing Theora stream headers; corrupt stream?\n");
-				clear();
-				return;
-			}
-			ret = ogg_stream_packetout(&to, &op);
-			theora_p++;
 		}

 		/* look for more vorbis header packets */
-		if (vorbis_p && vorbis_p < 3) {
-			ret = ogg_stream_packetout(&vo, &op);
+		while (has_audio && vorbis_header_packets < 3 && ogg_stream_packetout(&vo, &op) > 0) {
+			if (!vorbis_synthesis_headerin(&vi, &vc, &op)) {
+				vorbis_header_packets++;
+			}
 		}
-		while (vorbis_p && vorbis_p < 3 && ret) {
-			if (ret < 0) {
-				fprintf(stderr, "Error parsing Vorbis stream headers; corrupt stream?\n");
-				clear();
-				return;
-			}
-			ret = vorbis_synthesis_headerin(&vi, &vc, &op);
-			if (ret) {
-				fprintf(stderr, "Error parsing Vorbis stream headers; corrupt stream?\n");
-				clear();
-				return;
-			}
-			vorbis_p++;
-			if (vorbis_p == 3) {
+
+		/* The header pages/packets will arrive before anything else we care about, or the stream is not obeying spec */
+		if (theora_header_packets < 3 || (has_audio && vorbis_header_packets < 3)) {
+			ogg_page page;
+			if (read_page(&page)) {
+				queue_page(&page);
+			} else {
+				fprintf(stderr, "End of file while searching for codec headers.\n");
 				break;
 			}
-			ret = ogg_stream_packetout(&vo, &op);
 		}
+	}

-		/* The header pages/packets will arrive before anything else we
-		care about, or the stream is not obeying spec */
+	has_video = theora_header_packets == 3;
+	has_audio = vorbis_header_packets == 3;
+}

-		if (ogg_sync_pageout(&oy, &og) > 0) {
-			queue_page(&og); /* demux into the appropriate stream */
-		} else {
-			int ret2 = buffer_data(); /* someone needs more data */
-			if (ret2 == 0) {
-				fprintf(stderr, "End of file while searching for codec headers.\n");
-				clear();
-				return;
-			}
+void VideoStreamPlaybackTheora::set_file(const String &p_file) {
+	ERR_FAIL_COND(playing);
+	th_setup_info *ts = nullptr;
+
+	clear();
+
+	file = FileAccess::open(p_file, FileAccess::READ);
+	ERR_FAIL_COND_MSG(file.is_null(), "Cannot open file '" + p_file + "'.");
+
+	file_name = p_file;
+
+	ogg_sync_init(&oy);
+
+	/* init supporting Vorbis structures needed in header parsing */
+	vorbis_info_init(&vi);
+	vorbis_comment_init(&vc);
+
+	/* init supporting Theora structures needed in header parsing */
+	th_comment_init(&tc);
+	th_info_init(&ti);
+
+	/* Zero stream state structs so they can be checked later. */
+	memset(&to, 0, sizeof(to));
+	memset(&vo, 0, sizeof(vo));
+
+	/* Ogg file open; parse the headers */
+	find_streams(ts);
+	read_headers(ts);
+
+	if (!has_audio) {
+		vorbis_comment_clear(&vc);
+		vorbis_info_clear(&vi);
+		if (!ogg_stream_check(&vo)) {
+			ogg_stream_clear(&vo);
 		}
 	}

+	// One video stream is mandatory.
+	if (!has_video) {
+		th_setup_free(ts);
+		th_comment_clear(&tc);
+		th_info_clear(&ti);
+		if (!ogg_stream_check(&to)) {
+			ogg_stream_clear(&to);
+		}
+		file.unref();
+		return;
+	}
+
 	/* And now we have it all. Initialize decoders. */
-	if (theora_p) {
-		td = th_decode_alloc(&ti, ts);
-		px_fmt = ti.pixel_fmt;
-		switch (ti.pixel_fmt) {
-			case TH_PF_420:
-				//printf(" 4:2:0 video\n");
-				break;
-			case TH_PF_422:
-				//printf(" 4:2:2 video\n");
-				break;
-			case TH_PF_444:
-				//printf(" 4:4:4 video\n");
-				break;
-			case TH_PF_RSVD:
-			default:
-				printf(" video\n  (UNKNOWN Chroma sampling!)\n");
-				break;
-		}
-		th_decode_ctl(td, TH_DECCTL_GET_PPLEVEL_MAX, &pp_level_max,
-				sizeof(pp_level_max));
-		pp_level = 0;
-		th_decode_ctl(td, TH_DECCTL_SET_PPLEVEL, &pp_level, sizeof(pp_level));
-		pp_inc = 0;
-
-		size.x = ti.frame_width;
-		size.y = ti.frame_height;
-		region.position.x = ti.pic_x;
-		region.position.y = ti.pic_y;
-		region.size.x = ti.pic_width;
-		region.size.y = ti.pic_height;
-
-		Ref<Image> img = Image::create_empty(region.size.x, region.size.y, false, Image::FORMAT_RGBA8);
-		texture->set_image(img);
-		frame_data.resize(region.size.x * region.size.y * 4);
-
-		frame_duration = (double)ti.fps_denominator / ti.fps_numerator;
-	} else {
-		/* tear down the partial theora setup */
-		th_info_clear(&ti);
-		th_comment_clear(&tc);
-	}
-
+	td = th_decode_alloc(&ti, ts);
 	th_setup_free(ts);
+	px_fmt = ti.pixel_fmt;
+	switch (ti.pixel_fmt) {
+		case TH_PF_420:
+		case TH_PF_422:
+		case TH_PF_444:
+			break;
+		default:
+			WARN_PRINT(" video\n  (UNKNOWN Chroma sampling!)\n");
+			break;
+	}
+	th_decode_ctl(td, TH_DECCTL_GET_PPLEVEL_MAX, &pp_level_max, sizeof(pp_level_max));
+	pp_level = 0;
+	th_decode_ctl(td, TH_DECCTL_SET_PPLEVEL, &pp_level, sizeof(pp_level));
+	pp_inc = 0;

-	if (vorbis_p) {
+	size.x = ti.frame_width;
+	size.y = ti.frame_height;
+	region.position.x = ti.pic_x;
+	region.position.y = ti.pic_y;
+	region.size.x = ti.pic_width;
+	region.size.y = ti.pic_height;
+
+	Ref<Image> img = Image::create_empty(region.size.x, region.size.y, false, Image::FORMAT_RGBA8);
+	texture->set_image(img);
+	frame_data.resize(region.size.x * region.size.y * 4);
+
+	frame_duration = (double)ti.fps_denominator / ti.fps_numerator;
+
+	if (has_audio) {
 		vorbis_synthesis_init(&vd, &vi);
 		vorbis_block_init(&vd, &vb);
-		//_setup(vi.channels, vi.rate);
-	} else {
-		/* tear down the partial vorbis setup */
-		vorbis_info_clear(&vi);
-		vorbis_comment_clear(&vc);
+		audio_buffer_size = MIN(vi.channels, 8) * 1024;
+		audio_buffer = memnew_arr(float, audio_buffer_size);
 	}

-	playing = false;
-	buffering = true;
-	time = 0;
-	video_done = !theora_p;
-	audio_done = !vorbis_p;
+	stream_data_offset = file->get_position() - oy.fill + oy.returned;
+	stream_data_size = file->get_length() - stream_data_offset;
+
+	// Sync to last page to find video length.
+	int64_t seek_pos = MAX(stream_data_offset, (int64_t)file->get_length() - 64 * 1024);
+	int64_t video_granulepos = INT64_MAX;
+	int64_t audio_granulepos = INT64_MAX;
+	file->seek(seek_pos);
+	seek_pos = seek_streams(-1, video_granulepos, audio_granulepos);
+	file->seek(seek_pos);
+	ogg_sync_reset(&oy);
+
+	stream_length = 0;
+	ogg_page page;
+	while (read_page(&page) > 0) {
+		// Use MAX because, even though pages are ordered, page time can be -1
+		// for pages without full frames. Streams could be truncated too.
+		stream_length = MAX(stream_length, get_page_time(&page));
+	}
+
+	seek(0);
 }

 double VideoStreamPlaybackTheora::get_time() const {
@ -346,28 +511,32 @@ void VideoStreamPlaybackTheora::update(double p_delta) {
 		ogg_packet op;

 		while (!audio_ready && !audio_done) {
+			// Send remaining frames
+			if (!send_audio()) {
+				audio_ready = true;
+				break;
+			}
+
 			float **pcm;
 			int ret = vorbis_synthesis_pcmout(&vd, &pcm);
 			if (ret > 0) {
-				const int AUXBUF_LEN = 4096;
-				int to_read = ret;
-				float aux_buffer[AUXBUF_LEN];
-				while (to_read) {
-					int m = MIN(AUXBUF_LEN / vi.channels, to_read);
+				int frames_read = 0;
+				while (frames_read < ret) {
+					int m = MIN(audio_buffer_size / vi.channels, ret - frames_read);
 					int count = 0;
 					for (int j = 0; j < m; j++) {
 						for (int i = 0; i < vi.channels; i++) {
-							aux_buffer[count++] = pcm[i][j];
+							audio_buffer[count++] = pcm[i][frames_read + j];
 						}
 					}
-					int mixed = mix_callback(mix_udata, aux_buffer, m);
-					to_read -= mixed;
-					if (mixed != m) { //could mix no more
+					frames_read += m;
+					audio_ptr_end = m;
+					if (!send_audio()) {
 						audio_ready = true;
 						break;
 					}
 				}
-				vorbis_synthesis_read(&vd, ret - to_read);
+				vorbis_synthesis_read(&vd, frames_read);
 			} else {
 				/* no pending audio; is there a pending packet to decode? */
 				if (ogg_stream_packetout(&vo, &op) > 0) {
@ -383,17 +552,10 @@ void VideoStreamPlaybackTheora::update(double p_delta) {

 		while (!video_ready && !video_done) {
 			if (ogg_stream_packetout(&to, &op) > 0) {
-				/*HACK: This should be set after a seek or a gap, but we might not have
-				a granulepos for the first packet (we only have them for the last
-				packet on a page), so we just set it as often as we get it.
-				To do this right, we should back-track from the last packet on the
-				page and compute the correct granulepos for the first packet after
-				a seek or a gap.*/
 				if (op.granulepos >= 0) {
-					th_decode_ctl(td, TH_DECCTL_SET_GRANPOS, &op.granulepos,
-							sizeof(op.granulepos));
+					th_decode_ctl(td, TH_DECCTL_SET_GRANPOS, &op.granulepos, sizeof(op.granulepos));
 				}
-				ogg_int64_t videobuf_granulepos;
+				int64_t videobuf_granulepos;
 				int ret = th_decode_packetin(td, &op, &videobuf_granulepos);
 				if (ret == 0 || ret == TH_DUPFRAME) {
 					next_frame_time = th_granule_time(td, videobuf_granulepos);
@ -412,12 +574,8 @@ void VideoStreamPlaybackTheora::update(double p_delta) {
 		}

 		if (!video_ready || !audio_ready) {
-			int ret = buffer_data();
-			if (ret > 0) {
-				while (ogg_sync_pageout(&oy, &og) > 0) {
-					queue_page(&og);
-				}
-			} else {
+			int ret = feed_pages();
+			if (ret == 0) {
 				vorbis_eos = true;
 				theora_eos = true;
 				break;
@ -452,10 +610,8 @@ void VideoStreamPlaybackTheora::update(double p_delta) {
 }

 void VideoStreamPlaybackTheora::play() {
-	if (!playing) {
-		time = 0;
-	} else {
-		stop();
+	if (playing) {
+		return;
 	}

 	playing = true;
@ -464,12 +620,8 @@ void VideoStreamPlaybackTheora::play() {
 }

 void VideoStreamPlaybackTheora::stop() {
-	if (playing) {
-		clear();
-		set_file(file_name); //reset
-	}
 	playing = false;
-	time = 0;
+	seek(0);
 }

 bool VideoStreamPlaybackTheora::is_playing() const {
@ -485,7 +637,7 @@ bool VideoStreamPlaybackTheora::is_paused() const {
 }

 double VideoStreamPlaybackTheora::get_length() const {
-	return 0;
+	return stream_length;
 }

 double VideoStreamPlaybackTheora::get_playback_position() const {
@ -493,7 +645,123 @@ double VideoStreamPlaybackTheora::get_playback_position() const {
 }

 void VideoStreamPlaybackTheora::seek(double p_time) {
-	WARN_PRINT_ONCE("Seeking in Theora videos is not implemented yet (it's only supported for GDExtension-provided video streams).");
+	if (file.is_null()) {
+		return;
+	}
+	if (p_time >= stream_length) {
+		return;
+	}
+
+	video_ready = false;
+	next_frame_time = 0;
+	current_frame_time = -1;
+	dup_frame = false;
+	video_done = false;
+	audio_done = !has_audio;
+	theora_eos = false;
+	vorbis_eos = false;
+	audio_ptr_start = 0;
+	audio_ptr_end = 0;
+
+	ogg_stream_reset(&to);
+	if (has_audio) {
+		ogg_stream_reset(&vo);
+		vorbis_synthesis_restart(&vd);
+	}
+
+	int64_t seek_pos;
+	int64_t video_granulepos;
+	int64_t audio_granulepos;
+	// Find the granules we need so we can start playing at the seek time.
+	seek_pos = seek_streams(p_time, video_granulepos, audio_granulepos);
+	if (seek_pos < 0) {
+		return;
+	}
+	file->seek(seek_pos);
+	ogg_sync_reset(&oy);
+
+	time = p_time;
+
+	double last_audio_time = 0;
+	double last_video_time = 0;
+	bool first_frame_decoded = false;
+	bool start_audio = (audio_granulepos == 0);
+	bool start_video = (video_granulepos == (1LL << ti.keyframe_granule_shift));
+	bool keyframe_found = false;
+	uint64_t current_frame = 0;
+
+	// Read from the streams skipping pages until we reach the granules we want. We won't skip pages from both video and
+	// audio streams, only one of them, until decoding of both starts.
+	// video_granulepos and audio_granulepos are guaranteed to be found by checking the granulepos in the packets, no
+	// need to keep track of packets with granulepos == -1 until decoding starts.
+	while ((has_audio && last_audio_time < p_time) || (last_video_time <= p_time)) {
+		ogg_packet op;
+		if (feed_pages() == 0) {
+			break;
+		}
+		while (has_audio && last_audio_time < p_time && ogg_stream_packetout(&vo, &op) > 0) {
+			if (start_audio) {
+				if (vorbis_synthesis(&vb, &op) == 0) { /* test for success! */
+					vorbis_synthesis_blockin(&vd, &vb);
+					float **pcm;
+					int samples_left = ceil((p_time - last_audio_time) * vi.rate);
+					int samples_read = vorbis_synthesis_pcmout(&vd, &pcm);
+					int samples_consumed = MIN(samples_left, samples_read);
+					vorbis_synthesis_read(&vd, samples_consumed);
+					last_audio_time += (double)samples_consumed / vi.rate;
+				}
+			} else if (op.granulepos >= audio_granulepos) {
+				last_audio_time = vorbis_granule_time(&vd, op.granulepos);
+				// Start tracking audio now. This won't produce any samples but will update the decoder state.
+				if (vorbis_synthesis_trackonly(&vb, &op) == 0) {
+					vorbis_synthesis_blockin(&vd, &vb);
+				}
+				start_audio = true;
+			}
+		}
+		while (last_video_time <= p_time && ogg_stream_packetout(&to, &op) > 0) {
+			if (!start_video && (op.granulepos >= video_granulepos || video_granulepos == (1LL << ti.keyframe_granule_shift))) {
+				if (op.granulepos > 0) {
+					current_frame = th_granule_frame(td, op.granulepos);
+				}
+				start_video = true;
+			}
+			// Don't start decoding until a keyframe is found, but count frames.
+			if (start_video) {
+				if (!keyframe_found && th_packet_iskeyframe(&op)) {
+					keyframe_found = true;
+					int64_t cur_granulepos = (current_frame + 1) << ti.keyframe_granule_shift;
+					th_decode_ctl(td, TH_DECCTL_SET_GRANPOS, &cur_granulepos, sizeof(cur_granulepos));
+				}
+				if (keyframe_found) {
+					int64_t videobuf_granulepos;
+					if (op.granulepos >= 0) {
+						th_decode_ctl(td, TH_DECCTL_SET_GRANPOS, &op.granulepos, sizeof(op.granulepos));
+					}
+					int ret = th_decode_packetin(td, &op, &videobuf_granulepos);
+					if (ret == 0 || ret == TH_DUPFRAME) {
+						last_video_time = th_granule_time(td, videobuf_granulepos);
+						first_frame_decoded = true;
+					}
+				} else {
+					current_frame++;
+				}
+			}
+		}
+	}
+
+	if (first_frame_decoded) {
+		if (is_playing()) {
+			// Draw the current frame.
+			th_ycbcr_buffer yuv;
+			th_decode_ycbcr_out(td, yuv);
+			video_write(yuv);
+			current_frame_time = last_video_time;
+		} else {
+			next_frame_time = current_frame_time;
+			video_ready = true;
+		}
+	}
 }

 int VideoStreamPlaybackTheora::get_channels() const {
--- a/modules/theora/video_stream_theora.h
+++ b/modules/theora/video_stream_theora.h
@ -51,8 +51,19 @@ class VideoStreamPlaybackTheora : public VideoStreamPlayback {
 	Point2i size;
 	Rect2i region;

+	float *audio_buffer = nullptr;
+	int audio_buffer_size = 0;
+	int audio_ptr_start = 0;
+	int audio_ptr_end = 0;
+
 	int buffer_data();
 	int queue_page(ogg_page *page);
+	int read_page(ogg_page *page);
+	int feed_pages();
+	double get_page_time(ogg_page *page);
+	int64_t seek_streams(double p_time, int64_t &video_granulepos, int64_t &audio_granulepos);
+	void find_streams(th_setup_info *&ts);
+	void read_headers(th_setup_info *&ts);
 	void video_write(th_ycbcr_buffer yuv);
 	double get_time() const;

@ -60,7 +71,6 @@ class VideoStreamPlaybackTheora : public VideoStreamPlayback {
 	bool vorbis_eos = false;

 	ogg_sync_state oy;
-	ogg_page og;
 	ogg_stream_state vo;
 	ogg_stream_state to;
 	th_info ti;
@ -71,19 +81,21 @@ class VideoStreamPlaybackTheora : public VideoStreamPlayback {
 	vorbis_block vb;
 	vorbis_comment vc;
 	th_pixel_fmt px_fmt;
-	double frame_duration;
+	double frame_duration = 0;
+	double stream_length = 0;
+	int64_t stream_data_offset = 0;
+	int64_t stream_data_size = 0;

-	int theora_p = 0;
-	int vorbis_p = 0;
 	int pp_level_max = 0;
 	int pp_level = 0;
 	int pp_inc = 0;

 	bool playing = false;
-	bool buffering = false;
 	bool paused = false;

 	bool dup_frame = false;
+	bool has_video = false;
+	bool has_audio = false;
 	bool video_ready = false;
 	bool video_done = false;
 	bool audio_done = false;
@ -100,6 +112,20 @@ class VideoStreamPlaybackTheora : public VideoStreamPlayback {
 protected:
 	void clear();

+	_FORCE_INLINE_ bool send_audio() {
+		if (audio_ptr_end > 0) {
+			int mixed = mix_callback(mix_udata, &audio_buffer[audio_ptr_start * vi.channels], audio_ptr_end - audio_ptr_start);
+			audio_ptr_start += mixed;
+			if (audio_ptr_start == audio_ptr_end) {
+				audio_ptr_start = 0;
+				audio_ptr_end = 0;
+			} else {
+				return false;
+			}
+		}
+		return true;
+	}
+
 public:
 	virtual void play() override;
 	virtual void stop() override;
--- a/scene/gui/video_stream_player.cpp
+++ b/scene/gui/video_stream_player.cpp
@ -339,7 +339,6 @@ void VideoStreamPlayer::play() {
 	if (playback.is_null()) {
 		return;
 	}
-	playback->stop();
 	playback->play();
 	set_process_internal(true);
 	last_audio_time = 0;
@ -468,7 +467,9 @@ double VideoStreamPlayer::get_stream_position() const {

 void VideoStreamPlayer::set_stream_position(double p_position) {
 	if (playback.is_valid()) {
+		resampler.flush();
 		playback->seek(p_position);
+		last_audio_time = 0;
 	}
 }

--- a/servers/audio/audio_rb_resampler.cpp
+++ b/servers/audio/audio_rb_resampler.cpp
@ -75,23 +75,37 @@ uint32_t AudioRBResampler::_resample(AudioFrame *p_dest, int p_todo, int32_t p_i
 			p_dest[i] = AudioFrame(v0, v1);
 		}

-		// This will probably never be used, but added anyway
+		// Downmix to stereo. Apply -3dB to center and sides, -6dB to rear.
+
+		// four channels - channel order: front left, front right, rear left, rear right
 		if constexpr (C == 4) {
-			float v0 = rb[(pos << 2) + 0];
-			float v1 = rb[(pos << 2) + 1];
-			float v0n = rb[(pos_next << 2) + 0];
-			float v1n = rb[(pos_next << 2) + 1];
+			float v0 = rb[(pos << 2) + 0] + rb[(pos << 2) + 2] / 2;
+			float v1 = rb[(pos << 2) + 1] + rb[(pos << 2) + 3] / 2;
+			float v0n = rb[(pos_next << 2) + 0] + rb[(pos_next << 2) + 2] / 2;
+			float v1n = rb[(pos_next << 2) + 1] + rb[(pos_next << 2) + 3] / 2;
 			v0 += (v0n - v0) * frac;
 			v1 += (v1n - v1) * frac;
 			p_dest[i] = AudioFrame(v0, v1);
 		}

+		// six channels - channel order: front left, center, front right, rear left, rear right, LFE
 		if constexpr (C == 6) {
-			float v0 = rb[(pos * 6) + 0];
-			float v1 = rb[(pos * 6) + 1];
-			float v0n = rb[(pos_next * 6) + 0];
-			float v1n = rb[(pos_next * 6) + 1];
+			float v0 = rb[(pos * 6) + 0] + rb[(pos * 6) + 1] / Math::SQRT2 + rb[(pos * 6) + 3] / 2;
+			float v1 = rb[(pos * 6) + 2] + rb[(pos * 6) + 1] / Math::SQRT2 + rb[(pos * 6) + 4] / 2;
+			float v0n = rb[(pos_next * 6) + 0] + rb[(pos_next * 6) + 1] / Math::SQRT2 + rb[(pos_next * 6) + 3] / 2;
+			float v1n = rb[(pos_next * 6) + 2] + rb[(pos_next * 6) + 1] / Math::SQRT2 + rb[(pos_next * 6) + 4] / 2;
+			v0 += (v0n - v0) * frac;
+			v1 += (v1n - v1) * frac;
+			p_dest[i] = AudioFrame(v0, v1);
+		}

+		// eight channels - channel order: front left, center, front right, side left, side right, rear left, rear
+		// right, LFE
+		if constexpr (C == 8) {
+			float v0 = rb[(pos << 3) + 0] + rb[(pos << 3) + 1] / Math::SQRT2 + rb[(pos << 3) + 3] / Math::SQRT2 + rb[(pos << 3) + 5] / 2;
+			float v1 = rb[(pos << 3) + 2] + rb[(pos << 3) + 1] / Math::SQRT2 + rb[(pos << 3) + 4] / Math::SQRT2 + rb[(pos << 3) + 6] / 2;
+			float v0n = rb[(pos_next << 3) + 0] + rb[(pos_next << 3) + 1] / Math::SQRT2 + rb[(pos_next << 3) + 3] / Math::SQRT2 + rb[(pos_next << 3) + 5] / 2;
+			float v1n = rb[(pos_next << 3) + 2] + rb[(pos_next << 3) + 1] / Math::SQRT2 + rb[(pos_next << 3) + 4] / Math::SQRT2 + rb[(pos_next << 3) + 6] / 2;
 			v0 += (v0n - v0) * frac;
 			v1 += (v1n - v1) * frac;
 			p_dest[i] = AudioFrame(v0, v1);
@ -125,6 +139,9 @@ bool AudioRBResampler::mix(AudioFrame *p_dest, int p_frames) {
 			case 6:
 				src_read = _resample<6>(p_dest, target_todo, increment);
 				break;
+			case 8:
+				src_read = _resample<8>(p_dest, target_todo, increment);
+				break;
 		}

 		if (src_read > read_space) {
@ -159,7 +176,7 @@ int AudioRBResampler::get_num_of_ready_frames() {
 }

 Error AudioRBResampler::setup(int p_channels, int p_src_mix_rate, int p_target_mix_rate, int p_buffer_msec, int p_minbuff_needed) {
-	ERR_FAIL_COND_V(p_channels != 1 && p_channels != 2 && p_channels != 4 && p_channels != 6, ERR_INVALID_PARAMETER);
+	ERR_FAIL_COND_V(p_channels != 1 && p_channels != 2 && p_channels != 4 && p_channels != 6 && p_channels != 8, ERR_INVALID_PARAMETER);

 	int desired_rb_bits = nearest_shift(MAX((p_buffer_msec / 1000.0) * p_src_mix_rate, p_minbuff_needed));

--- a/servers/audio/audio_rb_resampler.h
+++ b/servers/audio/audio_rb_resampler.h
@ -152,6 +152,19 @@ public:
 					wp = (wp + 1) & rb_mask;
 				}
 			} break;
+			case 8: {
+				for (uint32_t i = 0; i < p_frames; i++) {
+					rb[(wp << 3) + 0] = read_buf[(i << 3) + 0];
+					rb[(wp << 3) + 1] = read_buf[(i << 3) + 1];
+					rb[(wp << 3) + 2] = read_buf[(i << 3) + 2];
+					rb[(wp << 3) + 3] = read_buf[(i << 3) + 3];
+					rb[(wp << 3) + 4] = read_buf[(i << 3) + 4];
+					rb[(wp << 3) + 5] = read_buf[(i << 3) + 5];
+					rb[(wp << 3) + 6] = read_buf[(i << 3) + 6];
+					rb[(wp << 3) + 7] = read_buf[(i << 3) + 7];
+					wp = (wp + 1) & rb_mask;
+				}
+			} break;
 		}

 		rb_write_pos.set(wp);