scuffle_transmuxer/
lib.rs

//! A crate for transmuxing video streams.
#![cfg_attr(feature = "docs", doc = "\n\nSee the [changelog][changelog] for a full release history.")]
#![cfg_attr(feature = "docs", doc = "## Feature flags")]
#![cfg_attr(feature = "docs", doc = document_features::document_features!())]
//! ## License
//!
//! This project is licensed under the MIT or Apache-2.0 license.
//! You can choose either of them if you use this work.
//!
//! `SPDX-License-Identifier: MIT OR Apache-2.0`
#![allow(clippy::single_match)]
// #![deny(missing_docs)]
#![deny(unsafe_code)]
#![deny(unreachable_pub)]
#![deny(clippy::mod_module_files)]

use std::collections::VecDeque;
use std::fmt::Debug;
use std::io;

use byteorder::{BigEndian, ReadBytesExt};
use bytes::{Buf, Bytes};
use scuffle_flv::audio::AudioData;
use scuffle_flv::audio::body::AudioTagBody;
use scuffle_flv::audio::body::legacy::LegacyAudioTagBody;
use scuffle_flv::audio::body::legacy::aac::AacAudioData;
use scuffle_flv::audio::header::AudioTagHeader;
use scuffle_flv::audio::header::legacy::{LegacyAudioTagHeader, SoundType};
use scuffle_flv::script::{OnMetaData, ScriptData};
use scuffle_flv::tag::{FlvTag, FlvTagData};
use scuffle_flv::video::VideoData;
use scuffle_flv::video::body::VideoTagBody;
use scuffle_flv::video::body::enhanced::{ExVideoTagBody, VideoPacket, VideoPacketCodedFrames, VideoPacketSequenceStart};
use scuffle_flv::video::body::legacy::LegacyVideoTagBody;
use scuffle_flv::video::header::enhanced::VideoFourCc;
use scuffle_flv::video::header::legacy::{LegacyVideoTagHeader, LegacyVideoTagHeaderAvcPacket};
use scuffle_flv::video::header::{VideoFrameType, VideoTagHeader, VideoTagHeaderData};
use scuffle_h264::Sps;
use scuffle_mp4::BoxType;
use scuffle_mp4::codec::{AudioCodec, VideoCodec};
use scuffle_mp4::types::ftyp::{FourCC, Ftyp};
use scuffle_mp4::types::hdlr::{HandlerType, Hdlr};
use scuffle_mp4::types::mdat::Mdat;
use scuffle_mp4::types::mdhd::Mdhd;
use scuffle_mp4::types::mdia::Mdia;
use scuffle_mp4::types::mfhd::Mfhd;
use scuffle_mp4::types::minf::Minf;
use scuffle_mp4::types::moof::Moof;
use scuffle_mp4::types::moov::Moov;
use scuffle_mp4::types::mvex::Mvex;
use scuffle_mp4::types::mvhd::Mvhd;
use scuffle_mp4::types::smhd::Smhd;
use scuffle_mp4::types::stbl::Stbl;
use scuffle_mp4::types::stco::Stco;
use scuffle_mp4::types::stsc::Stsc;
use scuffle_mp4::types::stsd::Stsd;
use scuffle_mp4::types::stsz::Stsz;
use scuffle_mp4::types::stts::Stts;
use scuffle_mp4::types::tfdt::Tfdt;
use scuffle_mp4::types::tfhd::Tfhd;
use scuffle_mp4::types::tkhd::Tkhd;
use scuffle_mp4::types::traf::Traf;
use scuffle_mp4::types::trak::Trak;
use scuffle_mp4::types::trex::Trex;
use scuffle_mp4::types::trun::Trun;
use scuffle_mp4::types::vmhd::Vmhd;

mod codecs;
mod define;
mod errors;

pub use define::*;
pub use errors::TransmuxError;

struct Tags<'a> {
    video_sequence_header: Option<VideoSequenceHeader>,
    audio_sequence_header: Option<AudioSequenceHeader>,
    scriptdata_tag: Option<OnMetaData<'a>>,
}

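/// A transmuxer that converts an FLV stream into fragmented MP4 (fMP4) segments.
///
/// A minimal usage sketch, assuming `flv_body` is a placeholder holding the FLV
/// stream body (the sequence of tags that follows the FLV file header):
///
/// ```ignore
/// use bytes::Bytes;
/// use scuffle_transmuxer::{TransmuxResult, Transmuxer};
///
/// let mut transmuxer = Transmuxer::new();
/// transmuxer.demux(Bytes::from(flv_body))?;
///
/// while let Some(result) = transmuxer.mux()? {
///     match result {
///         TransmuxResult::InitSegment { data, .. } => {
///             // Write the init segment (ftyp + moov) once, before any media segments.
///         }
///         TransmuxResult::MediaSegment(segment) => {
///             // Write each media segment (moof + mdat) as it is produced.
///         }
///     }
/// }
/// ```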
#[derive(Debug, Clone)]
pub struct Transmuxer<'a> {
    // These durations are measured in timescale units
    /// sample_freq * 1000
    audio_duration: u64,
    /// fps * 1000
    video_duration: u64,
    sequence_number: u32,
    last_video_timestamp: u32,
    settings: Option<(VideoSettings, AudioSettings)>,
    tags: VecDeque<FlvTag<'a>>,
}

impl Default for Transmuxer<'_> {
    fn default() -> Self {
        Self::new()
    }
}

impl<'a> Transmuxer<'a> {
    pub fn new() -> Self {
        Self {
            sequence_number: 1,
            tags: VecDeque::new(),
            audio_duration: 0,
            video_duration: 0,
            last_video_timestamp: 0,
            settings: None,
        }
    }

    /// Feed raw FLV data to the transmuxer.
    pub fn demux(&mut self, data: Bytes) -> Result<(), TransmuxError> {
        let mut cursor = io::Cursor::new(data);
        while cursor.has_remaining() {
            cursor.read_u32::<BigEndian>()?; // previous tag size
            if !cursor.has_remaining() {
                break;
            }

            let tag = FlvTag::demux(&mut cursor)?;
            self.tags.push_back(tag);
        }

        Ok(())
    }

    /// Feed a single FLV tag to the transmuxer.
    pub fn add_tag(&mut self, tag: FlvTag<'a>) {
        self.tags.push_back(tag);
    }

    /// Get the next transmuxed packet. This will return `None` if there is not
    /// enough data to create a packet.
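    ///
    /// The first packet produced is the initialization segment; subsequent packets
    /// are media segments.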
    pub fn mux(&mut self) -> Result<Option<TransmuxResult>, TransmuxError> {
        let mut writer = Vec::new();

        let Some((video_settings, _)) = &self.settings else {
            let Some((video_settings, audio_settings)) = self.init_sequence(&mut writer)? else {
                if self.tags.len() > 30 {
                    // We are clearly not getting any sequence headers, so we should just give up
                    return Err(TransmuxError::NoSequenceHeaders);
                }

                // We don't have enough tags to create an init segment yet
                return Ok(None);
            };

            self.settings = Some((video_settings.clone(), audio_settings.clone()));

            return Ok(Some(TransmuxResult::InitSegment {
                data: Bytes::from(writer),
                audio_settings,
                video_settings,
            }));
        };

        loop {
            let Some(tag) = self.tags.pop_front() else {
                return Ok(None);
            };

            let mdat_data;
            let total_duration;
            let trun_sample;
            let mut is_audio = false;
            let mut is_keyframe = false;

            let duration =
                if self.last_video_timestamp == 0 || tag.timestamp_ms == 0 || tag.timestamp_ms < self.last_video_timestamp {
                    // The first frame is always 1000 ticks, where the timescale is 1000 * fps.
                    1000
                } else {
                    // The delta is in milliseconds (i.e. 1/1000 of a second), so rounding errors
                    // happen: our precision is only 1/1000 of a second. For a 30 fps video the
                    // delta should be 33.33 ms (1000 / 30), but we can only represent it as 33 ms
                    // or 34 ms. To fix this we check whether the delta is within 1 ms of the
                    // expected delta, and if it is we use the expected delta instead.
                    // We use a timescale of 1000 * fps because then the delta can always be
                    // represented as an integer; with a timescale of 1000 we would run into the
                    // same rounding errors.
                    let delta = tag.timestamp_ms as f64 - self.last_video_timestamp as f64;
                    let expected_delta = 1000.0 / video_settings.framerate;
                    if (delta - expected_delta).abs() <= 1.0 {
                        1000
                    } else {
                        (delta * video_settings.framerate) as u32
                    }
                };

            match tag.data {
                FlvTagData::Audio(AudioData {
                    body: AudioTagBody::Legacy(LegacyAudioTagBody::Aac(AacAudioData::Raw(data))),
                    ..
                }) => {
                    let (sample, duration) = codecs::aac::trun_sample(&data)?;

                    trun_sample = sample;
                    mdat_data = data;
                    total_duration = duration;
                    is_audio = true;
                }
                FlvTagData::Video(VideoData {
                    header:
                        VideoTagHeader {
                            frame_type,
                            data:
                                VideoTagHeaderData::Legacy(LegacyVideoTagHeader::AvcPacket(
                                    LegacyVideoTagHeaderAvcPacket::Nalu { composition_time_offset },
                                )),
                        },
                    body: VideoTagBody::Legacy(LegacyVideoTagBody::Other { data }),
                    ..
                }) => {
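                    // Convert the composition time offset from milliseconds to timescale units
                    // (1000 * fps ticks per second), rounded down to a whole frame (1000 ticks).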
                    let composition_time =
                        ((composition_time_offset as f64 * video_settings.framerate) / 1000.0).floor() * 1000.0;

                    let sample = codecs::avc::trun_sample(frame_type, composition_time as u32, duration, &data)?;

                    trun_sample = sample;
                    total_duration = duration;
                    mdat_data = data;

                    is_keyframe = frame_type == VideoFrameType::KeyFrame;
                }
                FlvTagData::Video(VideoData {
                    header: VideoTagHeader { frame_type, .. },
                    body:
                        VideoTagBody::Enhanced(ExVideoTagBody::NoMultitrack {
                            video_four_cc: VideoFourCc::Av1,
                            packet: VideoPacket::CodedFrames(VideoPacketCodedFrames::Other(data)),
                        }),
                    ..
                }) => {
                    let sample = codecs::av1::trun_sample(frame_type, duration, &data)?;

                    trun_sample = sample;
                    total_duration = duration;
                    mdat_data = data;

                    is_keyframe = frame_type == VideoFrameType::KeyFrame;
                }
                FlvTagData::Video(VideoData {
                    header: VideoTagHeader { frame_type, .. },
                    body:
                        VideoTagBody::Enhanced(ExVideoTagBody::NoMultitrack {
                            video_four_cc: VideoFourCc::Hevc,
                            packet,
                        }),
                    ..
                }) => {
                    let (composition_time, data) = match packet {
                        VideoPacket::CodedFrames(VideoPacketCodedFrames::Hevc {
                            composition_time_offset,
                            data,
                        }) => (Some(composition_time_offset), data),
                        VideoPacket::CodedFramesX { data } => (None, data),
                        _ => continue,
                    };

                    let composition_time =
                        ((composition_time.unwrap_or_default() as f64 * video_settings.framerate) / 1000.0).floor() * 1000.0;

                    let sample = codecs::hevc::trun_sample(frame_type, composition_time as i32, duration, &data)?;

                    trun_sample = sample;
                    total_duration = duration;
                    mdat_data = data;

                    is_keyframe = frame_type == VideoFrameType::KeyFrame;
                }
                _ => {
                    // We don't support anything else
                    continue;
                }
            }

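            // Build the track fragment for this tag. Track IDs match the init segment:
            // 1 is the video track, 2 is the audio track.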
            let trafs = {
                let (main_duration, main_id) = if is_audio {
                    (self.audio_duration, 2)
                } else {
                    (self.video_duration, 1)
                };

                let mut traf = Traf::new(
                    Tfhd::new(main_id, None, None, None, None, None),
                    Some(Trun::new(vec![trun_sample], None)),
                    Some(Tfdt::new(main_duration)),
                );
                traf.optimize();

                vec![traf]
            };

            let mut moof = Moof::new(Mfhd::new(self.sequence_number), trafs);

            // We need to get the moof size so that we can set the data offsets.
            let moof_size = moof.size();

            // We just created the moof with exactly one traf (either the video or the
            // audio track fragment), so we can unwrap it and set its data offset.
            let traf = moof.traf.get_mut(0).expect("we just created the moof with a traf");

            // Again we know that this exists because we just created it.
            let trun = traf.trun.as_mut().expect("we just created the traf with a trun");

            // The data offset is the size of the moof plus 8 bytes for the mdat box
            // header (4-byte size + 4-byte type), so it points at the first media byte.
            trun.data_offset = Some(moof_size as i32 + 8);

            // We then write the moof to the writer.
            moof.mux(&mut writer)?;

            // We create an mdat box and write it to the writer.
            Mdat::new(vec![mdat_data]).mux(&mut writer)?;

            // Increase our sequence number and duration.
            self.sequence_number += 1;

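            // The segment timestamp is the track's total duration before this fragment,
            // in that track's timescale units.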
            if is_audio {
                self.audio_duration += total_duration as u64;
                return Ok(Some(TransmuxResult::MediaSegment(MediaSegment {
                    data: Bytes::from(writer),
                    ty: MediaType::Audio,
                    keyframe: false,
                    timestamp: self.audio_duration - total_duration as u64,
                })));
            } else {
                self.video_duration += total_duration as u64;
                self.last_video_timestamp = tag.timestamp_ms;
                return Ok(Some(TransmuxResult::MediaSegment(MediaSegment {
                    data: Bytes::from(writer),
                    ty: MediaType::Video,
                    keyframe: is_keyframe,
                    timestamp: self.video_duration - total_duration as u64,
                })));
            }
        }
    }

    /// Internal function to find the tags we need to create the init segment.
    fn find_tags(&self) -> Tags<'a> {
        let tags = self.tags.iter();
        let mut video_sequence_header = None;
        let mut audio_sequence_header = None;
        let mut scriptdata_tag = None;

        for tag in tags {
            if video_sequence_header.is_some() && audio_sequence_header.is_some() && scriptdata_tag.is_some() {
                break;
            }

            match &tag.data {
                FlvTagData::Video(VideoData {
                    body: VideoTagBody::Legacy(LegacyVideoTagBody::AvcVideoPacketSeqHdr(data)),
                    ..
                }) => {
                    video_sequence_header = Some(VideoSequenceHeader::Avc(data.clone()));
                }
                FlvTagData::Video(VideoData {
                    body:
                        VideoTagBody::Enhanced(ExVideoTagBody::NoMultitrack {
                            video_four_cc: VideoFourCc::Av1,
                            packet: VideoPacket::SequenceStart(VideoPacketSequenceStart::Av1(config)),
                        }),
                    ..
                }) => {
                    video_sequence_header = Some(VideoSequenceHeader::Av1(config.clone()));
                }
                FlvTagData::Video(VideoData {
                    body:
                        VideoTagBody::Enhanced(ExVideoTagBody::NoMultitrack {
                            video_four_cc: VideoFourCc::Hevc,
                            packet: VideoPacket::SequenceStart(VideoPacketSequenceStart::Hevc(config)),
                        }),
                    ..
                }) => {
                    video_sequence_header = Some(VideoSequenceHeader::Hevc(config.clone()));
                }
                FlvTagData::Audio(AudioData {
                    body: AudioTagBody::Legacy(LegacyAudioTagBody::Aac(AacAudioData::SequenceHeader(data))),
                    header:
                        AudioTagHeader::Legacy(LegacyAudioTagHeader {
                            sound_size, sound_type, ..
                        }),
                    ..
                }) => {
                    audio_sequence_header = Some(AudioSequenceHeader {
                        data: AudioSequenceHeaderData::Aac(data.clone()),
                        sound_size: *sound_size,
                        sound_type: *sound_type,
                    });
                }
                FlvTagData::ScriptData(ScriptData::OnMetaData(metadata)) => {
                    scriptdata_tag = Some(*metadata.clone());
                }
                _ => {}
            }
        }

        Tags {
            video_sequence_header,
            audio_sequence_header,
            scriptdata_tag,
        }
    }

    /// Create the init segment.
    fn init_sequence(
        &mut self,
        writer: &mut impl io::Write,
    ) -> Result<Option<(VideoSettings, AudioSettings)>, TransmuxError> {
        // We need to find the tags that contain the video and audio sequence headers,
        // plus the onMetaData script tag if there is one.
        let Tags {
            video_sequence_header,
            audio_sequence_header,
            scriptdata_tag,
        } = self.find_tags();

        let Some(video_sequence_header) = video_sequence_header else {
            return Ok(None);
        };
        let Some(audio_sequence_header) = audio_sequence_header else {
            return Ok(None);
        };

        let video_codec;
        let audio_codec;
        let video_width;
        let video_height;
        let audio_channels;
        let audio_sample_rate;
        let mut video_fps = 0.0;

        let mut estimated_video_bitrate = 0;
        let mut estimated_audio_bitrate = 0;

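        // Use the framerate and data rates advertised by the onMetaData script tag, if
        // present, as initial estimates (the data rates are reported in kbit/s).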
        if let Some(scriptdata_tag) = scriptdata_tag {
            video_fps = scriptdata_tag.framerate.unwrap_or(0.0);
            estimated_video_bitrate = scriptdata_tag.videodatarate.map(|v| (v * 1024.0) as u32).unwrap_or(0);
            estimated_audio_bitrate = scriptdata_tag.audiodatarate.map(|v| (v * 1024.0) as u32).unwrap_or(0);
        }

        let mut compatible_brands = vec![FourCC::Iso5, FourCC::Iso6];

        let video_stsd_entry = match video_sequence_header {
            VideoSequenceHeader::Avc(config) => {
                compatible_brands.push(FourCC::Avc1);
                video_codec = VideoCodec::Avc {
                    constraint_set: config.profile_compatibility,
                    level: config.level_indication,
                    profile: config.profile_indication,
                };

                let sps = Sps::parse_with_emulation_prevention(io::Cursor::new(&config.sps[0]))
                    .map_err(|_| TransmuxError::InvalidAVCDecoderConfigurationRecord)?;
                video_width = sps.width() as u32;
                video_height = sps.height() as u32;

                let frame_rate = sps.frame_rate();
                if let Some(frame_rate) = frame_rate {
                    video_fps = frame_rate;
                }

                codecs::avc::stsd_entry(config, &sps)?
            }
            VideoSequenceHeader::Av1(config) => {
                compatible_brands.push(FourCC::Av01);
                let (entry, seq_obu) = codecs::av1::stsd_entry(config)?;

                video_height = seq_obu.max_frame_height as u32;
                video_width = seq_obu.max_frame_width as u32;

                let op_point = &seq_obu.operating_points[0];

                video_codec = VideoCodec::Av1 {
                    profile: seq_obu.seq_profile,
                    level: op_point.seq_level_idx,
                    tier: op_point.seq_tier,
                    depth: seq_obu.color_config.bit_depth as u8,
                    monochrome: seq_obu.color_config.mono_chrome,
                    sub_sampling_x: seq_obu.color_config.subsampling_x,
                    sub_sampling_y: seq_obu.color_config.subsampling_y,
                    color_primaries: seq_obu.color_config.color_primaries,
                    transfer_characteristics: seq_obu.color_config.transfer_characteristics,
                    matrix_coefficients: seq_obu.color_config.matrix_coefficients,
                    full_range_flag: seq_obu.color_config.full_color_range,
                };

                entry
            }
            VideoSequenceHeader::Hevc(config) => {
                compatible_brands.push(FourCC::Hev1);
                video_codec = VideoCodec::Hevc {
                    constraint_indicator: config.general_constraint_indicator_flags,
                    level: config.general_level_idc,
                    profile: config.general_profile_idc,
                    profile_compatibility: config.general_profile_compatibility_flags,
                    tier: config.general_tier_flag,
                    general_profile_space: config.general_profile_space,
                };

                let (entry, sps) = codecs::hevc::stsd_entry(config)?;
                if let Some(info) = sps.vui_parameters.as_ref().and_then(|p| p.vui_timing_info.as_ref()) {
                    video_fps = info.time_scale.get() as f64 / info.num_units_in_tick.get() as f64;
                }

                video_width = sps.cropped_width() as u32;
                video_height = sps.cropped_height() as u32;

                entry
            }
        };

        let audio_stsd_entry = match audio_sequence_header.data {
            AudioSequenceHeaderData::Aac(data) => {
                compatible_brands.push(FourCC::Mp41);
                let (entry, config) =
                    codecs::aac::stsd_entry(audio_sequence_header.sound_size, audio_sequence_header.sound_type, data)?;

                audio_sample_rate = config.sampling_frequency;

                audio_codec = AudioCodec::Aac {
                    object_type: config.audio_object_type,
                };
                audio_channels = match audio_sequence_header.sound_type {
                    SoundType::Mono => 1,
                    SoundType::Stereo => 2,
                    _ => return Err(TransmuxError::InvalidAudioChannels),
                };

                entry
            }
        };

        if video_fps == 0.0 {
            return Err(TransmuxError::InvalidVideoFrameRate);
        }

        if video_width == 0 || video_height == 0 {
            return Err(TransmuxError::InvalidVideoDimensions);
        }

        if audio_sample_rate == 0 {
            return Err(TransmuxError::InvalidAudioSampleRate);
        }

        // We multiply the FPS by 1000 to avoid rounding errors. Consider a video with a
        // framerate of 30 fps: each frame lasts 33.333333 ms, but durations are stored
        // as integers (u32), so we could only represent that as 33 ms. With a timescale
        // of 30 * 1000 = 30000 units per second, each frame is exactly 1000 units long
        // instead of an inexact 33 ms.
        let video_timescale = (1000.0 * video_fps) as u32;

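        // Track 1 is the video track (timescale = fps * 1000) and track 2 is the audio
        // track (timescale = sample rate); the same IDs are used for each moof in `mux`.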
        Ftyp::new(FourCC::Iso5, 512, compatible_brands).mux(writer)?;
        Moov::new(
            Mvhd::new(0, 0, 1000, 0, 1),
            vec![
                Trak::new(
                    Tkhd::new(0, 0, 1, 0, Some((video_width, video_height))),
                    None,
                    Mdia::new(
                        Mdhd::new(0, 0, video_timescale, 0),
                        Hdlr::new(HandlerType::Vide, "VideoHandler".to_string()),
                        Minf::new(
                            Stbl::new(
                                Stsd::new(vec![video_stsd_entry]),
                                Stts::new(vec![]),
                                Stsc::new(vec![]),
                                Stco::new(vec![]),
                                Some(Stsz::new(0, vec![])),
                            ),
                            Some(Vmhd::new()),
                            None,
                        ),
                    ),
                ),
                Trak::new(
                    Tkhd::new(0, 0, 2, 0, None),
                    None,
                    Mdia::new(
                        Mdhd::new(0, 0, audio_sample_rate, 0),
                        Hdlr::new(HandlerType::Soun, "SoundHandler".to_string()),
                        Minf::new(
                            Stbl::new(
                                Stsd::new(vec![audio_stsd_entry]),
                                Stts::new(vec![]),
                                Stsc::new(vec![]),
                                Stco::new(vec![]),
                                Some(Stsz::new(0, vec![])),
                            ),
                            None,
                            Some(Smhd::new()),
                        ),
                    ),
                ),
            ],
            Some(Mvex::new(vec![Trex::new(1), Trex::new(2)], None)),
        )
        .mux(writer)?;

        Ok(Some((
            VideoSettings {
                width: video_width,
                height: video_height,
                framerate: video_fps,
                codec: video_codec,
                bitrate: estimated_video_bitrate,
                timescale: video_timescale,
            },
            AudioSettings {
                codec: audio_codec,
                sample_rate: audio_sample_rate,
                channels: audio_channels,
                bitrate: estimated_audio_bitrate,
                timescale: audio_sample_rate,
            },
        )))
    }
}

/// Changelogs generated by [scuffle_changelog]
#[cfg(feature = "docs")]
#[scuffle_changelog::changelog]
pub mod changelog {}

#[cfg(test)]
mod tests;