该 Box 则包含了音频的编码信息和音频码率信息,所以解码音频时非常关键。Esds中可以分为三层,每层为包含关系,分别为 MP4ESDescr,MP4DecConfigDescr,MP4DecSpecificDescr。
直接分析字段值,这个 Box 是 Full Box,这里 length, type 和 version flag 就不分析了,具体可以看上面,下面我们就直接分析 Data 数据。这里要注意上面红框标注的 80 80 80,它看起来像分隔符(实际上是描述符变长长度字段的填充字节),出现在 Esds 每一层数据的长度字段之前。当然有一些 MP4 文件的 Esds Box 是没有这个 80 80 80 的,那就直接解析就行了。
名称 | 实际值(16进制) | 具体值(10进制 / ASCII) | |
---|---|---|---|
es description(ed) tag | 03 | 3 | 基本流描述标记:默认0x03 |
ed tag szie | (跳过80 80 80)1F | 31 | 表示后面有31字节 |
ed track id | 00 00 | 0 | 0表示音频的原始es数据的id是0,一般一路音频,这个值就默认是0; |
ed flag | 00 | 0 | |
decoder config descriptor(dcd) tag | 04 | 4 | 默认值0x04 |
dcd tag size | 14 | 20 | 长度 |
dcd mepg-4 audio | 40 | 64 | 0x40 表示 Audio ISO/IEC 14496-3 |
dcd audio stream | 15 | 21 | 一般默认0x15 |
dcd buffersize db | 00 60 00 | 24576 | 建议的解码器缓存大小 |
dcd max bitrate | 00 02 EE 00 | 192000(187.5kb/s) | 音频数据最大码率 |
dcd avg bitrate | 00 02 EE 00 | 192000(187.5kb/s) | 音频数据平均码率 |
decoder specific info description(dsid) tag | 05 | 5 | 解码规格标记,默认值:0x05 |
dsid tag szie | 02 | 2 | 解码规格标记及其后面值大小 |
dsid audio specific config(asc) | 11 90 | 0b10001 0b10010000 | 音频规格数据,见下面各个bit位解释 |
asc object type | 11 | 0b00010 | 5bit,值为2,即 AAC LC |
asc frequency index | 11 90 | 0b0011 | 4bit,48000 Hz |
asc channel configuration | 90 | 0b0010 | 4bit,双声道 |
asc frame length flag | 90 | 0b0 | 1bit,0 表示每个包(即一帧音频)包含 1024 个采样(samples),注意单位是采样数而不是字节。 |
asc depends on core coder | 90 | 0b0 | 1bit,不太重要 |
asc extension flag | 90 | 0b0 | 1bit,不太重要 |
上面分析基本是和程序是一致的,下面是对上面各个字段不同值的补充解释。
ed flag:
一般默认00:0x00:00000000
其中每个bit还代表是否后面有相应的字段。
第一bit为1,则有16bit的dependsOn_ES_ID字段;
第二bit为1,则有8bit的URLlength字段(其后紧跟对应长度的URLstring);
第三bit为1,则有16bit的OCR_ES_ID字段;
最后5bit,代表streamPriority
asc object type**(5bit)**:
0: Null
1: AAC Main
2: AAC LC (Low Complexity)
3: AAC SSR (Scalable Sample Rate)
asc frequency index**(4bit)**
0: 96000 Hz
1: 88200 Hz
2: 64000 Hz
3: 48000 Hz
4: 44100 Hz
5: 32000 Hz
6: 24000 Hz
7: 22050 Hz
8: 16000 Hz
9: 12000 Hz
10: 11025 Hz
11: 8000 Hz
12: 7350 Hz
13: Reserved
14: Reserved
15: frequency is written explicitly
asc channel configuration**(4bit)**
0: Defined in AOT Specific Config
1: 1channel: front-center 单声道
2: 2channels:front-left, front-right 双声道
3: 3channels:front-center,front-left, front-right 3声道
asc frame length flag**(1bit)**:
0: Each packet contains 1024 samples
1: Each packet contains 960 samples
参照: https://wiki.multimedia.cx/index.php?title=MPEG-4_Audio
// BaseBox.h
// ...
// 其他 Box 的定义
/**
 * Holds the fields decoded from an MP4 'esds' box.
 *
 * The members mirror the three nested descriptors of the box payload
 * (ES descriptor, decoder config descriptor, decoder specific info) in the
 * order they are declared below. They are filled in by PrintDataInfo().
 */
class TimeEsdsBox : public BaseBox {
public:
    // ---- Full-box header ----
    Timebyte version = 0;
    Timebyte flags = 0;

    // ---- ES descriptor ----
    unsigned char es_description_tag = 0;   // elementary stream descriptor tag, 0x03 by default
    unsigned char ed_tag_szie = 0;          // length of the descriptor
    unsigned short ed_track_id = 0;         // ES id of the raw audio stream
    unsigned char ed_flag = 0;              // usually 0x00

    // ---- Decoder config descriptor ----
    unsigned char decoder_config_descriptor_tag = 0;  // decoder config descriptor tag, 0x04 by default
    unsigned char dcd_tag_size = 0;                   // length field
    unsigned char dcd_mepg_audio = 0;                 // 0x40 for MP4 audio
    unsigned char dcd_audio_stream = 0;               // per the standard or computed; usually 0x15
    unsigned int dcd_buffersize_db = 0;               // 3 bytes in the file: suggested decoder buffer size
    unsigned int dcd_max_bitrate = 0;                 // maximum bitrate of the audio data
    unsigned int dcd_avg_bitrate = 0;                 // average bitrate of the audio data

    // ---- Decoder specific info ----
    unsigned char decoder_specific_info_description_tag = 0;  // decoder specific info tag, 0x05 by default
    unsigned char dsid_tag_szie = 0;                          // size of the decoder specific info payload

    // ---- Audio specific config (2 bytes / 16 bits in the file) ----
    unsigned char asc_object_type = 0;            // 5 bits: audio object type (coding profile)
    unsigned char asc_frequency_index = 0;        // 4 bits: sampling frequency index
    unsigned char asc_channel_configuration = 0;  // 4 bits: channel configuration
    unsigned char asc_frame_length_flag = 0;      // 1 bit: selects the number of samples per audio frame
    // The remaining 2 bits are not stored.

    TimeEsdsBox(BoxHeader h, Timebyte *d) : BaseBox(h, d) {}

    void PrintDataInfo() override;
};
// TimeEsdsBox.cpp
void TimeEsdsBox::PrintDataInfo() {
TimeBufferStream bufferStream(data, h.GetDataSize());
version = bufferStream.GetUChar();
bufferStream.GetLenData(&flags, 3);
{
es_description_tag = bufferStream.GetUChar();
// 0x80 0x80 0x80
unsigned char catNumb[3] = {0};
bufferStream.GetLenData(catNumb, 3);
if (catNumb[0] == 0x80 && catNumb[1] == 0x80 && catNumb[2] == 0x80) {
ed_tag_szie = bufferStream.GetUChar();
ed_track_id = bufferStream.GetUShort();
} else {
ed_tag_szie = catNumb[0];
ed_track_id = catNumb[1] << 8 | catNumb[2];
}
ed_flag = bufferStream.GetUChar();
}
{
decoder_config_descriptor_tag = bufferStream.GetUChar();
// 0x80 0x80 0x80
unsigned char catNumb[3] = {0};
bufferStream.GetLenData(catNumb, 3);
if (catNumb[0] == 0x80 && catNumb[1] == 0x80 && catNumb[2] == 0x80) {
dcd_tag_size = bufferStream.GetUChar();
dcd_mepg_audio = bufferStream.GetUChar();
dcd_audio_stream = bufferStream.GetUChar();
} else {
ed_tag_szie = catNumb[0];
dcd_mepg_audio = catNumb[1];
dcd_audio_stream = catNumb[2];
}
bufferStream.GetLenData(&dcd_buffersize_db, 3);
dcd_max_bitrate = bufferStream.GetUInt();
dcd_avg_bitrate = bufferStream.GetUInt();
}
{
decoder_specific_info_description_tag = bufferStream.GetUChar();
// 0x80 0x80 0x80
unsigned char catNumb[3] = {0};
unsigned short temp = 0;
bufferStream.GetLenData(catNumb, 3);
if (catNumb[0] == 0x80 && catNumb[1] == 0x80 && catNumb[2] == 0x80) {
dsid_tag_szie = bufferStream.GetUChar();
temp = bufferStream.GetUShort();
} else {
dsid_tag_szie = catNumb[0];
temp = catNumb[1] << 8 | catNumb[2];
}
asc_object_type = (temp & 0xf800) >> 11;
asc_frequency_index = (temp & 0x780) >> 7;
asc_channel_configuration = (temp & 0x78) >> 3;
asc_channel_configuration = (temp & 0b100) >> 2;
}
printf("===========================\n");
h.to_string();
printf("es_description_tag: %d\n", es_description_tag);
printf("ed_tag_szie: %d\n", ed_tag_szie);
printf("ed_track_id: %d\n", ed_track_id);
printf("ed_flag: %d\n", ed_flag);
printf("decoder_config_descriptor_tag: %d\n", decoder_config_descriptor_tag);
printf("dcd_tag_size: %d\n", dcd_tag_size);
printf("dcd_mepg_audio: %d\n", dcd_mepg_audio);
printf("dcd_audio_stream: %d\n", dcd_audio_stream);
printf("dcd_buffersize_db: %d\n", dcd_buffersize_db);
printf("dcd_max_bitrate: %ud => max: %.2f Kb/s\n", dcd_max_bitrate, (float) dcd_max_bitrate / 1000.0);
printf("dcd_avg_bitrate: %ud => avg: %.2f Kb/s\n", dcd_avg_bitrate, (float) dcd_avg_bitrate / 1000.0);
printf("decoder_specific_info_description_tag: %d\n", decoder_specific_info_description_tag);
printf("dsid_tag_szie: %d\n", dsid_tag_szie);
switch (asc_object_type) { // 5bit
case 0:
printf("asc_object_type: Null\n");
break;
case 1:
printf("asc_object_type: AAC Main\n");
break;
case 2:
printf("asc_object_type: AAC LC (Low Complexity)\n");
break;
case 3:
printf("asc_object_type: AAC SSR (Scalable Sample Rate)\n");
break;
// ... 后面的不常见
}
switch (asc_frequency_index) { // 4bit
case 0:
printf("samples: 96000 Hz\n");
break;
case 1:
printf("samples: 88200 Hz\n");
break;
case 2:
printf("samples: 64000 Hz\n");
break;
case 3:
printf("samples: 48000 Hz\n");
break;
case 4:
printf("samples: 44100 Hz\n");
break;
case 5:
printf("samples: 32000 Hz\n");
break;
case 6:
printf("samples: 24000 Hz\n");
break;
case 7:
printf("samples: 22050 Hz\n");
break;
case 8:
printf("samples: 16000 Hz\n");
break;
case 9:
printf("samples: 12000 Hz\n");
break;
case 10:
printf("samples: 11025 Hz\n");
break;
case 11:
printf("samples: 8000 Hz\n");
break;
case 12:
printf("samples: 7350 Hz\n");
break;
// ... 后面的不常见
}
switch (asc_channel_configuration) { // 4bit
case 0:
printf("channels: Defined in AOT Specifc Config\n");
break;
case 1:
printf("channels: 1 channel: front-center\n");
break;
case 2:
printf("channels: 2 channels:front-left, front-right\n");
break;
case 3:
printf("channels: 3 channels:front-center,front-left, front-right \n");
break;
// ... 不明
}
switch (asc_frame_length_flag) { // 1bit
case 0:
printf("packet: Each packet contains 1024 samples\n");
break;
case 1:
printf("packet: Each packet contains 960 samples\n");
break;
}
}