Comprehensive application example of WebRTC VideoEngine (III) -- integrating X264 encoding and ffmpeg decoding

Posted by Skittlewidth on Tue, 08 Mar 2022 03:37:07 +0100

There are currently three articles in this series; more may be added later:

Comprehensive application example of WebRTC VideoEngine (I) -- basic flow of a video call

Comprehensive application example of WebRTC VideoEngine (II) -- integrating the OPENH264 codec

Comprehensive application example of WebRTC VideoEngine (III) -- integrating X264 encoding and ffmpeg decoding

Executive summary

In the previous article I explained how to integrate the OPENH264 codec into WebRTC. However, OPENH264 can only encode baseline-profile H.264 video, while x264 is widely regarded as the best encoder in terms of quality. This article explains how to integrate x264 into WebRTC; since x264 only encodes, ffmpeg is used for decoding. The overall process is the same as before and splits into two steps: re-wrapping the codec and registering it. The registration step is identical to the previous article, so the focus here is on the re-wrapping, which differs considerably.

Re-wrapping the X264 encoding function

First, of course, download the x264 source code and compile it into a library you can call. Under Windows, compile with MinGW and then export the import library with the pexports tool. The result is libx264.dll and libx264.lib; place them together with x264.h and x264_config.h (four files in total) in the project directory and configure the project properties accordingly.

The basic flow of encoding video with x264 is as follows:

#include <stdint.h>
#include <stdio.h>
#include <x264.h>

int main( int argc, char **argv )
{
    int width, height;
    x264_param_t param;
    x264_picture_t pic;
    x264_picture_t pic_out;
    x264_t *h;
    int i_frame = 0;
    int i_frame_size;
    x264_nal_t *nal;
    int i_nal;

    /* Read the resolution from the command line, e.g. "352x288",
       so that width and height are initialized before use. */
    if( argc < 2 || sscanf( argv[1], "%dx%d", &width, &height ) != 2 )
        goto fail;

    /* Get default params for preset/tuning */
    if( x264_param_default_preset( &param, "medium", NULL ) < 0 )
        goto fail;

    /* Configure non-default params */
    param.i_csp = X264_CSP_I420;
    param.i_width  = width;
    param.i_height = height;
    param.b_vfr_input = 0;
    param.b_repeat_headers = 1;
    param.b_annexb = 1;

    /* Apply profile restrictions. */
    if( x264_param_apply_profile( &param, "high" ) < 0 )
        goto fail;

    if( x264_picture_alloc( &pic, param.i_csp, param.i_width, param.i_height ) < 0 )
        goto fail;

    h = x264_encoder_open( &param);
    if( !h )
        goto fail;

    int luma_size = width * height;
    int chroma_size = luma_size / 4;
    /* Encode frames */
    for( ;; i_frame++ )
    {
        /* Read input frame */
        if( fread( pic.img.plane[0], 1, luma_size, stdin ) != luma_size )
            break;
        if( fread( pic.img.plane[1], 1, chroma_size, stdin ) != chroma_size )
            break;
        if( fread( pic.img.plane[2], 1, chroma_size, stdin ) != chroma_size )
            break;

        pic.i_pts = i_frame;
        i_frame_size = x264_encoder_encode( h, &nal, &i_nal, &pic, &pic_out );
        if( i_frame_size < 0 )
            goto fail;
        else if( i_frame_size )
        {
            if( !fwrite( nal->p_payload, i_frame_size, 1, stdout ) )
                goto fail;
        }
    }
    /* Flush delayed frames */
    while( x264_encoder_delayed_frames( h ) )
    {
        i_frame_size = x264_encoder_encode( h, &nal, &i_nal, NULL, &pic_out );
        if( i_frame_size < 0 )
            goto fail;
        else if( i_frame_size )
        {
            if( !fwrite( nal->p_payload, i_frame_size, 1, stdout ) )
                goto fail;
        }
    }

    x264_encoder_close( h );
    x264_picture_clean( &pic );
    return 0;

fail:
    return -1;
}

Following the same pattern as the previous article, we now rewrite the H264EncoderImpl class.

First, the class definition: the original private member variable ISVCEncoder* encoder is removed and the following members are added; everything else stays the same.

  x264_picture_t pic;
  x264_picture_t pic_out;
  x264_t *encoder_;
  int i_frame = 0; // frame index
  x264_nal_t *nal;

The constructor and destructor must be adapted accordingly; I won't dwell on them beyond the sketch below. The interesting parts are the InitEncode and Encode methods.
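For reference, here is a minimal sketch of what the adapted constructor and destructor can look like. This is an assumption modeled on the OpenH264 wrapper from the previous article (member names such as encoded_complete_callback_, inited_, codec_ and encoded_image_ come from that wrapper); adjust it to your own class layout:

H264EncoderImpl::H264EncoderImpl() {
	encoder_ = NULL;
	nal = NULL;
	encoded_complete_callback_ = NULL;
	inited_ = false;
	memset(&codec_, 0, sizeof(codec_));
	// Zero the x264 pictures; InitEncode() allocates pic properly later.
	memset(&pic, 0, sizeof(pic));
	memset(&pic_out, 0, sizeof(pic_out));
}

H264EncoderImpl::~H264EncoderImpl() {
	Release();  // closes encoder_ and frees encoded_image_._buffer
}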

The InitEncode method is rewritten as follows:

int H264EncoderImpl::InitEncode(const VideoCodec* inst,
		int number_of_cores,
		size_t max_payload_size) {
		if (inst == NULL) {
			return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
		}
		if (inst->maxFramerate < 1) {
			return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
		}
		// allow zero to represent an unspecified maxBitRate
		if (inst->maxBitrate > 0 && inst->startBitrate > inst->maxBitrate) {
			return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
		}
		if (inst->width < 1 || inst->height < 1) {
			return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
		}
		if (number_of_cores < 1) {
			return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
		}

		int ret_val = Release();
		if (ret_val < 0) {
			return ret_val;
		}
		/* Get default params for preset/tuning */
		x264_param_t param;
		ret_val = x264_param_default_preset(&param, "medium", NULL);
		if (ret_val != 0) {
			WEBRTC_TRACE(webrtc::kTraceError, webrtc::kTraceVideoCoding, -1,
				"H264EncoderImpl::InitEncode() fails to initialize encoder ret_val %d",
				ret_val);
			// The encoder has not been opened yet at this point, so there is
			// nothing to close.
			return WEBRTC_VIDEO_CODEC_ERROR;
		}
		/* Configure non-default params */
		param.i_csp = X264_CSP_I420;
		param.i_width = inst->width;
		param.i_height = inst->height;
		param.b_vfr_input = 0;
		param.b_repeat_headers = 1;
		param.b_annexb = 0; // With b_annexb = 0 each NAL is prefixed with a fixed 4-byte length field instead of Annex-B start codes; otherwise 3-byte and 4-byte start codes are mixed, which is troublesome to strip
		param.i_fps_den = 1;
		param.i_fps_num = codec_.maxFramerate;
		param.rc.i_bitrate = codec_.maxBitrate;
		/* Apply profile restrictions. */
		ret_val = x264_param_apply_profile(&param, "high");
		if (ret_val != 0) {
			WEBRTC_TRACE(webrtc::kTraceError, webrtc::kTraceVideoCoding, -1,
				"H264EncoderImpl::InitEncode() fails to apply profile ret_val %d",
				ret_val);
			return WEBRTC_VIDEO_CODEC_ERROR;
		}

		ret_val = x264_picture_alloc(&pic, param.i_csp, param.i_width, param.i_height);
		if (ret_val != 0) {
			WEBRTC_TRACE(webrtc::kTraceError, webrtc::kTraceVideoCoding, -1,
				"H264EncoderImpl::InitEncode() fails to allocate picture ret_val %d",
				ret_val);
			return WEBRTC_VIDEO_CODEC_ERROR;
		}

		encoder_ = x264_encoder_open(&param);
		if (!encoder_){
			WEBRTC_TRACE(webrtc::kTraceError, webrtc::kTraceVideoCoding, -1,
				"H264EncoderImpl::InitEncode() fails to open encoder");
			// encoder_ is NULL here, so there is nothing to close; only the
			// allocated picture needs cleanup.
			x264_picture_clean(&pic);
			return WEBRTC_VIDEO_CODEC_ERROR;
		}

		if (&codec_ != inst) {
			codec_ = *inst;
		}

		if (encoded_image_._buffer != NULL) {
			delete[] encoded_image_._buffer;
		}
		encoded_image_._size = CalcBufferSize(kI420, codec_.width, codec_.height);
		encoded_image_._buffer = new uint8_t[encoded_image_._size];
		encoded_image_._completeFrame = true;

		inited_ = true;
		WEBRTC_TRACE(webrtc::kTraceApiCall, webrtc::kTraceVideoCoding, -1,
			"H264EncoderImpl::InitEncode(width:%d, height:%d, framerate:%d, start_bitrate:%d, max_bitrate:%d)",
			inst->width, inst->height, inst->maxFramerate, inst->startBitrate, inst->maxBitrate);

		return WEBRTC_VIDEO_CODEC_OK;
	}
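As an aside, the fragment below illustrates why b_annexb = 0 is convenient: every x264_nal_t payload then starts with a big-endian 4-byte length field instead of a start code, so each NAL can be located without scanning for 00 00 01. This is an illustrative sketch only, not part of the wrapper:

// Walk the NALs returned by one x264_encoder_encode() call with b_annexb = 0.
for (int i = 0; i < i_nal; i++) {
	const uint8_t* p = nal[i].p_payload;
	// 4-byte big-endian length prefix written by x264 when annexb is off
	uint32_t nal_size = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
	const uint8_t* nal_data = p + 4; // the NAL itself, without the prefix
	// nal_size == nal[i].i_payload - 4, matching the "-4/+4" in Encode below
}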

The Encode method is rewritten as follows:

int H264EncoderImpl::Encode(const I420VideoFrame& input_image,
		const CodecSpecificInfo* codec_specific_info,
		const std::vector<VideoFrameType>* frame_types) {
		if (!inited_) {
			return WEBRTC_VIDEO_CODEC_UNINITIALIZED;
		}
		if (input_image.IsZeroSize()) {
			return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
		}
		if (encoded_complete_callback_ == NULL) {
			return WEBRTC_VIDEO_CODEC_UNINITIALIZED;
		}

		VideoFrameType frame_type = kDeltaFrame;
		// We only support one stream at the moment.
		if (frame_types && frame_types->size() > 0) {
			frame_type = (*frame_types)[0];
		}

		bool send_keyframe = (frame_type == kKeyFrame);
		// pic.b_keyframe is an output flag; to *request* a keyframe the input
		// field pic.i_type must be set. pic persists across calls, so reset it
		// on every frame.
		pic.i_type = send_keyframe ? X264_TYPE_IDR : X264_TYPE_AUTO;
		if (send_keyframe) {
			WEBRTC_TRACE(webrtc::kTraceApiCall, webrtc::kTraceVideoCoding, -1,
				"H264EncoderImpl::EncodeKeyFrame(width:%d, height:%d)",
				input_image.width(), input_image.height());
		}

		// Check for change in frame size.
		if (input_image.width() != codec_.width ||
			input_image.height() != codec_.height) {
			int ret = UpdateCodecFrameSize(input_image);
			if (ret < 0) {
				return ret;
			}
		}

		/* Hand the WebRTC I420 planes to x264, including their strides,
		   which may differ from the frame width. */
		pic.img.plane[0] = const_cast<uint8_t*>(input_image.buffer(kYPlane));
		pic.img.plane[1] = const_cast<uint8_t*>(input_image.buffer(kUPlane));
		pic.img.plane[2] = const_cast<uint8_t*>(input_image.buffer(kVPlane));
		pic.img.i_stride[0] = input_image.stride(kYPlane);
		pic.img.i_stride[1] = input_image.stride(kUPlane);
		pic.img.i_stride[2] = input_image.stride(kVPlane);
		pic.i_pts = i_frame;

		int i_nal = 0;
		int i_frame_size = x264_encoder_encode(encoder_, &nal, &i_nal, &pic, &pic_out);
		if (i_frame_size < 0)
		{
			WEBRTC_TRACE(webrtc::kTraceError, webrtc::kTraceVideoCoding, -1,
				"H264EncoderImpl::Encode() fails to encode %d",
				i_frame_size);
			// Do not call x264_picture_clean() here: pic's plane pointers now
			// reference input_image's buffers rather than memory owned by pic.
			x264_encoder_close(encoder_);
			encoder_ = NULL;
			return WEBRTC_VIDEO_CODEC_ERROR;
		}

		RTPFragmentationHeader frag_info;
		
		if (i_frame_size)
		{
			if (i_nal == 0) {
				return WEBRTC_VIDEO_CODEC_OK;
			}
			frag_info.VerifyAndAllocateFragmentationHeader(i_nal);

			encoded_image_._length = 0;

			uint32_t totalNaluIndex = 0;
			for (int nal_index = 0; nal_index < i_nal; nal_index++)
			{
				uint32_t currentNaluSize = 0;
				currentNaluSize = nal[nal_index].i_payload - 4; // with b_annexb = 0, each NAL from x264_encoder_encode starts with a 4-byte length prefix; strip it. nal[nal_index] can be used directly, no x264_nal_encode call is needed
				memcpy(encoded_image_._buffer + encoded_image_._length, nal[nal_index].p_payload + 4, currentNaluSize); // encoded_image_ stores the NAL data without the prefix
				encoded_image_._length += currentNaluSize;

				WEBRTC_TRACE(webrtc::kTraceApiCall, webrtc::kTraceVideoCoding, -1,
					"H264EncoderImpl::Encode() nal_type %d, length:%d",
					nal[nal_index].i_type, encoded_image_._length);

				frag_info.fragmentationOffset[totalNaluIndex] = encoded_image_._length - currentNaluSize;
				frag_info.fragmentationLength[totalNaluIndex] = currentNaluSize;
				frag_info.fragmentationPlType[totalNaluIndex] = nal[nal_index].i_type;
				frag_info.fragmentationTimeDiff[totalNaluIndex] = 0;
				totalNaluIndex++;
			}
		}
		i_frame++;
		if (encoded_image_._length > 0) {
			encoded_image_._timeStamp = input_image.timestamp();
			encoded_image_.capture_time_ms_ = input_image.render_time_ms();
			encoded_image_._encodedHeight = codec_.height;
			encoded_image_._encodedWidth = codec_.width;
			encoded_image_._frameType = frame_type;
			// call back
			encoded_complete_callback_->Encoded(encoded_image_, NULL, &frag_info);
		}
		return WEBRTC_VIDEO_CODEC_OK;
	}
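Encode calls UpdateCodecFrameSize when the incoming frame size changes. Its body is not shown in this article; a minimal sketch, assuming it simply records the new dimensions and re-runs InitEncode so x264 picks up the new resolution, could look like this:

int H264EncoderImpl::UpdateCodecFrameSize(const I420VideoFrame& input_image) {
	codec_.width = input_image.width();
	codec_.height = input_image.height();
	// Re-initialize the encoder with the new resolution. The core count and
	// max payload size arguments are placeholders here.
	return InitEncode(&codec_, 1, 0);
}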

The implementations of the other methods carry over from the previous article essentially unchanged.
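One exception is Release(), which must now tear down x264 instead of OpenH264. A minimal sketch, assuming the encoded_image_ buffer handling from the previous article:

int H264EncoderImpl::Release() {
	if (encoded_image_._buffer != NULL) {
		delete[] encoded_image_._buffer;
		encoded_image_._buffer = NULL;
	}
	if (encoder_ != NULL) {
		x264_encoder_close(encoder_);
		encoder_ = NULL;
	}
	// pic is not cleaned here: after Encode() its plane pointers reference
	// WebRTC-owned frames, not memory allocated by x264_picture_alloc.
	inited_ = false;
	return WEBRTC_VIDEO_CODEC_OK;
}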

With that, the x264 encoder has been re-wrapped for WebRTC.

Re-wrapping the ffmpeg decoding function

Obtaining the ffmpeg headers and libraries works the same way: add them to the project and set the project properties accordingly. Only four libraries are needed here (avcodec, avformat, avutil and swscale), and the unused headers can be removed.

The basic flow of decoding with ffmpeg is shown below. After integration, the data to be decoded comes from WebRTC through the EncodedImage& input_image parameter, so the usual file-based decoding flow cannot be used:

AVCodec *codec = avcodec_find_decoder(AV_CODEC_ID_H264);
AVCodecContext *codecCtx = avcodec_alloc_context3(codec);
avcodec_open2(codecCtx, codec, NULL);
char *videoData; // points at one complete encoded frame, e.g. input_image._buffer
int len;         // its length, e.g. input_image._length
AVFrame *frame = av_frame_alloc();
AVPacket packet;
av_new_packet(&packet, len);
memcpy(packet.data, videoData, len);
int ret, got_picture;
ret = avcodec_decode_video2(codecCtx, frame, &got_picture, &packet);
if (ret >= 0){
    if (got_picture){
        // frame now holds a decoded picture; proceed to the next step
    }
}
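Since WebRTC is C++ and the ffmpeg headers are plain C, the wrapper source also needs the usual extern "C" include block. The header paths below are the conventional ones and may differ for your ffmpeg build:

extern "C" {
#include <libavcodec/avcodec.h>
#include <libswscale/swscale.h>
}

// The old ffmpeg API used in this article also requires one-time codec
// registration somewhere before the first avcodec_find_decoder() call:
//   avcodec_register_all();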

Accordingly, the H264DecoderImpl class definition and its methods must be rewritten. First the class definition: ISVCDecoder* decoder_ is removed and the following private member variables are added:

  AVCodecContext *pCodecCtx;
  AVCodec *pCodec;
  AVFrame *pFrame, *pFrameYUV;
  AVPacket *packet;
  struct SwsContext *img_convert_ctx;
  uint8_t *decode_buffer; // accumulates the first received SPS, PPS and IDR data for the first decode
  uint8_t *out_buffer;
  int framecnt = 0;
  int encoded_length = 0;

Again, I won't dwell on the rewritten constructor and destructor beyond the sketch below; let's focus on the InitDecode and Decode methods.
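For reference, a plausible constructor, again modeled on the previous article's wrapper rather than taken from this one. One detail matters: Decode() below releases decode_buffer with av_free, so it must be allocated with av_malloc; the 1 MB size is an assumption, not a value from the original:

H264DecoderImpl::H264DecoderImpl() {
	decode_complete_callback_ = NULL;
	inited_ = false;
	key_frame_required_ = true;
	memset(&codec_, 0, sizeof(codec_));
	pCodecCtx = NULL;
	pCodec = NULL;
	pFrame = NULL;
	pFrameYUV = NULL;
	packet = NULL;
	img_convert_ctx = NULL;
	out_buffer = NULL;
	avcodec_register_all();  // one-time registration for the old ffmpeg API
	// Scratch space for the SPS/PPS + IDR data accumulated in Decode();
	// av_malloc because Decode() frees it with av_free. Size is assumed.
	decode_buffer = (uint8_t*)av_malloc(1000000);
}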

The InitDecode method is rewritten as follows:

int H264DecoderImpl::InitDecode(const VideoCodec* inst, int number_of_cores) {
		if (inst == NULL) {
			return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
		}
		int ret_val = Release();
		if (ret_val < 0) {
			return ret_val;
		}

		if (&codec_ != inst) {
			// Save VideoCodec instance for later; mainly for duplicating the decoder.
			codec_ = *inst;
		}
		pCodec = avcodec_find_decoder(AV_CODEC_ID_H264);
		if (pCodec == NULL){ // check before pCodec is used below
			WEBRTC_TRACE(webrtc::kTraceError, webrtc::kTraceVideoCoding, -1,
				"H264DecoderImpl::InitDecode, Codec not found.");
			return WEBRTC_VIDEO_CODEC_ERROR;
		}
		pCodecCtx = avcodec_alloc_context3(pCodec);
		pCodecCtx->pix_fmt = PIX_FMT_YUV420P;
		pCodecCtx->width = codec_.width;
		pCodecCtx->height = codec_.height;
		//pCodecCtx->bit_rate = codec_.targetBitrate*1000;
		pCodecCtx->time_base.num = 1;
		pCodecCtx->time_base.den = codec_.maxFramerate;
		if (avcodec_open2(pCodecCtx, pCodec, NULL) < 0){
			WEBRTC_TRACE(webrtc::kTraceError, webrtc::kTraceVideoCoding, -1,
				"H264DecoderImpl::InitDecode, Could not open codec.");
			return WEBRTC_VIDEO_CODEC_ERROR;
		}
		inited_ = true;

		// Always start with a complete key frame.
		key_frame_required_ = true;
		WEBRTC_TRACE(webrtc::kTraceApiCall, webrtc::kTraceVideoCoding, -1,
			"H264DecoderImpl::InitDecode(width:%d, height:%d, framerate:%d, start_bitrate:%d, max_bitrate:%d)",
			inst->width, inst->height, inst->maxFramerate, inst->startBitrate, inst->maxBitrate);
		return WEBRTC_VIDEO_CODEC_OK;
	}

The Decode method is rewritten as follows:

int H264DecoderImpl::Decode(const EncodedImage& input_image,
		bool missing_frames,
		const RTPFragmentationHeader* fragmentation,
		const CodecSpecificInfo* codec_specific_info,
		int64_t /*render_time_ms*/) {
		if (!inited_) {
			WEBRTC_TRACE(webrtc::kTraceError, webrtc::kTraceVideoCoding, -1,
				"H264DecoderImpl::Decode, decoder is not initialized");
			return WEBRTC_VIDEO_CODEC_UNINITIALIZED;
		}

		if (decode_complete_callback_ == NULL) {
			WEBRTC_TRACE(webrtc::kTraceError, webrtc::kTraceVideoCoding, -1,
				"H264DecoderImpl::Decode, decode complete call back is not set");
			return WEBRTC_VIDEO_CODEC_UNINITIALIZED;
		}

		if (input_image._buffer == NULL) {
			WEBRTC_TRACE(webrtc::kTraceError, webrtc::kTraceVideoCoding, -1,
				"H264DecoderImpl::Decode, null buffer");
			return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
		}
		if (!codec_specific_info) {
			WEBRTC_TRACE(webrtc::kTraceError, webrtc::kTraceVideoCoding, -1,
				"H264DecoderImpl::Decode, no codec info");
			return WEBRTC_VIDEO_CODEC_ERROR;
		}
		if (codec_specific_info->codecType != kVideoCodecH264) {
			WEBRTC_TRACE(webrtc::kTraceError, webrtc::kTraceVideoCoding, -1,
				"H264DecoderImpl::Decode, non h264 codec %d", codec_specific_info->codecType);
			return WEBRTC_VIDEO_CODEC_ERROR;
		}

		WEBRTC_TRACE(webrtc::kTraceApiCall, webrtc::kTraceVideoCoding, -1,
			"H264DecoderImpl::Decode(frame_type:%d, length:%d",
			input_image._frameType, input_image._length);
	
		if (framecnt < 2)
		{ // accumulate the first received SPS/PPS and IDR data; they are decoded together on the first real decode
			memcpy(decode_buffer + encoded_length, input_image._buffer, input_image._length);
			encoded_length += input_image._length;
			framecnt++;
		}
		else
		{
			pFrame = av_frame_alloc();
			pFrameYUV = av_frame_alloc();
			// Note: these are (re)allocated on every call; ideally allocate them
			// once in InitDecode and free them in Release to avoid leaks.
			out_buffer = (uint8_t *)av_malloc(avpicture_get_size(PIX_FMT_YUV420P, pCodecCtx->width, pCodecCtx->height));
			avpicture_fill((AVPicture *)pFrameYUV, out_buffer, PIX_FMT_YUV420P, pCodecCtx->width, pCodecCtx->height);
			img_convert_ctx = sws_getContext(pCodecCtx->width, pCodecCtx->height, pCodecCtx->pix_fmt,
				pCodecCtx->width, pCodecCtx->height, PIX_FMT_YUV420P, SWS_BICUBIC, NULL, NULL, NULL);
			
			if (framecnt == 2)
			{
				packet = (AVPacket *)av_malloc(sizeof(AVPacket));
				av_new_packet(packet, encoded_length);
				memcpy(packet->data, decode_buffer, encoded_length);
				av_free(decode_buffer);
				framecnt++;
				printf("\n\nLoading");
			}
			else
			{
				packet = (AVPacket *)av_malloc(sizeof(AVPacket));
				av_new_packet(packet, input_image._length);
				memcpy(packet->data, input_image._buffer, input_image._length);
			}
			
			int got_picture = 0;
			int ret = avcodec_decode_video2(pCodecCtx, pFrame, &got_picture, packet);
			// Release the packet on every path so the early returns below do not
			// leak it; the av_malloc'ed AVPacket struct itself is freed too.
			av_free_packet(packet);
			av_free(packet);
			packet = NULL;
			if (ret < 0){
				WEBRTC_TRACE(webrtc::kTraceError, webrtc::kTraceVideoCoding, -1,
					"H264DecoderImpl::Decode, Decode Error.");
				return WEBRTC_VIDEO_CODEC_ERROR;
			}
			if (got_picture){
				sws_scale(img_convert_ctx, (const uint8_t* const*)pFrame->data, pFrame->linesize, 0, pCodecCtx->height,
					pFrameYUV->data, pFrameYUV->linesize);

				int size_y = pFrameYUV->linesize[0] * pCodecCtx->height;
				int size_u = pFrameYUV->linesize[1] * pCodecCtx->height / 2;
				int size_v = pFrameYUV->linesize[2] * pCodecCtx->height / 2;

				decoded_image_.CreateFrame(size_y, static_cast<uint8_t*>(pFrameYUV->data[0]),
					size_u, static_cast<uint8_t*>(pFrameYUV->data[1]),
					size_v, static_cast<uint8_t*>(pFrameYUV->data[2]),
					pCodecCtx->width,
					pCodecCtx->height,
					pFrameYUV->linesize[0],
					pFrameYUV->linesize[1],
					pFrameYUV->linesize[2]);

				decoded_image_.set_timestamp(input_image._timeStamp);
				decode_complete_callback_->Decoded(decoded_image_);
				return WEBRTC_VIDEO_CODEC_OK;
			}
			else
				printf(".");
		}
		return WEBRTC_VIDEO_CODEC_OK;
	}
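As on the encoder side, Release() needs ffmpeg-specific teardown. A minimal sketch, assuming the members declared above:

int H264DecoderImpl::Release() {
	if (pCodecCtx != NULL) {
		avcodec_close(pCodecCtx);
		av_free(pCodecCtx);
		pCodecCtx = NULL;
	}
	if (img_convert_ctx != NULL) {
		sws_freeContext(img_convert_ctx);
		img_convert_ctx = NULL;
	}
	if (pFrame != NULL)    av_frame_free(&pFrame);
	if (pFrameYUV != NULL) av_frame_free(&pFrameYUV);
	if (out_buffer != NULL) {
		av_free(out_buffer);
		out_buffer = NULL;
	}
	inited_ = false;
	return WEBRTC_VIDEO_CODEC_OK;
}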

The remaining methods carry over unchanged (apart from the Release() teardown sketched above). This completes the re-wrapping of the ffmpeg decoding function.