X265 motion estimation

Posted by KashKurcura on Wed, 05 Feb 2020

Because consecutive frames are highly correlated, only the dynamic information (the motion information, i.e. the motion vectors (MVs) of the objects in the video) needs to be encoded, which greatly reduces the number of bits that have to be coded.
On the other hand, accurately segmenting moving objects is very complex, so motion estimation in video coding standards operates on blocks of pixels (estimating motion for every individual pixel would be far too expensive).
In x265, motion estimation is implemented in Search::predInterSearch, which determines the motion vector and the reference frame index for each prediction unit (PU) and stores them in the CU.

Motion estimation function

/* Find the best inter prediction for each PU of the specified mode.
 *
 * For every prediction unit (PU) of interMode.cu this function:
 *   1. estimates the best merge candidate (skipped when the CU has a single PU),
 *   2. runs uni-directional motion estimation per reference list, through one of
 *      three paths: seeded from previously stored analysis/multi-pass data,
 *      distributed over bonded peer threads, or a plain local search,
 *   3. for B slices and non-2Nx2N partitions, evaluates bi-prediction from the
 *      two uni-directional winners and also tries the zero-MV pair,
 *   4. writes the cheapest of merge / bidir / L0 / L1 into the CU and performs
 *      motion compensation into interMode.predYuv.
 * The bits spent by the chosen option of every PU are accumulated into
 * interMode.sa8dBits.
 *
 * interMode  mode under evaluation; its cu, predYuv, fencYuv, bestME, amvpCand
 *            and interNeighbours members are read and/or updated
 * cuGeom     geometry of the CU (depth and absPartIdx are used here)
 * bChromaMC  forwarded to m_me.setSourcePU() and to the final motionCompensation()
 * refMasks   per-PU bit mask of the references that may be searched: list 0 in
 *            the low 16 bits, list 1 starting at bit 16; a value of 0 means
 *            "search every reference" */
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
{
    ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);

    CUData& cu = interMode.cu;
    Yuv* predYuv = &interMode.predYuv;

    // 12 mv candidates including lowresMV
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];

    const Slice *slice = m_slice;
    int numPart     = cu.getNumPartInter(0);
    int numPredDir  = slice->isInterP() ? 1 : 2;
    const int* numRefIdx = slice->m_numRefIdx;
    uint32_t lastMode = 0;
    int      totalmebits = 0;
    MV       mvzero(0, 0);
    Yuv&     tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
    MergeData merge;
    memset(&merge, 0, sizeof(merge));
    bool useAsMVP = false;
    for (int puIdx = 0; puIdx < numPart; puIdx++)
    {
        MotionData* bestME = interMode.bestME[puIdx];
        PredictionUnit pu(cu, cuGeom, puIdx);
        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
        useAsMVP = false;
        x265_analysis_inter_data* interDataCTU = NULL;
        int cuIdx;
        cuIdx = (interMode.cu.m_cuAddr * m_param->num4x4Partitions) + cuGeom.absPartIdx;
        if (m_param->analysisReuseLevel == 10 && m_param->interRefine > 1)
        {
            /* the stored MV is only used as a predictor when the stored decision
             * matches this PU: same prediction mode, same partition size, same
             * depth, and not coded as merge */
            interDataCTU = m_frame->m_analysisData.interData;
            if ((cu.m_predMode[pu.puAbsPartIdx] == interDataCTU->modes[cuIdx + pu.puAbsPartIdx])
                && (cu.m_partSize[pu.puAbsPartIdx] == interDataCTU->partSize[cuIdx + pu.puAbsPartIdx])
                && !(interDataCTU->mergeFlag[cuIdx + puIdx])
                && (cu.m_cuDepth[0] == interDataCTU->depth[cuIdx]))
                useAsMVP = true;
        }
        /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
        uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
        bestME[0].cost = MAX_UINT;
        bestME[1].cost = MAX_UINT;

        getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
        bool bDoUnidir = true;

        cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);
        /* Uni-directional prediction */
        if ((m_param->analysisLoad && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10)
            || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bAnalysisType == AVC_INFO) || (useAsMVP))
        {
            /* Path 1: the reference index is already known (from stored analysis
             * data or from bestME), so only that single reference is searched */
            for (int list = 0; list < numPredDir; list++)
            {

                int ref = -1;
                if (useAsMVP)
                    ref = interDataCTU->refIdx[list][cuIdx + puIdx];
                else
                    ref = bestME[list].ref;
                if (ref < 0)
                {
                    /* no reference recorded for this list */
                    continue;
                }
                uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
                bits += getTUBits(ref, numRefIdx[list]);

                int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
                const MV* amvp = interMode.amvpCand[list][ref];
                int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                MV mvmin, mvmax, outmv, mvp;
                mvp = amvp[mvpIdx];
                if (m_param->searchMethod == X265_SEA)
                {
                    /* point the SEA integral planes at this PU inside the CU */
                    int puX = puIdx & 1;
                    int puY = puIdx >> 1;
                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
                        m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
                }
                setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                MV mvpIn = mvp;
                int satdCost;
                if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx)
                    mvpIn = bestME[list].mv;
                if (useAsMVP)
                {
                    /* candidate 0 is the stored MV; mvRefine > 1 adds the selected
                     * AMVP candidate and mvRefine > 2 adds the other AMVP candidate */
                    MV bestmv, mvpSel[3];
                    int mvpIdxSel[3];
                    satdCost = m_me.COST_MAX;
                    mvpSel[0] = interDataCTU->mv[list][cuIdx + puIdx].word;
                    mvpIdxSel[0] = interDataCTU->mvpIdx[list][cuIdx + puIdx];
                    if (m_param->mvRefine > 1)
                    {
                        mvpSel[1] = interMode.amvpCand[list][ref][mvpIdx];
                        mvpIdxSel[1] = mvpIdx;
                        if (m_param->mvRefine > 2)
                        {
                            mvpSel[2] = interMode.amvpCand[list][ref][!mvpIdx];
                            mvpIdxSel[2] = !mvpIdx;
                        }
                    }
                    for (int cand = 0; cand < m_param->mvRefine; cand++)
                    {
                        /* skip a candidate identical to one already searched */
                        if (cand && (mvpSel[cand] == mvpSel[cand - 1] || (cand == 2 && mvpSel[cand] == mvpSel[cand - 2])))
                            continue;
                        setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                        int bcost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvpSel[cand], numMvc, mvc, m_param->searchRange, bestmv, m_param->maxSlices,
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
                        if (satdCost > bcost)
                        {
                            satdCost = bcost;
                            outmv = bestmv;
                            mvp = mvpSel[cand];
                            mvpIdx = mvpIdxSel[cand];
                        }
                    }
                }
                else
                {
                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
                }

                /* Get total cost of partition, but only include MV bit cost once */
                bits += m_me.bitcost(outmv);
                uint32_t mvCost = m_me.mvcost(outmv);
                uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
                /* Refine MVP selection, updates: mvpIdx, bits, cost */
                if (!(m_param->analysisMultiPassRefine || useAsMVP))
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
                else
                {
                    /* It is more accurate to compare with actual mvp that was used in motionestimate than amvp[mvpIdx]. Here 
                      the actual mvp is bestME from pass 1 for that mvpIdx */
                    int diffBits = m_me.bitcost(outmv, amvp[!mvpIdx]) - m_me.bitcost(outmv, mvpIn);
                    if (diffBits < 0)
                    {
                        mvpIdx = !mvpIdx;
                        uint32_t origOutBits = bits;
                        bits = origOutBits + diffBits;
                        cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);
                    }
                    mvp = amvp[mvpIdx];
                }

                if (cost < bestME[list].cost)
                {
                    bestME[list].mv = outmv;
                    bestME[list].mvp = mvp;
                    bestME[list].mvpIdx = mvpIdx;
                    bestME[list].cost = cost;
                    bestME[list].bits = bits;
                    bestME[list].mvCost  = mvCost;
                    bestME[list].ref = ref;
                }
                bDoUnidir = false;
            }
        }
        else if (m_param->bDistributeMotionEstimation)
        {
            /* Path 2: build one job per allowed reference and, when there are
             * enough of them, share the searches with bonded peer threads */
            PME pme(*this, interMode, cuGeom, pu, puIdx);
            pme.m_jobTotal = 0;
            pme.m_jobAcquired = 1; /* reserve L0-0 or L1-0 */

            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
            for (int list = 0; list < numPredDir; list++)
            {
                int idx = 0;
                for (int ref = 0; ref < numRefIdx[list]; ref++)
                {
                    if (!(refMask & (1 << ref)))
                        continue;

                    pme.m_jobs.ref[list][idx++]  = ref;
                    /* NOTE(review): restored from upstream x265 — without this
                     * increment m_jobTotal stays 0 and the "> 2" test below can
                     * never pass; confirm against search.cpp */
                    pme.m_jobTotal++;
                }
                pme.m_jobs.refCnt[list] = idx;

                /* the second list ref bits start at bit 16 */
                refMask >>= 16;
            }

            if (pme.m_jobTotal > 2)
            {
                pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, pme.m_jobTotal - 1);

                processPME(pme, *this);

                int ref = pme.m_jobs.refCnt[0] ? pme.m_jobs.ref[0][0] : pme.m_jobs.ref[1][0];
                singleMotionEstimation(*this, interMode, pu, puIdx, 0, ref); /* L0-0 or L1-0 */

                bDoUnidir = false;

                ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
                /* NOTE(review): restored from upstream x265 — presumably blocks
                 * until the bonded peers have finished, which is what the
                 * profiling scope above measures; confirm against search.cpp */
                pme.waitForExit();
            }

            /* if no peer threads were bonded, fall back to doing unidirectional
             * searches ourselves without overhead of singleMotionEstimation() */
        }
        if (bDoUnidir)
        {
            /* Path 3: search every allowed reference of every list locally */
            interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = -1;
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;

            for (int list = 0; list < numPredDir; list++)
            {
                for (int ref = 0; ref < numRefIdx[list]; ref++)
                {
                    ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);

                    if (!(refMask & (1 << ref)))
                    {
                        ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
                        continue;
                    }

                    uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
                    bits += getTUBits(ref, numRefIdx[list]);

                    int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);

                    const MV* amvp = interMode.amvpCand[list][ref];
                    int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
                    bool bLowresMVP = false;

                    if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging when lowresMV is not available */
                    {
                        MV lmv = getLowresMV(cu, pu, list, ref);
                        if (lmv.notZero())
                            mvc[numMvc++] = lmv;
                        if (m_param->bEnableHME)
                            mvp_lowres = lmv;
                    }
                    if (m_param->searchMethod == X265_SEA)
                    {
                        /* point the SEA integral planes at this PU inside the CU */
                        int puX = puIdx & 1;
                        int puY = puIdx >> 1;
                        for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
                            m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
                    }
                    setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                    int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, 
                      m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);

                    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
                    {
                        /* second search seeded by the lowres MV; keep it if cheaper */
                        MV outmv_lowres;
                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
                        if (lowresMvCost < satdCost)
                        {
                            outmv = outmv_lowres;
                            satdCost = lowresMvCost;
                            bLowresMVP = true;
                        }
                    }

                    /* Get total cost of partition, but only include MV bit cost once */
                    bits += m_me.bitcost(outmv);
                    uint32_t mvCost = m_me.mvcost(outmv);
                    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
                    /* Update LowresMVP to best AMVP cand*/
                    if (bLowresMVP)
                        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);

                    /* Refine MVP selection, updates: mvpIdx, bits, cost */
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);

                    if (cost < bestME[list].cost)
                    {
                        bestME[list].mv      = outmv;
                        bestME[list].mvp     = mvp;
                        bestME[list].mvpIdx  = mvpIdx;
                        bestME[list].ref     = ref;
                        bestME[list].cost    = cost;
                        bestME[list].bits    = bits;
                        bestME[list].mvCost  = mvCost;
                    }
                }
                /* the second list ref bits start at bit 16 */
                refMask >>= 16;
            }
        }

        /* Bi-directional prediction */
        MotionData bidir[2];
        uint32_t bidirCost = MAX_UINT;
        int bidirBits = 0;

        if (slice->isInterB() && !cu.isBipredRestriction() &&  /* biprediction is possible for this PU */
            cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N &&    /* 2Nx2N biprediction is handled elsewhere */
            bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
        {
            bidir[0] = bestME[0];
            bidir[1] = bestME[1];

            int satdCost;

            if (m_me.bChromaSATD)
            {
                /* chroma is part of the metric: run full bi-pred motion compensation */
                cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
                cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
                cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
                cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
                motionCompensation(cu, pu, tmpPredYuv, true, true);

                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
                           m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
            }
            else
            {
                /* luma only: average the two uni-directional luma predictions */
                PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
                PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
                Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;

                /* Generate reference subpels */
                predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
                predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
                primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
                                                                                                 bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
            }

            /* replace the two per-list selection costs by the bi-pred selection cost */
            bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
            bidirCost = satdCost + m_rdCost.getCost(bidirBits);

            bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
            if (bTryZero)
            {
                /* Do not try zero MV if unidir motion predictors are beyond
                 * valid search area */
                MV mvmin, mvmax;
                int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
                setSearchRange(cu, mvzero, merange, mvmin, mvmax);
                mvmax.y += 2; // there is some pad for subpel refine
                mvmin <<= 2;
                mvmax <<= 2;

                bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
                bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
            }
            if (bTryZero)
            {
                /* coincident blocks of the two reference pictures */
                if (m_me.bChromaSATD)
                {
                    cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
                    cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
                    cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
                    cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
                    motionCompensation(cu, pu, tmpPredYuv, true, true);

                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
                               m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
                }
                else
                {
                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
                    intptr_t refStride = slice->m_mref[0][0].lumaStride;
                    primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
                    satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
                }
                /* re-price each list with the zero MV in place of its best MV */
                MV mvp0 = bestME[0].mvp;
                int mvpIdx0 = bestME[0].mvpIdx;
                uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);

                MV mvp1 = bestME[1].mvp;
                int mvpIdx1 = bestME[1].mvpIdx;
                uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);

                uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);

                /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
                mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);
                mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);

                if (cost < bidirCost)
                {
                    bidir[0].mv = mvzero;
                    bidir[1].mv = mvzero;
                    bidir[0].mvp = mvp0;
                    bidir[1].mvp = mvp1;
                    bidir[0].mvpIdx = mvpIdx0;
                    bidir[1].mvpIdx = mvpIdx1;
                    bidirCost = cost;
                    bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
                }
            }
        }

        /* select best option and store into CU */
        if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
        {
            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */
            cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);

            totalmebits += merge.bits;
        }
        else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
        {
            lastMode = 2;

            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
            cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;

            cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;

            totalmebits += bidirBits;
        }
        else if (bestME[0].cost <= bestME[1].cost)
        {
            /* list 0 only; list 1 is marked as unused */
            lastMode = 0;

            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;

            cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);

            totalmebits += bestME[0].bits;
        }
        else
        {
            /* list 1 only; list 0 is marked as unused */
            lastMode = 1;

            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
            cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;

            cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);

            totalmebits += bestME[1].bits;
        }

        /* build the final prediction for this PU from the stored decision */
        motionCompensation(cu, pu, *predYuv, true, bChromaMC);
    }
    interMode.sa8dBits += totalmebits;
}
