X264-编码模块和NAL打包输出

上一篇介绍了编码器的VCL编码操作，并分析了函数x264_slice_write()。该函数包含四个关键模块：宏块分析模块、宏块编码模块、熵编码模块和滤波模块。本篇将依次讲解这四个模块，以及NAL打包输出部分。

1.编码模块

宏块分析模块：调用函数x264_macroblock_analyse()。该函数的分析分为两部分：帧内宏块分析和帧间宏块分析。帧内宏块分析用于选择帧内预测模式；帧间宏块分析则先进行运动估计，再选择帧间预测模式。

x264_macroblock_analyse（）：

void x264_macroblock_analyse( x264_t *h )

{

    x264_mb_analysis_t analysis;

    int i_cost = COST_MAX;

    //通过码率控制方法，获取本宏块QP

    h->mb.i_qp = x264_ratecontrol_mb_qp( h );

    /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,

     * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */

    if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 )

        h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp;

    if( h->param.analyse.b_mb_info )

        h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */

    //初始化

    x264_mb_analyse_init( h, &analysis, h->mb.i_qp );

    //I帧：只使用帧内预测，分别计算亮度16x16（4种）和4x4（9种）所有模式的代价值，选出代价最小的模式

    //P帧：计算帧内模式和帧间模式（ P Slice允许有Intra宏块和P宏块；同理B帧也支持Intra宏块）。

    //对P帧的每一种分割进行帧间预测，得到最佳的运动矢量及最佳匹配块。

    //帧间预测过程：选出最佳矢量——>找到最佳的整像素点——>找到最佳的二分之一像素点——>找到最佳的1/4像素点

    //然后取代价最小的为最佳MV和分割方式

    //最后从帧内模式和帧间模式中选择代价比较小的方式（有可能没有找到很好的匹配块，这时候就直接使用帧内预测而不是帧间预测）。

    if( h->sh.i_type == SLICE_TYPE_I )

    {

    	//I slice

    	//通过一系列帧内预测模式（16x16的4种,4x4的9种）代价的计算得出代价最小的最优模式

intra_analysis:

        if( analysis.i_mbrd )

            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );

        //帧内预测分析

        //从16×16的SAD,4个8×8的SAD和，16个4×4SAD中选出最优方式

        x264_mb_analyse_intra( h, &analysis, COST_MAX );

        if( analysis.i_mbrd )

            x264_intra_rd( h, &analysis, COST_MAX );

        //分析结果都存储在analysis结构体中

        //开销

        i_cost = analysis.i_satd_i16x16;

        h->mb.i_type = I_16x16;

        //如果I4x4或者I8x8开销更小的话就拷贝

        //copy if little

        COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );

        COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );

        //画面极其特殊的时候，才有可能用到PCM

        if( analysis.i_satd_pcm < i_cost )

            h->mb.i_type = I_PCM;

        else if( analysis.i_mbrd >= 2 )

            x264_intra_rd_refine( h, &analysis );

    }

    else if( h->sh.i_type == SLICE_TYPE_P )

    {

    	//P slice

        int b_skip = 0;

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );

        analysis.b_try_skip = 0;

        if( analysis.b_force_intra )

        {

            if( !h->param.analyse.b_psy )

            {

                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );

                goto intra_analysis;

            }

        }

        else

        {

            if( h->fdec->mb_info && (h->fdec->mb_info[h->mb.i_mb_xy]&X264_MBINFO_CONSTANT) )

            {

                if( !SLICE_MBAFF && (h->fdec->i_frame - h->fref[0][0]->i_frame) == 1 && !h->sh.b_weighted_pred &&

                    h->fref[0][0]->effective_qp[h->mb.i_mb_xy] <= h->mb.i_qp )

                {

                    h->mb.i_partition = D_16x16;

                    if( !M32(h->mb.cache.pskip_mv) )

                    {

                        b_skip = 1;

                        h->mb.i_type = P_SKIP;

                    }

                    else

                    {

                        h->mb.i_type = P_L0;

                        analysis.l0.me16x16.i_ref = 0;

                        M32( analysis.l0.me16x16.mv ) = 0;

                    }

                    goto skip_analysis;

                }

                else if( h->param.analyse.b_mb_info_update )

                    h->fdec->mb_info[h->mb.i_mb_xy] &= ~X264_MBINFO_CONSTANT;

            }

            int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1];

            /* If the current macroblock is off the frame, just skip it. */

            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid )

                b_skip = 1;

            /* Fast P_SKIP detection */

            else if( h->param.analyse.b_fast_pskip )

            {

                if( skip_invalid )

                    // FIXME don't need to check this if the reference frame is done

                    {}

                else if( h->param.analyse.i_subpel_refine >= 3 )

                    analysis.b_try_skip = 1;

                else if( h->mb.i_mb_type_left[0] == P_SKIP ||

                         h->mb.i_mb_type_top == P_SKIP ||

                         h->mb.i_mb_type_topleft == P_SKIP ||

                         h->mb.i_mb_type_topright == P_SKIP )

                    b_skip = x264_macroblock_probe_pskip( h );//检查是否是Skip类型

            }

        }

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );

        if( b_skip )

        {

            h->mb.i_type = P_SKIP;

            h->mb.i_partition = D_16x16;

            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );

skip_analysis:

            for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )

                M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;

        }

        else

        {

            const unsigned int flags = h->param.analyse.inter;

            int i_type;

            int i_partition;

            int i_satd_inter, i_satd_intra;

            x264_mb_analyse_load_costs( h, &analysis );

        	/*

        	 * 16x16 帧间预测宏块分析-P

        	 *

        	 * +--------+--------+

        	 * |                 |

        	 * |                 |

        	 * |                 |

        	 * +        +        +

        	 * |                 |

        	 * |                 |

        	 * |                 |

        	 * +--------+--------+

        	 *

        	 */

            x264_mb_analyse_inter_p16x16( h, &analysis );

            if( h->mb.i_type == P_SKIP )

            {

                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )

                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;

                return;

            }

            if( flags & X264_ANALYSE_PSUB16x16 )

            {

                if( h->param.analyse.b_mixed_references )

                    x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );

                else{

                	/*

                	 * 8x8帧间预测宏块分析-P

					 * +--------+

					 * |        |

					 * |        |

					 * |        |

					 * +--------+

                	 */

                    x264_mb_analyse_inter_p8x8( h, &analysis );

                }

            }

            /* Select best inter mode */

            i_type = P_L0;

            i_partition = D_16x16;

            i_cost = analysis.l0.me16x16.cost;

            //如果8x8的代价值小于16x16

            //则进行8x8子块分割的处理

            //处理的数据源自于l0

            if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||

                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) )

            {

                i_type = P_8x8;

                i_partition = D_8x8;

                i_cost = analysis.l0.i_cost8x8;

                /* Do sub 8x8 */

                if( flags & X264_ANALYSE_PSUB8x8 )

                {

                    for( int i = 0; i < 4; i++ )

                    {

                    	//8x8块的子块的分析

                    	/*

                    	 * 4x4

        				 * +----+----+

        				 * |    |    |

        				 * +----+----+

        				 * |    |    |

        				 * +----+----+

        				 *

        				 */

                        x264_mb_analyse_inter_p4x4( h, &analysis, i );

                        int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv;

                        //如果4x4小于8x8

                        //则再分析8x4，4x8的代价

                        if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 )

                        {

                            int i_cost8x8 = analysis.l0.i_cost4x4[i];

                            h->mb.i_sub_partition[i] = D_L0_4x4;

                            /*

							 * 8x4

							 * +----+----+

							 * |         |

							 * +----+----+

							 * |         |

							 * +----+----+

							 *

							 */

                            //如果8x4小于8x8

                            x264_mb_analyse_inter_p8x4( h, &analysis, i );

                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],

                                         h->mb.i_sub_partition[i], D_L0_8x4 );

                        	/*

                        	 * 4x8

            				 * +----+----+

            				 * |    |    |

            				 * +    +    +

            				 * |    |    |

            				 * +----+----+

            				 *

            				 */

                            //如果4x8小于8x8

                            x264_mb_analyse_inter_p4x8( h, &analysis, i );

                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],

                                         h->mb.i_sub_partition[i], D_L0_4x8 );

                            i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;

                        }

                        x264_mb_cache_mv_p8x8( h, &analysis, i );

                    }

                    analysis.l0.i_cost8x8 = i_cost;

                }

            }

            /* Now do 16x8/8x16 */

            int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;

            //前提要求8x8的代价值小于16x16

            if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||

                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) )

            {

                int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost

                                      + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;

                analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;

            	/*

            	 * 16x8 宏块划分

            	 *

            	 * +--------+--------+

            	 * |        |        |

            	 * |        |        |

            	 * |        |        |

            	 * +--------+--------+

            	 *

            	 */

                x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );

                COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );

                i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost

                                  + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;

                analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;

            	/*

            	 * 8x16 宏块划分

            	 *

            	 * +--------+

            	 * |        |

            	 * |        |

            	 * |        |

            	 * +--------+

            	 * |        |

            	 * |        |

            	 * |        |

            	 * +--------+

            	 *

            	 */

                x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );

                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );

            }

            h->mb.i_partition = i_partition;

            /* refine qpel */

            //亚像素精度搜索

            //FIXME mb_type costs?

            if( analysis.i_mbrd || !h->mb.i_subpel_refine )

            {

                /* refine later */

            }

            else if( i_partition == D_16x16 )

            {

                x264_me_refine_qpel( h, &analysis.l0.me16x16 );

                i_cost = analysis.l0.me16x16.cost;

            }

            else if( i_partition == D_16x8 )

            {

                x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );

                x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );

                i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;

            }

            else if( i_partition == D_8x16 )

            {

                x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );

                x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );

                i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;

            }

            else if( i_partition == D_8x8 )

            {

                i_cost = 0;

                for( int i8x8 = 0; i8x8 < 4; i8x8++ )

                {

                    switch( h->mb.i_sub_partition[i8x8] )

                    {

                        case D_L0_8x8:

                            x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );

                            i_cost += analysis.l0.me8x8[i8x8].cost;

                            break;

                        case D_L0_8x4:

                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );

                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );

                            i_cost += analysis.l0.me8x4[i8x8][0].cost +

                                      analysis.l0.me8x4[i8x8][1].cost;

                            break;

                        case D_L0_4x8:

                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );

                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );

                            i_cost += analysis.l0.me4x8[i8x8][0].cost +

                                      analysis.l0.me4x8[i8x8][1].cost;

                            break;

                        case D_L0_4x4:

                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );

                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );

                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );

                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );

                            i_cost += analysis.l0.me4x4[i8x8][0].cost +

                                      analysis.l0.me4x4[i8x8][1].cost +

                                      analysis.l0.me4x4[i8x8][2].cost +

                                      analysis.l0.me4x4[i8x8][3].cost;

                            break;

                        default:

                            x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );

                            break;

                    }

                }

            }

            if( h->mb.b_chroma_me )

            {

                if( CHROMA444 )

                {

                    x264_mb_analyse_intra( h, &analysis, i_cost );

                    x264_mb_analyse_intra_chroma( h, &analysis );

                }

                else

                {

                    x264_mb_analyse_intra_chroma( h, &analysis );

                    x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );

                }

                analysis.i_satd_i16x16 += analysis.i_satd_chroma;

                analysis.i_satd_i8x8   += analysis.i_satd_chroma;

                analysis.i_satd_i4x4   += analysis.i_satd_chroma;

            }

            else

                x264_mb_analyse_intra( h, &analysis, i_cost );//P Slice中也允许有Intra宏块，所以也要进行分析

            i_satd_inter = i_cost;

            i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,

                                      analysis.i_satd_i8x8,

                                      analysis.i_satd_i4x4 );

            if( analysis.i_mbrd )

            {

                x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );

                i_type = P_L0;

                i_partition = D_16x16;

                i_cost = analysis.l0.i_rd16x16;

                COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );

                COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );

                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );

                h->mb.i_type = i_type;

                h->mb.i_partition = i_partition;

                if( i_cost < COST_MAX )

                    x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );

                x264_intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 );

            }

            //获取最小的代价

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );

            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );

            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );

            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;

            if( analysis.b_force_intra && !IS_INTRA(i_type) )

            {

                /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if

                 * it was an inter block. */

                x264_analyse_update_cache( h, &analysis );

                x264_macroblock_encode( h );

                for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )

                    h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );

                if( !CHROMA444 )

                {

                    int height = 16 >> CHROMA_V_SHIFT;

                    h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );

                    h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );

                }

                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );

                goto intra_analysis;

            }

            if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )

            {

                if( IS_INTRA( h->mb.i_type ) )

                {

                    x264_intra_rd_refine( h, &analysis );

                }

                else if( i_partition == D_16x16 )

                {

                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );

                    analysis.l0.me16x16.cost = i_cost;

                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );

                }

                else if( i_partition == D_16x8 )

                {

                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =

                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;

                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );

                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );

                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );

                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );

                }

                else if( i_partition == D_8x16 )

                {

                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =

                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;

                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );

                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );

                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );

                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );

                }

                else if( i_partition == D_8x8 )

                {

                    x264_analyse_update_cache( h, &analysis );

                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )

                    {

                        if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )

                        {

                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );

                        }

                        else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )

                        {

                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );

                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );

                        }

                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )

                        {

                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );

                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );

                        }

                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )

                        {

                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );

                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );

                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );

                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );

                        }

                    }

                }

            }

        }

    }

    else if( h->sh.i_type == SLICE_TYPE_B )//B Slice的时候

    {

        int i_bskip_cost = COST_MAX;

        int b_skip = 0;

        if( analysis.i_mbrd )

            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );

        h->mb.i_type = B_SKIP;

        if( h->mb.b_direct_auto_write )

        {

            /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */

            for( int i = 0; i < 2; i++ )

            {

                int b_changed = 1;

                h->sh.b_direct_spatial_mv_pred ^= 1;

                analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );

                if( analysis.b_direct_available )

                {

                    if( b_changed )

                    {

                        x264_mb_mc( h );

                        b_skip = x264_macroblock_probe_bskip( h );

                    }

                    h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;

                }

                else

                    b_skip = 0;

            }

        }

        else

            analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );

        analysis.b_try_skip = 0;

        if( analysis.b_direct_available )

        {

            if( !h->mb.b_direct_auto_write )

                x264_mb_mc( h );

            /* If the current macroblock is off the frame, just skip it. */

            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height )

                b_skip = 1;

            else if( analysis.i_mbrd )

            {

                i_bskip_cost = ssd_mb( h );

                /* 6 = minimum cavlc cost of a non-skipped MB */

                b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);

            }

            else if( !h->mb.b_direct_auto_write )

            {

                /* Conditioning the probe on neighboring block types

                 * doesn't seem to help speed or quality. */

                analysis.b_try_skip = x264_macroblock_probe_bskip( h );

                if( h->param.analyse.i_subpel_refine < 3 )

                    b_skip = analysis.b_try_skip;

            }

            /* Set up MVs for future predictors */

            if( b_skip )

            {

                for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )

                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;

                for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )

                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;

            }

        }

        if( !b_skip )

        {

            const unsigned int flags = h->param.analyse.inter;

            int i_type;

            int i_partition;

            int i_satd_inter;

            h->mb.b_skip_mc = 0;

            h->mb.i_type = B_DIRECT;

            x264_mb_analyse_load_costs( h, &analysis );

            /* select best inter mode */

            /* direct must be first */

            if( analysis.b_direct_available )

                x264_mb_analyse_inter_direct( h, &analysis );

        	/*

        	 * 16x16 帧间预测宏块分析-B

        	 *

        	 * +--------+--------+

        	 * |                 |

        	 * |                 |

        	 * |                 |

        	 * +        +        +

        	 * |                 |

        	 * |                 |

        	 * |                 |

        	 * +--------+--------+

        	 *

        	 */

            x264_mb_analyse_inter_b16x16( h, &analysis );

            if( h->mb.i_type == B_SKIP )

            {

                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )

                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;

                for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )

                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;

                return;

            }

            i_type = B_L0_L0;

            i_partition = D_16x16;

            i_cost = analysis.l0.me16x16.cost;

            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );

            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );

            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );

            if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 )

            {

                x264_mb_analyse_b_rd( h, &analysis, i_cost );

                if( i_bskip_cost < analysis.i_rd16x16direct &&

                    i_bskip_cost < analysis.i_rd16x16bi &&

                    i_bskip_cost < analysis.l0.i_rd16x16 &&

                    i_bskip_cost < analysis.l1.i_rd16x16 )

                {

                    h->mb.i_type = B_SKIP;

                    x264_analyse_update_cache( h, &analysis );

                    return;

                }

            }

            if( flags & X264_ANALYSE_BSUB16x16 )

            {

            	/*

				 * 8x8 帧间预测宏块分析-B

				 * +--------+

				 * |        |

				 * |        |

				 * |        |

				 * +--------+

				 *

				 */

                if( h->param.analyse.b_mixed_references )

                    x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );

                else

                    x264_mb_analyse_inter_b8x8( h, &analysis );

                COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */

                int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;

                int i_mb_type, i_partition16x8[2], i_partition8x16[2];

                for( int i = 0; i < 2; i++ )

                {

                    int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;

                    int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;

                    // 16x8

                    i_best_cost = COST_MAX;

                    i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];

                    i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];

                    i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];

                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost

                                         + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;

                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost

                                         + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;

                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );

                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );

                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );

                    analysis.i_cost_est16x8[i] = i_best_cost;

                    // 8x16

                    i_best_cost = COST_MAX;

                    i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];

                    i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];

                    i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];

                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost

                                         + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;

                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost

                                         + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;

                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );

                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );

                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );

                    analysis.i_cost_est8x16[i] = i_best_cost;

                }

                i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);

                analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];

                i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];

                i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);

                analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];

                i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];

                /* We can gain a little speed by checking the mode with the lowest estimated cost first */

                int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;

                if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )

                {

                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );

                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );

                }

                if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost )

                {

                    x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );

                    COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );

                }

                if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )

                {

                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );

                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );

                }

            }

            if( analysis.i_mbrd || !h->mb.i_subpel_refine )

            {

                /* refine later */

            }

            /* refine qpel */

            else if( i_partition == D_16x16 )

            {

                analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];

                analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];

                if( i_type == B_L0_L0 )

                {

                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );

                    i_cost = analysis.l0.me16x16.cost

                           + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];

                }

                else if( i_type == B_L1_L1 )

                {

                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );

                    i_cost = analysis.l1.me16x16.cost

                           + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];

                }

                else if( i_type == B_BI_BI )

                {

                    x264_me_refine_qpel( h, &analysis.l0.bi16x16 );

                    x264_me_refine_qpel( h, &analysis.l1.bi16x16 );

                }

            }

            else if( i_partition == D_16x8 )

            {

                for( int i = 0; i < 2; i++ )

                {

                    if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )

                        x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );

                    if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )

                        x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );

                }

            }

            else if( i_partition == D_8x16 )

            {

                for( int i = 0; i < 2; i++ )

                {

                    if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )

                        x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );

                    if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )

                        x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );

                }

            }

            else if( i_partition == D_8x8 )

            {

                for( int i = 0; i < 4; i++ )

                {

                    x264_me_t *m;

                    int i_part_cost_old;

                    int i_type_cost;

                    int i_part_type = h->mb.i_sub_partition[i];

                    int b_bidir = (i_part_type == D_BI_8x8);

                    if( i_part_type == D_DIRECT_8x8 )

                        continue;

                    if( x264_mb_partition_listX_table[0][i_part_type] )

                    {

                        m = &analysis.l0.me8x8[i];

                        i_part_cost_old = m->cost;

                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];

                        m->cost -= i_type_cost;

                        x264_me_refine_qpel( h, m );

                        if( !b_bidir )

                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;

                    }

                    if( x264_mb_partition_listX_table[1][i_part_type] )

                    {

                        m = &analysis.l1.me8x8[i];

                        i_part_cost_old = m->cost;

                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

                        m->cost -= i_type_cost;

                        x264_me_refine_qpel( h, m );

                        if( !b_bidir )

                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;

                    }

                    /* TODO: update mvp? */

                }

            }

            i_satd_inter = i_cost;

            if( analysis.i_mbrd )

            {

                x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );

                i_type = B_SKIP;

                i_cost = i_bskip_cost;

                i_partition = D_16x16;

                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );

                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );

                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );

                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );

                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );

                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );

                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                h->mb.i_type = i_type;

                h->mb.i_partition = i_partition;

            }

            if( h->mb.b_chroma_me )

            {

                if( CHROMA444 )

                {

                    x264_mb_analyse_intra( h, &analysis, i_satd_inter );

                    x264_mb_analyse_intra_chroma( h, &analysis );

                }

                else

                {

                    x264_mb_analyse_intra_chroma( h, &analysis );

                    x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );

                }

                analysis.i_satd_i16x16 += analysis.i_satd_chroma;

                analysis.i_satd_i8x8   += analysis.i_satd_chroma;

                analysis.i_satd_i4x4   += analysis.i_satd_chroma;

            }

            else

                x264_mb_analyse_intra( h, &analysis, i_satd_inter );

            if( analysis.i_mbrd )

            {

                x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );

                x264_intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 );

            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );

            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );

            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );

            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;

            h->mb.i_partition = i_partition;

            if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )

                x264_intra_rd_refine( h, &analysis );

            if( h->mb.i_subpel_refine >= 5 )

                x264_refine_bidir( h, &analysis );

            if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )

            {

                int i_biweight;

                x264_analyse_update_cache( h, &analysis );

                if( i_partition == D_16x16 )

                {

                    if( i_type == B_L0_L0 )

                    {

                        analysis.l0.me16x16.cost = i_cost;

                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );

                    }

                    else if( i_type == B_L1_L1 )

                    {

                        analysis.l1.me16x16.cost = i_cost;

                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );

                    }

                    else if( i_type == B_BI_BI )

                    {

                        i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];

                        x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );

                    }

                }

                else if( i_partition == D_16x8 )

                {

                    for( int i = 0; i < 2; i++ )

                    {

                        h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];

                        if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )

                            x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );

                        else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )

                            x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );

                        else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )

                        {

                            i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];

                            x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );

                        }

                    }

                }

                else if( i_partition == D_8x16 )

                {

                    for( int i = 0; i < 2; i++ )

                    {

                        h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];

                        if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )

                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );

                        else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )

                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );

                        else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )

                        {

                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];

                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );

                        }

                    }

                }

                else if( i_partition == D_8x8 )

                {

                    for( int i = 0; i < 4; i++ )

                    {

                        if( h->mb.i_sub_partition[i] == D_L0_8x8 )

                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );

                        else if( h->mb.i_sub_partition[i] == D_L1_8x8 )

                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );

                        else if( h->mb.i_sub_partition[i] == D_BI_8x8 )

                        {

                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];

                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );

                        }

                    }

                }

            }

        }

    }

    x264_analyse_update_cache( h, &analysis );

    /* In rare cases we can end up qpel-RDing our way back to a larger partition size

     * without realizing it.  Check for this and account for it if necessary. */

    if( analysis.i_mbrd >= 2 )

    {

        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */

        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};

        int list = check_mv_lists[h->mb.i_type] - 1;

        if( list >= 0 && h->mb.i_partition != D_16x16 &&

            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&

            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )

                h->mb.i_partition = D_16x16;

    }

    if( !analysis.i_mbrd )

        x264_mb_analyse_transform( h );

    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )

        x264_mb_analyse_qp_rd( h, &analysis );

    h->mb.b_trellis = h->param.analyse.i_trellis;

    h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));

    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )

        x264_psy_trellis_init( h, 0 );

    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )

        h->mb.i_skip_intra = 0;

}

大致流程：

（1）如果当前是I Slice，调用x264_mb_analyse_intra（）进行Intra宏块的帧内预测模式分析；

（2）如果是P Slice：

（a）调用x264_macroblock_probe_pskip（）分析是否为Skip宏块，如果是下面步骤不再进行分析；

（b）调用x264_mb_analyse_inter_p16x16（）分析P16x16帧间预测的代价；

（c）调用x264_mb_analyse_inter_p8x8（）分析P8x8帧间预测的代价；

（d）如果P8x8代价值小于P16x16，则依次对4个8x8的子宏块分割进行判断：

（i）调用x264_mb_analyse_inter_p4x4（）分析P4x4帧间预测的代价；

（ii）如果P4x4的代价值小于P8x8，则调用x264_mb_analyse_inter_p8x4和x264_mb_analyse_inter_p4x8分析P8x4和P4x8帧间预测的代价；

（e）如果P8x8代价值小于P16x16，调用x264_mb_analyse_inter_p16x8和x264_mb_analyse_inter_p8x16分析P16x8和P8x16帧间预测的代价；

（f）此外调用x264_mb_analyse_intra（），检查当前宏块作为Intra宏块编码的代价是否小于作为P宏块编码的代价；

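（3）如果当前是B Slice，则进行和P Slice类似的处理：使用对应的x264_mb_analyse_inter_b16x8（）、x264_mb_analyse_inter_b8x16（）等B宏块分析函数，并且除L0预测外还要比较L1预测、双向预测以及Direct模式的代价。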

宏块编码模块：调用函数x264_macroblock_encode_internal（）。

x264_macroblock_encode_internal（）：

/**
 * Encode the current macroblock (h->mb) according to h->mb.i_type.
 *
 * Depending on the type this either copies the source pixels (I_PCM), runs
 * skip handling (P_SKIP / B_SKIP), calls the per-block intra encoders
 * (I_16x16 / I_8x8 / I_4x4), or performs the inter residual path inline
 * (motion compensation, DCT, quantisation, decimation, dequantisation, IDCT).
 * Results are written into h->mb.pic.p_fdec[], h->dct.*, the
 * h->mb.cache.non_zero_count[] cache, h->mb.i_cbp_luma / i_cbp_chroma and
 * h->mb.cbp[h->mb.i_mb_xy]. At the end the macroblock type may be demoted to
 * P_SKIP / B_SKIP when no coefficients remain and skipping is allowed.
 *
 * @param h            encoder context; the current macroblock state lives in h->mb.
 * @param plane_count  number of planes processed by the "luma-style" loops
 *                     (every `for( p = 0; p < plane_count; p++ )` below).
 *                     Planes after the first are coded with h->mb.i_chroma_qp.
 * @param chroma       non-zero to run the separate chroma path
 *                     (x264_mb_encode_chroma); when zero, i_cbp_chroma is
 *                     forced to 0.
 */
static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_count, int chroma )
{
    int i_qp = h->mb.i_qp;
    int b_decimate = h->mb.b_dct_decimate;
    int b_force_no_skip = 0;
    int nz;
    h->mb.i_cbp_luma = 0;
    for( int p = 0; p < plane_count; p++ )
        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0;

    /* PCM macroblock: uncommon. */
    if( h->mb.i_type == I_PCM )
    {
        /* if PCM is chosen, we need to store reconstructed frame data */
        for( int p = 0; p < plane_count; p++ )
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 );
        if( chroma )
        {
            int height = 16 >> CHROMA_V_SHIFT;
            h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height );
            h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height );
        }
        return;
    }

    /* When skipping is not allowed, turn a skip type back into its coded
     * counterpart and remember not to re-detect skip at the end. */
    if( !h->mb.b_allow_skip )
    {
        b_force_no_skip = 1;
        if( IS_SKIP(h->mb.i_type) )
        {
            if( h->mb.i_type == P_SKIP )
                h->mb.i_type = P_L0;
            else if( h->mb.i_type == B_SKIP )
                h->mb.i_type = B_DIRECT;
        }
    }

    /* Encode according to the macroblock type. */
    if( h->mb.i_type == P_SKIP )
    {
        /* don't do pskip motion compensation if it was already done in macroblock_analyse */
        if( !h->mb.b_skip_mc )
        {
            int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
                                  h->mb.mv_min[0], h->mb.mv_max[0] );
            int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
                                  h->mb.mv_min[1], h->mb.mv_max[1] );

            for( int p = 0; p < plane_count; p++ )
                h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE,
                               &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
                               mvx, mvy, 16, 16, &h->sh.weight[0][p] );

            if( chroma )
            {
                int v_shift = CHROMA_V_SHIFT;
                int height = 16 >> v_shift;

                /* Special case for mv0, which is (of course) very common in P-skip mode. */
                if( mvx | mvy )
                    h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                     h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                                     mvx, 2*mvy>>v_shift, 8, height );
                else
                    h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
                                                         h->mb.pic.i_stride[1], height );

                if( h->sh.weight[0][1].weightfn )
                    h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                                       h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                                       &h->sh.weight[0][1], height );
                if( h->sh.weight[0][2].weightfn )
                    h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                                       h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                                       &h->sh.weight[0][2], height );
            }
        }

        /* Encode the skip-type macroblock. */
        x264_macroblock_encode_skip( h );
        return;
    }
    if( h->mb.i_type == B_SKIP )
    {
        /* don't do bskip motion compensation if it was already done in macroblock_analyse */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );
        x264_macroblock_encode_skip( h );
        return;
    }

    if( h->mb.i_type == I_16x16 )
    {
        h->mb.b_transform_8x8 = 0;

        /* Intra16x16 macroblock encoding (per the original annotation this
         * path needs a Hadamard transform). Each plane is encoded in turn;
         * planes after the first use the chroma QP.
         *
         * 16x16 macroblock
         *
         * +--------+--------+
         * |                 |
         * |                 |
         * |                 |
         * +        +        +
         * |                 |
         * |                 |
         * |                 |
         * +--------+--------+
         */
        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            x264_mb_encode_i16x16( h, p, i_qp );
    }
    else if( h->mb.i_type == I_8x8 )
    {
        h->mb.b_transform_8x8 = 1;
        /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
        }
        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
        {
            /* On plane 0 only the last block remains when i_skip_intra is set. */
            for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0; i < 4; i++ )
            {
                int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
                x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 );
            }
        }
    }
    /* Intra4x4 type */
    else if( h->mb.i_type == I_4x4 )
    {
        /*
         * Intra prediction: the 16x16 macroblock is split into sixteen 4x4 sub-blocks.
         *
         * +----+----+----+----+
         * |    |    |    |    |
         * +----+----+----+----+
         * |    |    |    |    |
         * +----+----+----+----+
         * |    |    |    |    |
         * +----+----+----+----+
         * |    |    |    |    |
         * +----+----+----+----+
         */
        h->mb.b_transform_8x8 = 0;
        /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
        }
        /* Encode each plane in turn. */
        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
        {
            /* Up to 16 iterations, one per Intra4x4 block. */
            for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0; i < 16; i++ )
            {
                pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i]];
                int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];

                if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                    /* emulate missing topright samples */
                    MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );

                /* Encode one Intra4x4 block.
                 *
                 * +----+
                 * |    |
                 * +----+
                 */
                x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 );
            }
        }
    }
    /* Inter-predicted macroblock. */
    else    /* Inter MB */
    {
        int i_decimate_mb = 0;

        /* Don't repeat motion compensation if it was already done in non-RD transform analysis */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );

        if( h->mb.b_lossless ) /* lossless case: not analysed in the accompanying article */
        {
            if( h->mb.b_transform_8x8 )
                for( int p = 0; p < plane_count; p++ )
                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                    {
                        int x = i8x8&1;
                        int y = i8x8>>1;
                        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+i8x8], h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE,
                                                                           h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE );
                        STORE_8x8_NNZ( p, i8x8, nz );
                        h->mb.i_cbp_luma |= nz << i8x8;
                    }
            else
                for( int p = 0; p < plane_count; p++ )
                    for( int i4x4 = 0; i4x4 < 16; i4x4++ )
                    {
                        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4x4],
                                                 h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4x4],
                                                 h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4x4] );
                        h->mb.cache.non_zero_count[x264_scan8[p*16+i4x4]] = nz;
                        h->mb.i_cbp_luma |= nz << (i4x4>>2);
                    }
        }
        else if( h->mb.b_transform_8x8 ) /* 8x8 DCT case: not analysed in the accompanying article */
        {
            ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] );
            b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC

            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            {
                CLEAR_16x16_NNZ( p );
                h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
                h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4;

                int plane_cbp = 0;
                for( int idx = 0; idx < 4; idx++ )
                {
                    nz = x264_quant_8x8( h, dct8x8[idx], i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, idx );

                    if( nz )
                    {
                        h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8[idx] );
                        if( b_decimate )
                        {
                            int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[p*4+idx] );
                            i_decimate_mb += i_decimate_8x8;
                            if( i_decimate_8x8 >= 4 )
                                plane_cbp |= 1<<idx;
                        }
                        else
                            plane_cbp |= 1<<idx;
                    }
                }

                if( i_decimate_mb >= 6 || !b_decimate )
                {
                    h->mb.i_cbp_luma |= plane_cbp;
                    FOREACH_BIT( idx, 0, plane_cbp )
                    {
                        h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp );
                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*(idx&1) + 8*(idx>>1)*FDEC_STRIDE], dct8x8[idx] );
                        STORE_8x8_NNZ( p, idx, 1 );
                    }
                }
            }
        }
        else /* the most common case */
        {
            /*
             * Inter prediction: the 16x16 macroblock is split into 8x8 blocks,
             * and each 8x8 block is split again into 4x4 blocks.
             *
             * ++====+====++====+====++
             * ||    |    ||    |    ||
             * ++====+====++====+====++
             * ||    |    ||    |    ||
             * ++====+====++====+====++
             * ||    |    ||    |    ||
             * ++====+====++====+====++
             * ||    |    ||    |    ||
             * ++====+====++====+====++
             */
            ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            {
                /* Inter 4x4 quant-matrix category for this plane. The same
                 * category must be used for quantisation and dequantisation;
                 * previously quantisation always used CQM_4PY while
                 * dequantisation used CQM_4PC for p > 0. */
                int quant_cat = p ? CQM_4PC : CQM_4PY;
                CLEAR_16x16_NNZ( p );
                /* 16x16 DCT (actually sixteen 4x4 DCTs): take the residual
                 * between the source block p_fenc and the prediction in
                 * p_fdec, then transform it. */
                h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );

                if( h->mb.b_noise_reduction )
                {
                    h->nr_count[0+!!p*2] += 16;
                    for( int idx = 0; idx < 16; idx++ )
                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
                }

                int plane_cbp = 0;
                /* The 16x16 block is handled as four 8x8 blocks. */
                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    /* Starting at 6 disables decimation for this 8x8 block. */
                    int i_decimate_8x8 = b_decimate ? 0 : 6;
                    int nnz8x8 = 0;
                    if( h->mb.b_trellis )
                    {
                        for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                        {
                            int idx = i8x8*4+i4x4;
                            if( x264_quant_4x4_trellis( h, dct4x4[idx], quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, p*16+idx ) )
                            {
                                h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
                                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[quant_cat], i_qp );
                                if( i_decimate_8x8 < 6 )
                                    i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
                                h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
                                nnz8x8 = 1;
                            }
                        }
                    }
                    else
                    {
                        /* Each 8x8 block is four 4x4 blocks; quantise all four in one call. */
                        nnz8x8 = nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
                        if( nz )
                        {
                            FOREACH_BIT( idx, i8x8*4, nz )
                            {
                                /* The following steps prepare the reconstructed frame. */
                                h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
                                /* Dequantise. */
                                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[quant_cat], i_qp );
                                if( i_decimate_8x8 < 6 )
                                    i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
                                h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
                            }
                        }
                    }
                    if( nnz8x8 )
                    {
                        i_decimate_mb += i_decimate_8x8;
                        if( i_decimate_8x8 < 4 )
                            STORE_8x8_NNZ( p, i8x8, 0 );
                        else
                            plane_cbp |= 1<<i8x8;
                    }
                }

                if( i_decimate_mb < 6 )
                {
                    plane_cbp = 0;
                    CLEAR_16x16_NNZ( p );
                }
                else
                {
                    h->mb.i_cbp_luma |= plane_cbp;
                    FOREACH_BIT( i8x8, 0, plane_cbp )
                    {
                        /* Build the reconstructed frame: inverse-transform
                         * the residual and add it onto the prediction. */
                        h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
                    }
                }
            }
        }
    }

    /* encode chroma */
    if( chroma )
    {
        if( IS_INTRA( h->mb.i_type ) )
        {
            int i_mode = h->mb.i_chroma_pred_mode;
            if( h->mb.b_lossless )
                x264_predict_lossless_chroma( h, i_mode );
            else
            {
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
            }
        }

        /* encode the 8x8 blocks */
        x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
    }
    else
        h->mb.i_cbp_chroma = 0;

    /* store cbp */
    int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma;
    if( h->param.b_cabac )
        cbp |= h->mb.cache.non_zero_count[x264_scan8[LUMA_DC    ]] << 8
            |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] << 9
            |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] << 10;
    h->mb.cbp[h->mb.i_mb_xy] = cbp;

    /* Check for P_SKIP
     * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
     *      (if multiple mv give same result)*/
    if( !b_force_no_skip )
    {
        if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
            !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
            M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
            && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
        {
            h->mb.i_type = P_SKIP;
        }

        /* Check for B_SKIP */
        if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
        {
            h->mb.i_type = B_SKIP;
        }
    }
}

（1）如果是Skip类型，调用x264_macroblock_encode_skip（）编码宏块；

（2）如果Intra16x16类型，调用x264_mb_encode_i16x16（）编码宏块；

（3）如果Intra4x4类型，循环16次调用x264_mb_encode_i4x4（）编码宏块；

（4）如果Inter类型，则不再调用子函数，而是直接进行编码；

（5）如果对色度编码，调用x264_mb_encode_chroma（）。

滤波模块：调用函数x264_fdec_filter_row（）。

x264_fdec_filter_row（）：

/* Post-process freshly reconstructed macroblock rows of h->fdec.
 *
 * Depending on the flags computed below this runs, in order: deblocking of the
 * rows [min_y, mb_y), a copy of the deblocked pixels into plane_fld[] for
 * interlaced encodes, border expansion, half-pel interpolation, and the
 * accumulation of SSD (for PSNR) and SSIM statistics into h->stat.frame.
 *
 * h:    encoder context.
 * mb_y: the MB row that will be encoded next; the rows handled here start at
 *       min_y = mb_y - (1 << SLICE_MBAFF).
 * pass: only consulted for its 0/1/2 meaning when h->param.b_sliced_threads is
 *       set (0 = during encode, 1 = post-encode pass, 2 = final pass); it is
 *       also tested directly in a few conditions below. */

static void x264_fdec_filter_row( x264_t *h, int mb_y, int pass )

{

    /* mb_y is the mb to be encoded next, not the mb to be filtered here */

    int b_hpel = h->fdec->b_kept_as_ref;

    int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;

    int b_end = mb_y == h->i_threadslice_end;

    int b_measure_quality = 1;

    int min_y = mb_y - (1 << SLICE_MBAFF);

    int b_start = min_y == h->i_threadslice_start;

    /* Even in interlaced mode, deblocking never modifies more than 4 pixels

     * above each MB, as bS=4 doesn't happen for the top of interlaced mbpairs. */

    int minpix_y = min_y*16 - 4 * !b_start;

    int maxpix_y = mb_y*16 - 4 * !b_end;

    /* Deblocking is only kept when the frame is a reference, a full
     * reconstruction was requested, or a YUV dump file is configured. */

    b_deblock &= b_hpel || h->param.b_full_recon || h->param.psz_dump_yuv;

    if( h->param.b_sliced_threads )

    {

        switch( pass )

        {

            /* During encode: only do deblock if asked for */

            default:

            case 0:

                b_deblock &= h->param.b_full_recon;

                b_hpel = 0;

                break;

            /* During post-encode pass: do deblock if not done yet, do hpel for all

             * rows except those between slices. */

            case 1:

                b_deblock &= !h->param.b_full_recon;

                b_hpel &= !(b_start && min_y > 0);

                b_measure_quality = 0;

                break;

            /* Final pass: do the rows between slices in sequence. */

            case 2:

                b_deblock = 0;

                b_measure_quality = 0;

                break;

        }

    }

    /* Nothing to do when mb_y is odd under MBAFF, or when the row to filter
     * lies before the start of this thread slice. */

    if( mb_y & SLICE_MBAFF )

        return;

    if( min_y < h->i_threadslice_start )

        return;

    /* Deblocking filter */

    if( b_deblock )

        for( int y = min_y; y < mb_y; y += (1 << SLICE_MBAFF) )

            x264_frame_deblock_row( h, y );/* process one row */

    /* FIXME: Prediction requires different borders for interlaced/progressive mc,

     * but the actual image data is equivalent. For now, maintain this

     * consistency by copying deblocked pixels between planes. */

    if( PARAM_INTERLACED && (!h->param.b_sliced_threads || pass == 1) )

        for( int p = 0; p < h->fdec->i_plane; p++ )

            for( int i = minpix_y>>(CHROMA_V_SHIFT && p); i < maxpix_y>>(CHROMA_V_SHIFT && p); i++ )

                memcpy( h->fdec->plane_fld[p] + i*h->fdec->i_stride[p],

                        h->fdec->plane[p]     + i*h->fdec->i_stride[p],

                        h->mb.i_mb_width*16*sizeof(pixel) );

    if( h->fdec->b_kept_as_ref && (!h->param.b_sliced_threads || pass == 1) )

        x264_frame_expand_border( h, h->fdec, min_y );

    /* Half-pel interpolation */

    if( b_hpel )

    {

        int end = mb_y == h->mb.i_mb_height;

        /* Can't do hpel until the previous slice is done encoding. */

        if( h->param.analyse.i_subpel_refine )

        {

            /* Half-pel interpolation, then expand the borders of the filtered planes */

            x264_frame_filter( h, h->fdec, min_y, end );

            x264_frame_expand_border_filtered( h, h->fdec, min_y, end );

        }

    }

    /* Under MBAFF, during pass 0, swap intra border backup buffers 0<->3 and
     * 1<->4 for each of the three entries. */

    if( SLICE_MBAFF && pass == 0 )

        for( int i = 0; i < 3; i++ )

        {

            XCHG( pixel *, h->intra_border_backup[0][i], h->intra_border_backup[3][i] );

            XCHG( pixel *, h->intra_border_backup[1][i], h->intra_border_backup[4][i] );

        }

    /* With frame threading, publish progress on this reference frame; the
     * large constant at the end of the slice presumably signals "all rows
     * done" -- NOTE(review): confirm against x264_frame_cond_broadcast(). */

    if( h->i_thread_frames > 1 && h->fdec->b_kept_as_ref )

        x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << SLICE_MBAFF)) );

    /* Measure the quality of the encoded rows */

    if( b_measure_quality )

    {

        maxpix_y = X264_MIN( maxpix_y, h->param.i_height );

        /* PSNR statistics requested */

        if( h->param.analyse.b_psnr )

        {

            /* Only the SSD (sum of squared differences) is accumulated here;
             * it is converted to PSNR later, at output time, by x264_psnr().
             *
             *   MSE  = SSD / (w*h)
             *   PSNR = 10*log10(MAX^2 / MSE)
             *
             * where MAX is the peak sample value, i.e. 2^8-1 = 255 for 8-bit. */

            for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )

                h->stat.frame.i_ssd[p] += x264_pixel_ssd_wxh( &h->pixf,

                    h->fdec->plane[p] + minpix_y * h->fdec->i_stride[p], h->fdec->i_stride[p],/* reconstructed frame */

                    h->fenc->plane[p] + minpix_y * h->fenc->i_stride[p], h->fenc->i_stride[p],/* frame being encoded */

                    h->param.i_width, maxpix_y-minpix_y );

            /* When not 4:4:4, both chroma SSDs come from plane[1] in a single
             * x264_pixel_ssd_nv12() call. */

            if( !CHROMA444 )

            {

                uint64_t ssd_u, ssd_v;

                int v_shift = CHROMA_V_SHIFT;

                x264_pixel_ssd_nv12( &h->pixf,

                    h->fdec->plane[1] + (minpix_y>>v_shift) * h->fdec->i_stride[1], h->fdec->i_stride[1],

                    h->fenc->plane[1] + (minpix_y>>v_shift) * h->fenc->i_stride[1], h->fenc->i_stride[1],

                    h->param.i_width>>1, (maxpix_y-minpix_y)>>v_shift, &ssd_u, &ssd_v );

                h->stat.frame.i_ssd[1] += ssd_u;

                h->stat.frame.i_ssd[2] += ssd_v;

            }

        }

        /* SSIM statistics requested */

        if( h->param.analyse.b_ssim )

        {

            int ssim_cnt;

            x264_emms();

            /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,

             * and overlap by 4 */

            minpix_y += b_start ? 2 : -6;

            /* Accumulate luma SSIM and the number of SSIM blocks measured */

            h->stat.frame.f_ssim +=

                x264_pixel_ssim_wxh( &h->pixf,

                    h->fdec->plane[0] + 2+minpix_y*h->fdec->i_stride[0], h->fdec->i_stride[0],/* reconstructed frame */

                    h->fenc->plane[0] + 2+minpix_y*h->fenc->i_stride[0], h->fenc->i_stride[0],/* frame being encoded */

                    h->param.i_width-2, maxpix_y-minpix_y, h->scratch_buffer, &ssim_cnt );

            h->stat.frame.i_ssim_cnt += ssim_cnt;

        }

    }

}

函数x264_fdec_filter_row（）完成了三步工作：

环路滤波；半像素内插；视频质量SSIM和PSNR计算。

熵编码模块：熵编码模块包含两个函数：如果输出设置为CABAC编码，调用函数x264_macroblock_write_cabac（）；如果输出设置为CAVLC编码，调用函数x264_macroblock_write_cavlc（）。

x264_macroblock_write_cavlc（）：

/* Write the current macroblock (state in h->mb / h->dct) to the bitstream
 * h->out.bs using the CAVLC helper routines.
 *
 * Output order: optional MBAFF field flag, the slice-type specific MB header,
 * coded block pattern (not for I_16x16), transform-8x8 flag, QP delta plus
 * luma residual, then chroma DC/AC residual.
 *
 * When RDO_SKIP_BS is set the function only resets s->i_bits_encoded at entry
 * and compiles out the I_PCM path and the i_mv_bits / i_tex_bits statistics;
 * otherwise bit positions are sampled with bs_pos() to update those counters. */

void x264_macroblock_write_cavlc( x264_t *h )

{

    bs_t *s = &h->out.bs;

    const int i_mb_type = h->mb.i_type;

    /* In 4:4:4 all three planes are coded through the luma path and there is
     * no separate chroma path. */

    int plane_count = CHROMA444 ? 3 : 1;

    int chroma = !CHROMA444;

#if RDO_SKIP_BS

    s->i_bits_encoded = 0;

#else

    const int i_mb_pos_start = bs_pos( s );

    int       i_mb_pos_tex;

#endif

    /* MBAFF: write the field flag for even MB rows, or when the MB one stride
     * above (h->mb.i_mb_xy - h->mb.i_mb_stride) was skipped. */

    if( SLICE_MBAFF

        && (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )

    {

        bs_write1( s, MB_INTERLACED );

#if !RDO_SKIP_BS

        h->mb.field_decoding_flag = MB_INTERLACED;

#endif

    }

#if !RDO_SKIP_BS

    /* I_PCM: the raw source samples are written directly, then we return. */

    if( i_mb_type == I_PCM )

    {

        /* Indexed by h->sh.i_type; presumably the per-slice-type offset of the
         * intra mb_type codes -- NOTE(review): confirm against the slice type enum. */

        static const uint8_t i_offsets[3] = {5,23,0};

        uint8_t *p_start = s->p_start;

        bs_write_ue( s, i_offsets[h->sh.i_type] + 25 );

        i_mb_pos_tex = bs_pos( s );

        h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;

        bs_align_0( s );

        for( int p = 0; p < plane_count; p++ )

            for( int i = 0; i < 256; i++ )

                bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );

        if( chroma )

            for( int ch = 1; ch < 3; ch++ )

                for( int i = 0; i < 16>>CHROMA_V_SHIFT; i++ )

                    for( int j = 0; j < 8; j++ )

                        bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );

        /* Re-initialise the writer at the current position, then restore the
         * saved p_start. */

        bs_init( s, s->p, s->p_end - s->p );

        s->p_start = p_start;

        h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;

        return;

    }

#endif

    if( h->sh.i_type == SLICE_TYPE_P )

        x264_cavlc_mb_header_p( h, i_mb_type, chroma );/* write the MB header for a P slice (CAVLC) */

    else if( h->sh.i_type == SLICE_TYPE_B )

        x264_cavlc_mb_header_b( h, i_mb_type, chroma );/* write the MB header for a B slice (CAVLC) */

    else //if( h->sh.i_type == SLICE_TYPE_I )

        x264_cavlc_mb_header_i( h, i_mb_type, 0, chroma );/* write the MB header for an I slice (CAVLC) */

#if !RDO_SKIP_BS

    /* Bits written so far go to i_mv_bits; bits after this point go to i_tex_bits. */

    i_mb_pos_tex = bs_pos( s );

    h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;

#endif

    /* Coded block pattern */

    if( i_mb_type != I_16x16 )

        bs_write_ue( s, cbp_to_golomb[chroma][IS_INTRA(i_mb_type)][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] );

    /* transform size 8x8 flag */

    if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )

        bs_write1( s, h->mb.b_transform_8x8 );

    if( i_mb_type == I_16x16 )

    {

        x264_cavlc_qp_delta( h );

        /* DC Luma */

        for( int p = 0; p < plane_count; p++ )

        {

            x264_cavlc_block_residual( h, DCT_LUMA_DC, LUMA_DC+p, h->dct.luma16x16_dc[p] );

            /* AC Luma */

            if( h->mb.i_cbp_luma )

                for( int i = p*16; i < p*16+16; i++ )

                    x264_cavlc_block_residual( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );

        }

    }

    else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )

    {

        x264_cavlc_qp_delta( h );

        /* Residual data */

        x264_cavlc_macroblock_luma_residual( h, plane_count );

    }

    if( h->mb.i_cbp_chroma )

    {

        /* Chroma DC residual present */

        x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] );

        x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );

        if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */

        {

            /* Chroma AC blocks live in luma4x4[16..47]; four blocks are written
             * per group, with groups 'step' indices apart. */

            int step = 8 << CHROMA_V_SHIFT;

            for( int i = 16; i < 3*16; i += step )

                for( int j = i; j < i+4; j++ )

                    x264_cavlc_block_residual( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 );

        }

    }

#if !RDO_SKIP_BS

    h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;

#endif

}

2.NAL打包：

前面所说的压缩编码过程已经把所有的宏块循环完毕，实现了VCL编码。进行NAL打包是为了增强码流的健壮性，适应网络传输。VCL编码加上NAL头信息就组成完整的NAL单元，输出文件。

这部分的代码位于函数x264_encoder_encode（）中，调用了函数x264_encoder_frame_end（）。

x264_encoder_frame_end（）：在编码结束后做一些后续处理，比如说加上起始码，封装NALU。

/* Finish a frame after its slices have been encoded: encapsulate the NAL
 * units, fill in the output picture, update rate control, append filler if
 * required, and accumulate/print statistics.
 *
 * h:              encoder context of the frame being finished.
 * thread_current: context that receives the recycled fenc and a copy of the
 *                 statistics for use by the next frame.
 * pp_nal:         out - set to the array of output NAL units (h->out.nal).
 * pi_nal:         out - set to the number of output NAL units.
 * pic_out:        out - properties of the output picture; its image planes
 *                 point into the reconstructed frame h->fdec.
 *
 * Returns the frame size in bytes, 0 when there are no NAL units to output,
 * or -1 on failure. */

static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,

                                   x264_nal_t **pp_nal, int *pi_nal,

                                   x264_picture_t *pic_out )

{

    char psz_message[80];

    /* Without sliced threads, wait for the worker handling this context. */

    if( !h->param.b_sliced_threads && h->b_thread_active )

    {

        h->b_thread_active = 0;

        if( (intptr_t)x264_threadpool_wait( h->threadpool, h ) )

            return -1;

    }

    /* No NAL units were produced: report an AUTO-typed picture and stop. */

    if( !h->out.i_nal )

    {

        pic_out->i_type = X264_TYPE_AUTO;

        return 0;

    }

    x264_emms();

    /* generate buffering period sei and insert it into place */

    if( h->i_thread_frames > 1 && h->fenc->b_keyframe && h->sps->vui.b_nal_hrd_parameters_present )

    {

        x264_hrd_fullness( h );

        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );

        x264_sei_buffering_period_write( h, &h->out.bs );

        if( x264_nal_end( h ) )

           return -1;

        /* buffering period sei must follow AUD, SPS and PPS and precede all other SEIs */

        int idx = 0;

        while( h->out.nal[idx].i_type == NAL_AUD ||

               h->out.nal[idx].i_type == NAL_SPS ||

               h->out.nal[idx].i_type == NAL_PPS )

            idx++;

        /* The new SEI is the last NAL; shift [idx, last) up by one and drop it in at idx. */

        x264_nal_t nal_tmp = h->out.nal[h->out.i_nal-1];

        memmove( &h->out.nal[idx+1], &h->out.nal[idx], (h->out.i_nal-idx-1)*sizeof(x264_nal_t) );

        h->out.nal[idx] = nal_tmp;

    }

    /* Encapsulate all NAL units of this frame, e.g. prepend the
     * 0x00000001 start code to each NALU. */

    int frame_size = x264_encoder_encapsulate_nals( h, 0 );

    if( frame_size < 0 )

        return -1;

    /* Set output picture properties */

    /* pic_out is an x264_picture_t, the structure libx264 exposes publicly;
     * fenc and fdec are x264_frame_t, libx264's internal frame structure. */

    pic_out->i_type = h->fenc->i_type;

    pic_out->b_keyframe = h->fenc->b_keyframe;

    pic_out->i_pic_struct = h->fenc->i_pic_struct;

    pic_out->i_pts = h->fdec->i_pts;

    pic_out->i_dts = h->fdec->i_dts;

    if( pic_out->i_pts < pic_out->i_dts )

        x264_log( h, X264_LOG_WARNING, "invalid DTS: PTS is less than DTS\n" );

    pic_out->opaque = h->fenc->opaque;

    pic_out->img.i_csp = h->fdec->i_csp;

#if HIGH_BIT_DEPTH

    pic_out->img.i_csp |= X264_CSP_HIGH_DEPTH;

#endif

    pic_out->img.i_plane = h->fdec->i_plane;

    /* Image data: strides are converted to bytes; planes alias fdec (no copy). */

    for( int i = 0; i < pic_out->img.i_plane; i++ )

    {

        pic_out->img.i_stride[i] = h->fdec->i_stride[i] * sizeof(pixel);

        pic_out->img.plane[i] = (uint8_t*)h->fdec->plane[i];

    }

    /* Recycle the used source frame fenc */

    x264_frame_push_unused( thread_current, h->fenc );

    /* ---------------------- Update encoder state ------------------------- */

    /* update rc */

    int filler = 0;

    if( x264_ratecontrol_end( h, frame_size * 8, &filler ) < 0 )

        return -1;

    pic_out->hrd_timing = h->fenc->hrd_timing;

    pic_out->prop.f_crf_avg = h->fdec->f_crf_avg;

    /* Filler in AVC-Intra mode is written as zero bytes to the last slice

     * We don't know the size of the last slice until encapsulation so we add filler to the encapsulated NAL */

    if( h->param.i_avcintra_class )

    {

        x264_t *h0 = h->thread[0];

        int ret = x264_check_encapsulated_buffer( h, h0, h->out.i_nal, frame_size, frame_size + filler );

        if( ret < 0 )

            return -1;

        memset( h->out.nal[0].p_payload + frame_size, 0, filler );

        h->out.nal[h->out.i_nal-1].i_payload += filler;

        h->out.nal[h->out.i_nal-1].i_padding = filler;

        frame_size += filler;

    }

    else

    {

        /* Emit filler NAL units until the requested amount is consumed; each
         * one is capped by i_slice_max_size when that limit is set. */

        while( filler > 0 )

        {

            int f, overhead;

            overhead = (FILLER_OVERHEAD - h->param.b_annexb);

            if( h->param.i_slice_max_size && filler > h->param.i_slice_max_size )

            {

                int next_size = filler - h->param.i_slice_max_size;

                int overflow = X264_MAX( overhead - next_size, 0 );

                f = h->param.i_slice_max_size - overhead - overflow;

            }

            else

                f = X264_MAX( 0, filler - overhead );

            if( x264_bitstream_check_buffer_filler( h, f ) )

                return -1;

            x264_nal_start( h, NAL_FILLER, NAL_PRIORITY_DISPOSABLE );

            x264_filler_write( h, &h->out.bs, f );

            if( x264_nal_end( h ) )

                return -1;

            /* Encapsulate only the filler NAL that was just appended. */

            int total_size = x264_encoder_encapsulate_nals( h, h->out.i_nal-1 );

            if( total_size < 0 )

                return -1;

            frame_size += total_size;

            filler -= total_size;

        }

    }

    /* End bitstream, set output  */

    *pi_nal = h->out.i_nal;

    *pp_nal = h->out.nal;

    h->out.i_nal = 0;

    x264_noise_reduction_update( h );

    /* ---------------------- Compute/Print statistics --------------------- */

    x264_thread_sync_stat( h, h->thread[0] );

    /* Slice stat */

    /* h->stat holds the running statistics, indexed by slice type. */

    /* Frame count +1 (per slice type) */

    h->stat.i_frame_count[h->sh.i_type]++;

    /* Frame size */

    h->stat.i_frame_size[h->sh.i_type] += frame_size;

    h->stat.f_frame_qp[h->sh.i_type] += h->fdec->f_qp_avg_aq;

    /* Add this frame's per-type MB counts to the running totals */

    for( int i = 0; i < X264_MBTYPE_MAX; i++ )

        h->stat.i_mb_count[h->sh.i_type][i] += h->stat.frame.i_mb_count[i];

    for( int i = 0; i < X264_PARTTYPE_MAX; i++ )

        h->stat.i_mb_partition[h->sh.i_type][i] += h->stat.frame.i_mb_partition[i];

    for( int i = 0; i < 2; i++ )

        h->stat.i_mb_count_8x8dct[i] += h->stat.frame.i_mb_count_8x8dct[i];

    for( int i = 0; i < 6; i++ )

        h->stat.i_mb_cbp[i] += h->stat.frame.i_mb_cbp[i];

    for( int i = 0; i < 4; i++ )

        for( int j = 0; j < 13; j++ )

            h->stat.i_mb_pred_mode[i][j] += h->stat.frame.i_mb_pred_mode[i][j];

    if( h->sh.i_type != SLICE_TYPE_I )

        for( int i_list = 0; i_list < 2; i_list++ )

            for( int i = 0; i < X264_REF_MAX*2; i++ )

                h->stat.i_mb_count_ref[h->sh.i_type][i_list][i] += h->stat.frame.i_mb_count_ref[i_list][i];

    for( int i = 0; i < 3; i++ )

        h->stat.i_mb_field[i] += h->stat.frame.i_mb_field[i];

    /* Count P frames that used weighted prediction: index 0 for weight[0][0],
     * index 1 for weight[0][1] or weight[0][2]. */

    if( h->sh.i_type == SLICE_TYPE_P && h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE )

    {

        h->stat.i_wpred[0] += !!h->sh.weight[0][0].weightfn;

        h->stat.i_wpred[1] += !!h->sh.weight[0][1].weightfn || !!h->sh.weight[0][2].weightfn;

    }

    if( h->sh.i_type == SLICE_TYPE_B )

    {

        h->stat.i_direct_frames[ h->sh.b_direct_spatial_mv_pred ] ++;

        if( h->mb.b_direct_auto_write )

        {

            //FIXME somewhat arbitrary time constants

            if( h->stat.i_direct_score[0] + h->stat.i_direct_score[1] > h->mb.i_mb_count )

                for( int i = 0; i < 2; i++ )

                    h->stat.i_direct_score[i] = h->stat.i_direct_score[i] * 9/10;

            for( int i = 0; i < 2; i++ )

                h->stat.i_direct_score[i] += h->stat.frame.i_direct_score[i];

        }

    }

    else

        h->stat.i_consecutive_bframes[h->fenc->i_bframes]++;

    psz_message[0] = '\0';

    double dur = h->fenc->f_duration;

    h->stat.f_frame_duration[h->sh.i_type] += dur;

    /* PSNR requested */

    if( h->param.analyse.b_psnr )

    {

        /* SSD (Sum of Squared Differences) per plane */

        int64_t ssd[3] =

        {

            h->stat.frame.i_ssd[0],

            h->stat.frame.i_ssd[1],

            h->stat.frame.i_ssd[2],

        };

        int luma_size = h->param.i_width * h->param.i_height;

        int chroma_size = CHROMA_SIZE( luma_size );

        /* The SSD values were already accumulated during the filtering stage;
         * x264_psnr() merely converts SSD to PSNR. */

        pic_out->prop.f_psnr[0] = x264_psnr( ssd[0], luma_size );

        pic_out->prop.f_psnr[1] = x264_psnr( ssd[1], chroma_size );

        pic_out->prop.f_psnr[2] = x264_psnr( ssd[2], chroma_size );

        /* PSNR over all three planes combined */

        pic_out->prop.f_psnr_avg = x264_psnr( ssd[0] + ssd[1] + ssd[2], luma_size + chroma_size*2 );

        /* The "mean" statistics are accumulated, weighted by frame duration */

        h->stat.f_ssd_global[h->sh.i_type]   += dur * (ssd[0] + ssd[1] + ssd[2]);

        h->stat.f_psnr_average[h->sh.i_type] += dur * pic_out->prop.f_psnr_avg;

        h->stat.f_psnr_mean_y[h->sh.i_type]  += dur * pic_out->prop.f_psnr[0];

        h->stat.f_psnr_mean_u[h->sh.i_type]  += dur * pic_out->prop.f_psnr[1];

        h->stat.f_psnr_mean_v[h->sh.i_type]  += dur * pic_out->prop.f_psnr[2];

        snprintf( psz_message, 80, " PSNR Y:%5.2f U:%5.2f V:%5.2f", pic_out->prop.f_psnr[0],

                                                                    pic_out->prop.f_psnr[1],

                                                                    pic_out->prop.f_psnr[2] );

    }

    /* SSIM requested */

    if( h->param.analyse.b_ssim )

    {

        /* The SSIM sum and block count were already accumulated during the filtering stage */

        pic_out->prop.f_ssim = h->stat.frame.f_ssim / h->stat.frame.i_ssim_cnt;

        /* The "mean" statistics are accumulated, weighted by frame duration */

        h->stat.f_ssim_mean_y[h->sh.i_type] += pic_out->prop.f_ssim * dur;

        snprintf( psz_message + strlen(psz_message), 80 - strlen(psz_message),

                  " SSIM Y:%.5f", pic_out->prop.f_ssim );

    }

    psz_message[79] = '\0';

    /* Per-frame summary, emitted at debug log level */

    x264_log( h, X264_LOG_DEBUG,

                  "frame=%4d QP=%.2f NAL=%d Slice:%c Poc:%-3d I:%-4d P:%-4d SKIP:%-4d size=%d bytes%s\n",

              h->i_frame,

              h->fdec->f_qp_avg_aq,

              h->i_nal_ref_idc,

              h->sh.i_type == SLICE_TYPE_I ? 'I' : (h->sh.i_type == SLICE_TYPE_P ? 'P' : 'B' ),

              h->fdec->i_poc,

              h->stat.frame.i_mb_count_i,

              h->stat.frame.i_mb_count_p,

              h->stat.frame.i_mb_count_skip,

              frame_size,

              psz_message );

    // keep stats all in one place

    x264_thread_sync_stat( h->thread[0], h );

    // for the use of the next frame

    x264_thread_sync_stat( thread_current, h );

#ifdef DEBUG_MB_TYPE

{

    /* Debug aid: print one character per macroblock type, one MB row per line. */

    static const char mb_chars[] = { 'i', 'i', 'I', 'C', 'P', '8', 'S',

        'D', '<', 'X', 'B', 'X', '>', 'B', 'B', 'B', 'B', '8', 'S' };

    for( int mb_xy = 0; mb_xy < h->mb.i_mb_width * h->mb.i_mb_height; mb_xy++ )

    {

        if( h->mb.type[mb_xy] < X264_MBTYPE_MAX && h->mb.type[mb_xy] >= 0 )

            fprintf( stderr, "%c ", mb_chars[ h->mb.type[mb_xy] ] );

        else

            fprintf( stderr, "? " );

        if( (mb_xy+1) % h->mb.i_mb_width == 0 )

            fprintf( stderr, "\n" );

    }

}

#endif

    /* Remove duplicates, must be done near the end as breaks h->fref0 array

     * by freeing some of its pointers. */

    for( int i = 0; i < h->i_ref[0]; i++ )

        if( h->fref[0][i] && h->fref[0][i]->b_duplicate )

        {

            x264_frame_push_blank_unused( h, h->fref[0][i] );

            h->fref[0][i] = 0;

        }

    if( h->param.psz_dump_yuv )

        x264_frame_dump( h );

    x264_emms();

    return frame_size;

}

x264_encoder_frame_end（）中封装NALU调用了函数x264_encoder_encapsulate_nals（）。

x264_encoder_encapsulate_nals（）：

/* Encapsulate the NAL units h->out.nal[start .. i_nal-1] into the shared
 * output buffer h->thread[0]->nal_buffer, directly after the bytes already
 * occupied by NAL units [0, start).
 *
 * Each unit is passed through x264_nal_encode(), which adds the start code
 * (or length prefix) and the NAL header. In AVC-Intra mode the unit is then
 * zero-padded by whatever remains of its i_padding budget.
 *
 * Returns the number of bytes produced for the encapsulated units, the plain
 * sum of their payload sizes when h->param.nalu_process is set (nothing is
 * written in that case), or -1 if the output buffer check fails. */
static int x264_encoder_encapsulate_nals( x264_t *h, int start )
{
    x264_t *h0 = h->thread[0];

    /* Raw payload bytes of the units that still have to be encapsulated. */
    int pending_bytes = 0;
    for( int i = start; i < h->out.i_nal; i++ )
        pending_bytes += h->out.nal[i].i_payload;

    if( h->param.nalu_process )
        return pending_bytes;

    /* Bytes taken by the units before 'start', plus the requested padding. */
    int done_bytes = 0;
    for( int i = 0; i < start; i++ )
        done_bytes += h->out.nal[i].i_payload;

    int padding_bytes = 0;
    for( int i = start; i < h->out.i_nal; i++ )
        padding_bytes += h->out.nal[i].i_padding;

    /* Worst-case NAL unit escaping: reallocate the buffer if it's too small. */
    int necessary_size = done_bytes + pending_bytes * 3/2 + h->out.i_nal * 4 + 4 + 64
                       + padding_bytes;
    if( x264_check_encapsulated_buffer( h, h0, start, done_bytes, necessary_size ) )
        return -1;

    uint8_t *const out_base = h0->nal_buffer + done_bytes;
    uint8_t *out = out_base;

    /* Process the NAL units one at a time. */
    for( int i = start; i < h->out.i_nal; i++ )
    {
        x264_nal_t *nal = &h->out.nal[i];
        const int raw_len = nal->i_payload;

        /* Long start code for the first unit, SPS, PPS, and always in AVC-Intra. */
        nal->b_long_startcode = !i || nal->i_type == NAL_SPS || nal->i_type == NAL_PPS ||
                                h->param.i_avcintra_class;

        /* Adds the start code / length prefix and the NAL header. */
        x264_nal_encode( h, out, nal );
        out += nal->i_payload;

        if( !h->param.i_avcintra_class )
            continue;

        /* Growth beyond raw_len + NALU_OVERHEAD is charged to the padding budget. */
        int pad = nal->i_padding - ( nal->i_payload - (raw_len + NALU_OVERHEAD) );
        if( pad > 0 )
        {
            memset( out, 0, pad );
            out += pad;
            nal->i_payload += pad;
        }
        nal->i_padding = X264_MAX( pad, 0 );
    }

    x264_emms();

    return out - out_base;
}

其内部又调用了另一个函数x264_nal_encode（）逐个给一帧数据中的各个NALU添加起始码以及NALU Header。

x264_nal_encode（）：

/* Write one NAL unit to dst in its final form.
 *
 * Layout: a prefix, one NAL header byte, then the payload run through
 * h->bsf.nal_escape(). With h->param.b_annexb the prefix is a start code
 * (00 00 00 01 when nal->b_long_startcode is set, otherwise 00 00 01);
 * without it the prefix is a 4-byte big-endian size that does not count
 * the prefix itself.
 *
 * On return nal->p_payload points at dst and nal->i_payload holds the byte
 * count from dst up to the end of the escaped payload. */
void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
{
    uint8_t *const out_start = dst;
    uint8_t *raw_begin = nal->p_payload;
    uint8_t *raw_end   = raw_begin + nal->i_payload;

    /* Prefix ============================================ */
    if( !h->param.b_annexb )
        dst += 4; /* mp4 style: save room for the size, filled in below */
    else
    {
        /* Annex B start code: two or three zero bytes followed by 0x01. */
        for( int zeros = nal->b_long_startcode ? 3 : 2; zeros > 0; zeros-- )
            *dst++ = 0x00;
        *dst++ = 0x01;
    }

    /* NALU header ======================================= */
    /* Top bit zero, i_ref_idc shifted left by 5, OR-ed with i_type. */
    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;

    dst = h->bsf.nal_escape( dst, raw_begin, raw_end );

    int size = (dst - out_start) - 4;

    /* Write the size header for mp4/etc: go back to the reserved prefix and
     * store the size, most significant byte first. The size doesn't include
     * the size of the header being written. */
    if( !h->param.b_annexb )
        for( int k = 0; k < 4; k++ )
            out_start[k] = size >> (24 - 8*k);

    /* Payload now refers to the encapsulated unit, prefix included. */
    nal->p_payload = out_start;
    nal->i_payload = size+4;

    x264_emms();
}

添加过程：

（1）annexb模式：在每个NALU前面添加起始码：b_long_startcode为1时（一帧的第一个NALU、SPS、PPS或AVC-Intra模式）添加4字节的0x00000001，否则添加3字节的0x000001；

（2）mp4模式：先计算NALU的长度（不包含前四个字节），再将长度信息写入NALU前面的四个字节；

添加过程分两种是因为H264码流格式有两种：

（1）annexb模式：在这个模式下，每个NALU包含起始码0x00000001，SPS、PPS存储在码流中，最常见的H264裸流就是这种；

（2）mp4模式：这种模式下，每个NALU不包含起始码，原本存放起始码的前4个字节存储的是NALU的长度；SPS、PPS单独放在容器的其他位置上。这种H264码流一般存储在容器中，比如mp4。

巴特西

X264-编码模块和NAL打包输出

最新文章

热门文章