/******************************************************************** * * * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * * * * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2011 * * by the Xiph.Org Foundation http://www.xiph.org/ * * * ******************************************************************** function: mode selection code last mod: $Id$ ********************************************************************/ #include <stdio.h> #include <limits.h> #include <math.h> #include <string.h> #include "collect.h" #if defined(OC_COLLECT_METRICS) int OC_HAS_MODE_METRICS; double OC_MODE_RD_WEIGHT_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS]; double OC_MODE_RD_WEIGHT_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS]; oc_mode_metrics OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS]; oc_mode_metrics OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS]; const char *OC_MODE_METRICS_FILENAME="modedec.stats"; void oc_mode_metrics_add(oc_mode_metrics *_metrics, double _w,int _s,int _q,int _r,double _d){ if(_metrics->w>0){ double ds; double dq; double dr; double dd; double ds2; double dq2; double s2; double sq; double q2; double sr; double qr; double sd; double qd; double s2q; double sq2; double w; double wa; double rwa; double rwa2; double rwb; double rwb2; double rw2; double rw3; double rw4; wa=_metrics->w; ds=_s-_metrics->s/wa; dq=_q-_metrics->q/wa; dr=_r-_metrics->r/wa; dd=_d-_metrics->d/wa; ds2=ds*ds; dq2=dq*dq; s2=_metrics->s2; sq=_metrics->sq; q2=_metrics->q2; sr=_metrics->sr; qr=_metrics->qr; sd=_metrics->sd; qd=_metrics->qd; s2q=_metrics->s2q; sq2=_metrics->sq2; w=wa+_w; rwa=wa/w; rwb=_w/w; rwa2=rwa*rwa; rwb2=rwb*rwb; rw2=wa*rwb; rw3=rw2*(rwa2-rwb2); rw4=_w*rwa2*rwa2+wa*rwb2*rwb2; _metrics->s2q2+=-2*(ds*sq2+dq*s2q)*rwb +(ds2*q2+4*ds*dq*sq+dq2*s2)*rwb2+ds2*dq2*rw4; _metrics->s2q+=(-2*ds*sq-dq*s2)*rwb+ds2*dq*rw3; _metrics->sq2+=(-ds*q2-2*dq*sq)*rwb+ds*dq2*rw3; _metrics->sqr+=(-ds*qr-dq*sr-dr*sq)*rwb+ds*dq*dr*rw3; _metrics->sqd+=(-ds*qd-dq*sd-dd*sq)*rwb+ds*dq*dd*rw3; _metrics->s2+=ds2*rw2; _metrics->sq+=ds*dq*rw2; _metrics->q2+=dq2*rw2; _metrics->sr+=ds*dr*rw2; _metrics->qr+=dq*dr*rw2; _metrics->r2+=dr*dr*rw2; _metrics->sd+=ds*dd*rw2; _metrics->qd+=dq*dd*rw2; _metrics->d2+=dd*dd*rw2; } _metrics->w+=_w; _metrics->s+=_s*_w; _metrics->q+=_q*_w; _metrics->r+=_r*_w; _metrics->d+=_d*_w; } void oc_mode_metrics_merge(oc_mode_metrics *_dst, const oc_mode_metrics *_src,int _n){ int i; /*Find a non-empty set of metrics.*/ for(i=0;i<_n&&_src[i].w==0;i++); if(i>=_n){ memset(_dst,0,sizeof(*_dst)); return; } memcpy(_dst,_src+i,sizeof(*_dst)); /*And iterate over the remaining non-empty sets of metrics.*/ for(i++;i<_n;i++)if(_src[i].w!=0){ double ds; double dq; double dr; double dd; double ds2; double dq2; double s2a; double s2b; double sqa; double sqb; double q2a; double q2b; double sra; double srb; double qra; double qrb; double sda; double sdb; double qda; double qdb; double s2qa; double s2qb; double sq2a; double sq2b; double w; double wa; double wb; double rwa; double rwb; double rwa2; double rwb2; double rw2; double rw3; double rw4; wa=_dst->w; wb=_src[i].w; ds=_src[i].s/wb-_dst->s/wa; dq=_src[i].q/wb-_dst->q/wa; dr=_src[i].r/wb-_dst->r/wa; dd=_src[i].d/wb-_dst->d/wa; ds2=ds*ds; dq2=dq*dq; s2a=_dst->s2; sqa=_dst->sq; q2a=_dst->q2; sra=_dst->sr; qra=_dst->qr; sda=_dst->sd; qda=_dst->qd; s2qa=_dst->s2q; sq2a=_dst->sq2; s2b=_src[i].s2; sqb=_src[i].sq; q2b=_src[i].q2; srb=_src[i].sr; qrb=_src[i].qr; sdb=_src[i].sd; qdb=_src[i].qd; s2qb=_src[i].s2q; sq2b=_src[i].sq2; w=wa+wb; if(w==0)rwa=rwb=0; else{ rwa=wa/w; rwb=wb/w; } rwa2=rwa*rwa; rwb2=rwb*rwb; rw2=wa*rwb; rw3=rw2*(rwa2-rwb2); rw4=wb*rwa2*rwa2+wa*rwb2*rwb2; /* (1,1,1) -> (0,0,0)# (1,0,0) C(1,1)*C(1,0)*C(1,0)-> d^{1,0,0}*(rwa*B_{0,1,1}-rwb*A_{0,1,1}) (0,1,0) C(1,0)*C(1,1)*C(1,0)-> d^{0,1,0}*(rwa*B_{1,0,1}-rwb*A_{1,0,1}) (0,0,1) C(1,0)*C(1,0)*C(1,1)-> d^{0,0,1}*(rwa*B_{1,1,0}-rwb*A_{1,1,0}) (1,1,0)* (1,0,1)* (0,1,1)* (1,1,1) C(1,1)*C(1,1)*C(1,1)-> d^{1,1,1}*(rwa^3*wb-rwb^3*wa) (2,1) -> (0,0)# (1,0) C(2,1)*C(1,1)->2*d^{1,0}*(rwa*B_{1,1}-rwb*A_{1,1}) (0,1) C(2,0)*C(1,1)-> d^{0,1}*(rwa*B_{2,0}-rwb*A_{2,0}) (2,0)* (1,1)* (2,1) C(2,2)*C(1,1)-> d^{2,1}*(rwa^3*wb-rwb^3*wa) (2,2) -> (0,0)# (1,0) C(2,1)*C(2,0)->2*d^{1,0}*(rwa*B_{1,2}-rwb*A_{1,2}) (0,1) C(2,0)*C(2,1)->2*d^{0,1}*(rwa*B_{2,1}-rwb*A_{2,1}) (2,0) C(2,2)*C(2,0)-> d^{2,0}*(rwa^2*B_{0,2}+rwb^2*A_{0,2}) (1,1) C(2,1)*C(2,1)->4*d^{1,1}*(rwa^2*B_{1,1}+rwb^2*A_{1,1}) (0,2) C(2,0)*C(2,2)-> d^{0,2}*(rwa^2*B_{2,0}+rwb^2*A_{2,0}) (1,2)* (2,1)* (2,2) C(2,2)*C(2,2)*d^{2,2}*(rwa^4*wb+rwb^4*wa) */ _dst->s2q2+=_src[i].s2q2+2*(ds*(rwa*sq2b-rwb*sq2a)+dq*(rwa*s2qb-rwb*s2qa)) +ds2*(rwa2*q2b+rwb2*q2a)+4*ds*dq*(rwa2*sqb+rwb2*sqa) +dq2*(rwa2*s2b+rwb2*s2a)+ds2*dq2*rw4; _dst->s2q+=_src[i].s2q+2*ds*(rwa*sqb-rwb*sqa) +dq*(rwa*s2b-rwb*s2a)+ds2*dq*rw3; _dst->sq2+=_src[i].sq2+ds*(rwa*q2b-rwb*q2a) +2*dq*(rwa*sqb-rwb*sqa)+ds*dq2*rw3; _dst->sqr+=_src[i].sqr+ds*(rwa*qrb-rwb*qra)+dq*(rwa*srb-rwb*sra) +dr*(rwa*sqb-rwb*sqa)+ds*dq*dr*rw3; _dst->sqd+=_src[i].sqd+ds*(rwa*qdb-rwb*qda)+dq*(rwa*sdb-rwb*sda) +dd*(rwa*sqb-rwb*sqa)+ds*dq*dd*rw3; _dst->s2+=_src[i].s2+ds2*rw2; _dst->sq+=_src[i].sq+ds*dq*rw2; _dst->q2+=_src[i].q2+dq2*rw2; _dst->sr+=_src[i].sr+ds*dr*rw2; _dst->qr+=_src[i].qr+dq*dr*rw2; _dst->r2+=_src[i].r2+dr*dr*rw2; _dst->sd+=_src[i].sd+ds*dd*rw2; _dst->qd+=_src[i].qd+dq*dd*rw2; _dst->d2+=_src[i].d2+dd*dd*rw2; _dst->w+=_src[i].w; _dst->s+=_src[i].s; _dst->q+=_src[i].q; _dst->r+=_src[i].r; _dst->d+=_src[i].d; } } /*Adjust a single corner of a set of metric bins to minimize the squared prediction error of R and D. Each bin is assumed to cover a quad like so: (s0,q0) (s1,q0) A----------B | | | | | | | | C----------Z (s0,q1) (s1,q1) The values A, B, and C are fixed, and Z is the free parameter. Then, for example, R_i is predicted via bilinear interpolation as x_i=(s_i-s0)/(s1-s0) y_i=(q_i-q0)/(q1-q0) dRds1_i=A+(B-A)*x_i dRds2_i=C+(Z-C)*x_i R_i=dRds1_i+(dRds2_i-dRds1_i)*y_i To find the Z that minimizes the squared prediction error over i, this can be rewritten as R_i-(A+(B-A)*x_i+(C-A)*y_i+(A-B-C)*x_i*y_i)=x_i*y_i*Z Letting X={...,x_i*y_i,...}^T and Y={...,R_i-(A+(B-A)*x_i+(C-A)*y_i+(A-B-C)*x_i*y_i),...}^T, the optimal Z is given by Z=(X^T.Y)/(X^T.X). Now, we need to compute these dot products without actually storing data for each sample. Starting with X^T.X, we have X^T.X = sum(x_i^2*y_i^2) = sum((s_i-s0)^2*(q_i-q0)^2)/((s1-s0)^2*(q1-q0)^2). Expanding the interior of the sum in a monomial basis of s_i and q_i gives s0^2*q0^2 *(1) -2*s0*q0^2*(s_i) -2*s0^2*q0*(q_i) +q0^2 *(s_i^2) +4*s0*q0 *(s_i*q_i) +s0^2 *(q_i^2) -2*q0 *(s_i^2*q_i) -2*s0 *(s_i*q_i^2) +1 *(s_i^2*q_i^2). However, computing things directly in this basis leads to gross numerical errors, as most of the terms will have similar size and destructive cancellation results. A much better basis is the central (co-)moment basis: {1,s_i-sbar,q_i-qbar,(s_i-sbar)^2,(s_i-sbar)*(q_i-qbar),(q_i-qbar)^2, (s_i-sbar)^2*(q_i-qbar),(s_i-sbar)*(q_i-qbar)^2,(s_i-sbar)^2*(q_i-qbar)^2}, where sbar and qbar are the average s and q values over the bin, respectively. In that basis, letting ds=sbar-s0 and dq=qbar-q0, (s_i-s0)^2*(q_i-q0)^2 is ds^2*dq^2*(1) +dq^2 *((s_i-sbar)^2) +4*ds*dq*((s_i-sbar)*(q_i-qbar)) +ds^2 *((q_i-qbar)^2) +2*dq *((s_i-sbar)^2*(q_i-qbar)) +2*ds *((s_i-sbar)*(q_i-qbar)^2) +1 *((s_i-sbar)^2*(q_i-qbar)^2). With these expressions in the central (co-)moment bases, all we need to do is compute sums over the (co-)moment terms, which can be done incrementally (see oc_mode_metrics_add() and oc_mode_metrics_merge()), with no need to store the individual samples. Now, for X^T.Y, we have X^T.Y = sum((R_i-A-((B-A)/(s1-s0))*(s_i-s0)-((C-A)/(q1-q0))*(q_i-q0) -((A-B-C)/((s1-s0)*(q1-q0)))*(s_i-s0)*(q_i-q0))*(s_i-s0)*(q_i-q0))/ ((s1-s0)*(q1-q0)), or, rewriting the constants to simplify notation, X^T.Y = sum((C0+C1*(s_i-s0)+C2*(q_i-q0) +C3*(s_i-s0)*(q_i-q0)+R_i)*(s_i-s0)*(q_i-q0))/((s1-s0)*(q1-q0)). Again, converting to the central (co-)moment basis, the interior of the above sum is ds*dq*(rbar+C0+C1*ds+C2*dq+C3*ds*dq) *(1) +(C1*dq+C3*dq^2) *((s_i-sbar)^2) +(rbar+C0+2*C1*ds+2*C2*dq+4*C3*ds*dq)*((s_i-sbar)*(q_i-qbar)) +(C2*ds+C3*ds^2) *((q_i-qbar)^2) +dq *((s_i-sbar)*(r_i-rbar)) +ds *((q_i-qbar)*(r_i-rbar)) +(C1+2*C3*dq) *((s_i-sbar)^2*(q_i-qbar)) +(C2+2*C3*ds) *((s_i-sbar)*(q_i-qbar)^2) +1 *((s_i-sbar)*(q_i-qbar)*(r_i-rbar)) +C3 *((s_i-sbar)^2*(q_i-qbar)^2). You might think it would be easier (if perhaps slightly less robust) to accumulate terms directly around s0 and q0. However, we update each corner of the bins in turn, so we would have to change basis to move the sums from corner to corner anyway.*/ double oc_mode_metrics_solve(double *_r,double *_d, const oc_mode_metrics *_metrics,const int *_s0,const int *_s1, const int *_q0,const int *_q1, const double *_ra,const double *_rb,const double *_rc, const double *_da,const double *_db,const double *_dc,int _n){ double xx; double rxy; double dxy; double wt; int i; xx=rxy=dxy=wt=0; for(i=0;i<_n;i++)if(_metrics[i].w>0){ double s10; double q10; double sq10; double ds; double dq; double ds2; double dq2; double r; double d; double s2; double sq; double q2; double sr; double qr; double sd; double qd; double s2q; double sq2; double sqr; double sqd; double s2q2; double c0; double c1; double c2; double c3; double w; w=_metrics[i].w; wt+=w; s10=_s1[i]-_s0[i]; q10=_q1[i]-_q0[i]; sq10=s10*q10; ds=_metrics[i].s/w-_s0[i]; dq=_metrics[i].q/w-_q0[i]; ds2=ds*ds; dq2=dq*dq; s2=_metrics[i].s2; sq=_metrics[i].sq; q2=_metrics[i].q2; s2q=_metrics[i].s2q; sq2=_metrics[i].sq2; s2q2=_metrics[i].s2q2; xx+=(dq2*(ds2*w+s2)+4*ds*dq*sq+ds2*q2+2*(dq*s2q+ds*sq2)+s2q2)/(sq10*sq10); r=_metrics[i].r/w; sr=_metrics[i].sr; qr=_metrics[i].qr; sqr=_metrics[i].sqr; c0=-_ra[i]; c1=-(_rb[i]-_ra[i])/s10; c2=-(_rc[i]-_ra[i])/q10; c3=-(_ra[i]-_rb[i]-_rc[i])/sq10; rxy+=(ds*dq*(r+c0+c1*ds+c2*dq+c3*ds*dq)*w+(c1*dq+c3*dq2)*s2 +(r+c0+2*(c1*ds+(c2+2*c3*ds)*dq))*sq+(c2*ds+c3*ds2)*q2+dq*sr+ds*qr +(c1+2*c3*dq)*s2q+(c2+2*c3*ds)*sq2+sqr+c3*s2q2)/sq10; d=_metrics[i].d/w; sd=_metrics[i].sd; qd=_metrics[i].qd; sqd=_metrics[i].sqd; c0=-_da[i]; c1=-(_db[i]-_da[i])/s10; c2=-(_dc[i]-_da[i])/q10; c3=-(_da[i]-_db[i]-_dc[i])/sq10; dxy+=(ds*dq*(d+c0+c1*ds+c2*dq+c3*ds*dq)*w+(c1*dq+c3*dq2)*s2 +(d+c0+2*(c1*ds+(c2+2*c3*ds)*dq))*sq+(c2*ds+c3*ds2)*q2+dq*sd+ds*qd +(c1+2*c3*dq)*s2q+(c2+2*c3*ds)*sq2+sqd+c3*s2q2)/sq10; } if(xx>1E-3){ *_r=rxy/xx; *_d=dxy/xx; } else{ *_r=0; *_d=0; } return wt; } /*Compile collected SATD/logq/rate/RMSE metrics into a form that's immediately useful for mode decision.*/ void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS], int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS], int _shift,double (*_weight)[3][2][OC_COMP_BINS]){ int niters; int prevdr; int prevdd; int dr; int dd; int pli; int qti; int qi; int si; dd=dr=INT_MAX; niters=0; /*The encoder interpolates rate and RMSE terms bilinearly from an OC_LOGQ_BINS by OC_COMP_BINS grid of sample points in _table. To find the sample values at the grid points that minimize the total squared prediction error actually requires solving a relatively sparse linear system with a number of variables equal to the number of grid points. Instead of writing a general sparse linear system solver, we just use Gauss-Seidel iteration, i.e., we update one grid point at time until they stop changing.*/ do{ prevdr=dr; prevdd=dd; dd=dr=0; for(pli=0;pli<3;pli++){ for(qti=0;qti<2;qti++){ for(qi=0;qi<OC_LOGQ_BINS;qi++){ for(si=0;si<OC_COMP_BINS;si++){ oc_mode_metrics m[4]; int s0[4]; int s1[4]; int q0[4]; int q1[4]; double ra[4]; double rb[4]; double rc[4]; double da[4]; double db[4]; double dc[4]; double r; double d; int rate; int rmse; int ds; int n; n=0; /*Collect the statistics for the (up to) four bins grid point (si,qi) touches.*/ if(qi>0&&si>0){ q0[n]=OC_MODE_LOGQ[qi-1][pli][qti]; q1[n]=OC_MODE_LOGQ[qi][pli][qti]; s0[n]=si-1<<_shift; s1[n]=si<<_shift; ra[n]=ldexp(_table[qi-1][pli][qti][si-1].rate,-OC_BIT_SCALE); da[n]=ldexp(_table[qi-1][pli][qti][si-1].rmse,-OC_RMSE_SCALE); rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE); db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE); rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE); dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE); *(m+n++)=*(_metrics[qi-1][pli][qti]+si-1); } if(qi>0){ ds=si+1<OC_COMP_BINS?1:-1; q0[n]=OC_MODE_LOGQ[qi-1][pli][qti]; q1[n]=OC_MODE_LOGQ[qi][pli][qti]; s0[n]=si+ds<<_shift; s1[n]=si<<_shift; ra[n]=ldexp(_table[qi-1][pli][qti][si+ds].rate,-OC_BIT_SCALE); da[n]= ldexp(_table[qi-1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE); rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE); db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE); rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE); dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE); *(m+n++)=*(_metrics[qi-1][pli][qti]+si); } if(qi+1<OC_LOGQ_BINS&&si>0){ q0[n]=OC_MODE_LOGQ[qi+1][pli][qti]; q1[n]=OC_MODE_LOGQ[qi][pli][qti]; s0[n]=si-1<<_shift; s1[n]=si<<_shift; ra[n]=ldexp(_table[qi+1][pli][qti][si-1].rate,-OC_BIT_SCALE); da[n]=ldexp(_table[qi+1][pli][qti][si-1].rmse,-OC_RMSE_SCALE); rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE); db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE); rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE); dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE); *(m+n++)=*(_metrics[qi][pli][qti]+si-1); } if(qi+1<OC_LOGQ_BINS){ ds=si+1<OC_COMP_BINS?1:-1; q0[n]=OC_MODE_LOGQ[qi+1][pli][qti]; q1[n]=OC_MODE_LOGQ[qi][pli][qti]; s0[n]=si+ds<<_shift; s1[n]=si<<_shift; ra[n]=ldexp(_table[qi+1][pli][qti][si+ds].rate,-OC_BIT_SCALE); da[n]= ldexp(_table[qi+1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE); rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE); db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE); rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE); dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE); *(m+n++)=*(_metrics[qi][pli][qti]+si); } /*On the first pass, initialize with a simple weighted average of the neighboring bins.*/ if(!OC_HAS_MODE_METRICS&&niters==0){ double w; w=r=d=0; while(n-->0){ w+=m[n].w; r+=m[n].r; d+=m[n].d; } r=w>1E-3?r/w:0; d=w>1E-3?d/w:0; _weight[qi][pli][qti][si]=w; } else{ /*Update the grid point and save the weight for later.*/ _weight[qi][pli][qti][si]= oc_mode_metrics_solve(&r,&d,m,s0,s1,q0,q1,ra,rb,rc,da,db,dc,n); } rate=OC_CLAMPI(-32768,(int)(ldexp(r,OC_BIT_SCALE)+0.5),32767); rmse=OC_CLAMPI(-32768,(int)(ldexp(d,OC_RMSE_SCALE)+0.5),32767); dr+=abs(rate-_table[qi][pli][qti][si].rate); dd+=abs(rmse-_table[qi][pli][qti][si].rmse); _table[qi][pli][qti][si].rate=(ogg_int16_t)rate; _table[qi][pli][qti][si].rmse=(ogg_int16_t)rmse; } } } } } /*After a fixed number of initial iterations, only iterate so long as the total change is decreasing. This ensures we don't oscillate forever, which is a danger, as all of our results are rounded fairly coarsely.*/ while((dr>0||dd>0)&&(niters++<_niters_min||(dr<prevdr&&dd<prevdd))); if(_reweight){ /*Now, reduce the values of the optimal solution until we get enough samples in each bin to overcome the constant OC_ZWEIGHT factor. This encourages sampling under-populated bins and prevents a single large sample early on from discouraging coding in that bin ever again.*/ for(pli=0;pli<3;pli++){ for(qti=0;qti<2;qti++){ for(qi=0;qi<OC_LOGQ_BINS;qi++){ for(si=0;si<OC_COMP_BINS;si++){ double wt; wt=_weight[qi][pli][qti][si]; wt/=OC_ZWEIGHT+wt; _table[qi][pli][qti][si].rate=(ogg_int16_t) (_table[qi][pli][qti][si].rate*wt+0.5); _table[qi][pli][qti][si].rmse=(ogg_int16_t) (_table[qi][pli][qti][si].rmse*wt+0.5); } } } } } } /*Dump the in memory mode metrics to a file. Note this data format isn't portable between different platforms.*/ void oc_mode_metrics_dump(void){ FILE *fmetrics; fmetrics=fopen(OC_MODE_METRICS_FILENAME,"wb"); if(fmetrics!=NULL){ (void)fwrite(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics); (void)fwrite(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics); (void)fwrite(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics); fclose(fmetrics); } } void oc_mode_metrics_print_rd(FILE *_fout,const char *_table_name, #if !defined(OC_COLLECT_METRICS) const oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){ #else oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){ #endif int qii; fprintf(_fout, "# if !defined(OC_COLLECT_METRICS)\n" "static const\n" "# endif\n" "oc_mode_rd %s[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={\n",_table_name); for(qii=0;qii<OC_LOGQ_BINS;qii++){ int pli; fprintf(_fout," {\n"); for(pli=0;pli<3;pli++){ int qti; fprintf(_fout," {\n"); for(qti=0;qti<2;qti++){ int bin; int qi; static const char *pl_names[3]={"Y'","Cb","Cr"}; static const char *qti_names[2]={"INTRA","INTER"}; qi=(63*qii+(OC_LOGQ_BINS-1>>1))/(OC_LOGQ_BINS-1); fprintf(_fout," /*%s qi=%i %s*/\n", pl_names[pli],qi,qti_names[qti]); fprintf(_fout," {\n"); fprintf(_fout," "); for(bin=0;bin<OC_COMP_BINS;bin++){ if(bin&&!(bin&0x3))fprintf(_fout,"\n "); fprintf(_fout,"{%5i,%5i}", _mode_rd_table[qii][pli][qti][bin].rate, _mode_rd_table[qii][pli][qti][bin].rmse); if(bin+1<OC_COMP_BINS)fprintf(_fout,","); } fprintf(_fout,"\n }"); if(qti<1)fprintf(_fout,","); fprintf(_fout,"\n"); } fprintf(_fout," }"); if(pli<2)fprintf(_fout,","); fprintf(_fout,"\n"); } fprintf(_fout," }"); if(qii+1<OC_LOGQ_BINS)fprintf(_fout,","); fprintf(_fout,"\n"); } fprintf(_fout, "};\n" "\n"); } void oc_mode_metrics_print(FILE *_fout){ int qii; fprintf(_fout, "/*File generated by libtheora with OC_COLLECT_METRICS" " defined at compile time.*/\n" "#if !defined(_modedec_H)\n" "# define _modedec_H (1)\n" "# include \"encint.h\"\n" "\n" "\n" "\n" "/*The log of the average quantizer for each of the OC_MODE_RD table rows\n" " (e.g., for the represented qi's, and each pli and qti), in Q10 format.\n" " The actual statistics used by the encoder will be interpolated from\n" " that table based on log_plq for the actual quantization matrix used.*/\n" "# if !defined(OC_COLLECT_METRICS)\n" "static const\n" "# endif\n" "ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2]={\n"); for(qii=0;qii<OC_LOGQ_BINS;qii++){ fprintf(_fout," { {0x%04X,0x%04X},{0x%04X,0x%04X},{0x%04X,0x%04X} }%s\n", OC_MODE_LOGQ[qii][0][0],OC_MODE_LOGQ[qii][0][1],OC_MODE_LOGQ[qii][1][0], OC_MODE_LOGQ[qii][1][1],OC_MODE_LOGQ[qii][2][0],OC_MODE_LOGQ[qii][2][1], qii+1<OC_LOGQ_BINS?",":""); } fprintf(_fout, "};\n" "\n"); oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SATD",OC_MODE_RD_SATD); oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SAD",OC_MODE_RD_SAD); fprintf(_fout, "#endif\n"); } # if !defined(OC_COLLECT_NO_ENC_FUNCS) void oc_enc_mode_metrics_load(oc_enc_ctx *_enc){ oc_restore_fpu(&_enc->state); /*Load any existing mode metrics if we haven't already.*/ if(!OC_HAS_MODE_METRICS){ FILE *fmetrics; memset(OC_MODE_METRICS_SATD,0,sizeof(OC_MODE_METRICS_SATD)); memset(OC_MODE_METRICS_SAD,0,sizeof(OC_MODE_METRICS_SAD)); fmetrics=fopen(OC_MODE_METRICS_FILENAME,"rb"); if(fmetrics!=NULL){ /*Read in the binary structures as written my oc_mode_metrics_dump(). Note this format isn't portable between different platforms.*/ (void)fread(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics); (void)fread(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics); (void)fread(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics); fclose(fmetrics); } else{ int qii; int qi; int pli; int qti; for(qii=0;qii<OC_LOGQ_BINS;qii++){ qi=(63*qii+(OC_LOGQ_BINS-1>>1))/(OC_LOGQ_BINS-1); for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){ OC_MODE_LOGQ[qii][pli][qti]=_enc->log_plq[qi][pli][qti]; } } } oc_mode_metrics_update(OC_MODE_METRICS_SATD,100,1, OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD); oc_mode_metrics_update(OC_MODE_METRICS_SAD,100,1, OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD); OC_HAS_MODE_METRICS=1; } } /*The following token skipping code used to also be used in the decoder (and even at one point other places in the encoder). However, it was obsoleted by other optimizations, and is now only used here. It has been moved here to avoid generating the code when it's not needed.*/ /*Determines the number of blocks or coefficients to be skipped for a given token value. _token: The token value to skip. _extra_bits: The extra bits attached to this token. Return: A positive value indicates that number of coefficients are to be skipped in the current block. Otherwise, the negative of the return value indicates that number of blocks are to be ended.*/ typedef ptrdiff_t (*oc_token_skip_func)(int _token,int _extra_bits); /*Handles the simple end of block tokens.*/ static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){ int nblocks_adjust; nblocks_adjust=OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token)+1; return -_extra_bits-nblocks_adjust; } /*The last EOB token has a special case, where an EOB run of size zero ends all the remaining blocks in the frame.*/ static ptrdiff_t oc_token_skip_eob6(int _token,int _extra_bits){ /*Note: We want to return -PTRDIFF_MAX, but that requires C99, which is not yet available everywhere; this should be equivalent.*/ if(!_extra_bits)return -(~(size_t)0>>1); return -_extra_bits; } /*Handles the pure zero run tokens.*/ static ptrdiff_t oc_token_skip_zrl(int _token,int _extra_bits){ return _extra_bits+1; } /*Handles a normal coefficient value token.*/ static ptrdiff_t oc_token_skip_val(void){ return 1; } /*Handles a category 1A zero run/coefficient value combo token.*/ static ptrdiff_t oc_token_skip_run_cat1a(int _token){ return _token-OC_DCT_RUN_CAT1A+2; } /*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/ static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){ int run_cati; int ncoeffs_mask; int ncoeffs_adjust; run_cati=_token-OC_DCT_RUN_CAT1B; ncoeffs_mask=OC_BYTE_TABLE32(3,7,0,1,run_cati); ncoeffs_adjust=OC_BYTE_TABLE32(7,11,2,3,run_cati); return (_extra_bits&ncoeffs_mask)+ncoeffs_adjust; } /*A jump table for computing the number of coefficients or blocks to skip for a given token value. This reduces all the conditional branches, etc., needed to parse these token values down to one indirect jump.*/ static const oc_token_skip_func OC_TOKEN_SKIP_TABLE[TH_NDCT_TOKENS]={ oc_token_skip_eob, oc_token_skip_eob, oc_token_skip_eob, oc_token_skip_eob, oc_token_skip_eob, oc_token_skip_eob, oc_token_skip_eob6, oc_token_skip_zrl, oc_token_skip_zrl, (oc_token_skip_func)oc_token_skip_val, (oc_token_skip_func)oc_token_skip_val, (oc_token_skip_func)oc_token_skip_val, (oc_token_skip_func)oc_token_skip_val, (oc_token_skip_func)oc_token_skip_val, (oc_token_skip_func)oc_token_skip_val, (oc_token_skip_func)oc_token_skip_val, (oc_token_skip_func)oc_token_skip_val, (oc_token_skip_func)oc_token_skip_val, (oc_token_skip_func)oc_token_skip_val, (oc_token_skip_func)oc_token_skip_val, (oc_token_skip_func)oc_token_skip_val, (oc_token_skip_func)oc_token_skip_val, (oc_token_skip_func)oc_token_skip_val, (oc_token_skip_func)oc_token_skip_run_cat1a, (oc_token_skip_func)oc_token_skip_run_cat1a, (oc_token_skip_func)oc_token_skip_run_cat1a, (oc_token_skip_func)oc_token_skip_run_cat1a, (oc_token_skip_func)oc_token_skip_run_cat1a, oc_token_skip_run, oc_token_skip_run, oc_token_skip_run, oc_token_skip_run }; /*Determines the number of blocks or coefficients to be skipped for a given token value. _token: The token value to skip. _extra_bits: The extra bits attached to this token. Return: A positive value indicates that number of coefficients are to be skipped in the current block. Otherwise, the negative of the return value indicates that number of blocks are to be ended. 0 will never be returned, so that at least one coefficient in one block will always be decoded for every token.*/ static ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits){ return (*OC_TOKEN_SKIP_TABLE[_token])(_token,_extra_bits); } void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){ static const unsigned char OC_ZZI_HUFF_OFFSET[64]={ 0,16,16,16,16,16,32,32, 32,32,32,32,32,32,32,48, 48,48,48,48,48,48,48,48, 48,48,48,48,64,64,64,64, 64,64,64,64,64,64,64,64, 64,64,64,64,64,64,64,64, 64,64,64,64,64,64,64,64 }; const oc_fragment *frags; const unsigned *frag_sad; const unsigned *frag_satd; const unsigned *frag_ssd; const ptrdiff_t *coded_fragis; ptrdiff_t ncoded_fragis; ptrdiff_t fragii; double fragw; int modelines[3][3][2]; int qti; int qii; int qi; int pli; int zzi; int token; int eb; oc_restore_fpu(&_enc->state); /*Figure out which metric bins to use for this frame's quantizers.*/ for(qii=0;qii<_enc->state.nqis;qii++){ for(pli=0;pli<3;pli++){ for(qti=0;qti<2;qti++){ int log_plq; int modeline; log_plq=_enc->log_plq[_enc->state.qis[qii]][pli][qti]; for(modeline=0;modeline<OC_LOGQ_BINS-1&& OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++); modelines[qii][pli][qti]=modeline; } } } qti=_enc->state.frame_type; frags=_enc->state.frags; frag_sad=_enc->frag_sad; frag_satd=_enc->frag_satd; frag_ssd=_enc->frag_ssd; coded_fragis=_enc->state.coded_fragis; ncoded_fragis=fragii=0; /*Weight the fragments by the inverse frame size; this prevents HD content from dominating the statistics.*/ fragw=1.0/_enc->state.nfrags; for(pli=0;pli<3;pli++){ ptrdiff_t ti[64]; int eob_token[64]; int eob_run[64]; /*Set up token indices and eob run counts. We don't bother trying to figure out the real cost of the runs that span coefficients; instead we use the costs that were available when R-D token optimization was done.*/ for(zzi=0;zzi<64;zzi++){ ti[zzi]=_enc->dct_token_offs[pli][zzi]; if(ti[zzi]>0){ token=_enc->dct_tokens[pli][zzi][0]; eb=_enc->extra_bits[pli][zzi][0]; eob_token[zzi]=token; eob_run[zzi]=-oc_dct_token_skip(token,eb); } else{ eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX; eob_run[zzi]=0; } } /*Scan the list of coded fragments for this plane.*/ ncoded_fragis+=_enc->state.ncoded_fragis[pli]; for(;fragii<ncoded_fragis;fragii++){ ptrdiff_t fragi; int frag_bits; int huffi; int skip; int mb_mode; unsigned sad; unsigned satd; double sqrt_ssd; int bin; int qtj; fragi=coded_fragis[fragii]; frag_bits=0; for(zzi=0;zzi<64;){ if(eob_run[zzi]>0){ /*We've reached the end of the block.*/ eob_run[zzi]--; break; } huffi=_enc->huff_idxs[qti][zzi>0][pli+1>>1] +OC_ZZI_HUFF_OFFSET[zzi]; if(eob_token[zzi]<OC_NDCT_EOB_TOKEN_MAX){ /*This token caused an EOB run to be flushed. Therefore it gets the bits associated with it.*/ frag_bits+=_enc->huff_codes[huffi][eob_token[zzi]].nbits +OC_DCT_TOKEN_EXTRA_BITS[eob_token[zzi]]; eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX; } token=_enc->dct_tokens[pli][zzi][ti[zzi]]; eb=_enc->extra_bits[pli][zzi][ti[zzi]]; ti[zzi]++; skip=oc_dct_token_skip(token,eb); if(skip<0){ eob_token[zzi]=token; eob_run[zzi]=-skip; } else{ /*A regular DCT value token; accumulate the bits for it.*/ frag_bits+=_enc->huff_codes[huffi][token].nbits +OC_DCT_TOKEN_EXTRA_BITS[token]; zzi+=skip; } } mb_mode=frags[fragi].mb_mode; qii=frags[fragi].qii; qi=_enc->state.qis[qii]; sad=frag_sad[fragi]<<(pli+1&2); satd=frag_satd[fragi]<<(pli+1&2); sqrt_ssd=sqrt(frag_ssd[fragi]); qtj=mb_mode!=OC_MODE_INTRA; /*Accumulate statistics. The rate (frag_bits) and RMSE (sqrt(frag_ssd)) are not scaled by OC_BIT_SCALE and OC_RMSE_SCALE; this lets us change the scale factor yet still use old data.*/ bin=OC_MINI(satd>>OC_SATD_SHIFT,OC_COMP_BINS-1); oc_mode_metrics_add( OC_MODE_METRICS_SATD[modelines[qii][pli][qtj]][pli][qtj]+bin, fragw,satd,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd); bin=OC_MINI(sad>>OC_SAD_SHIFT,OC_COMP_BINS-1); oc_mode_metrics_add( OC_MODE_METRICS_SAD[modelines[qii][pli][qtj]][pli][qtj]+bin, fragw,sad,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd); } } /*Update global SA(T)D/logq/rate/RMSE estimation matrix.*/ oc_mode_metrics_update(OC_MODE_METRICS_SATD,4,1, OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD); oc_mode_metrics_update(OC_MODE_METRICS_SAD,4,1, OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD); } # endif #endif