画像をひとつの行列として見る場合、これをtransposeすると以下のようになります。
transeposeしたものに更にFlipHorizontal()をかければTurnRightになりますし、
FlipVerticalをかければTurnLeftになります。
VapourSynthにstd.Transpose()があるのに、TurnLeft/TurnRightがないのはこのためです。
さて、このtransposeは画像処理においては意外に重要な役割を果たします。
transposeすると、縦と横がひっくり返るため、フィルタ処理において縦と横でそれぞれ同じような処理をする場合(resizerとかconvolutionとか)、一方向の処理だけ書けば済むのです。
特にSSEとかCUDAとかでベクトル演算を利用するならば縦方向処理のほうが速いことが多いので、transposeの重要性は高まります。
というわけで、これは画像処理を嗜むものであれば自前でそこそこ速いやつを書けなければならんと思い、書いてみることにしました。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#include <malloc.h> | |
#include <windows.h> | |
#include <emmintrin.h> | |
#include "avisynth.h" | |
static const AVS_Linkage* AVS_linkage = 0; | |
static __forceinline __m128i mm_movehl_si128(const __m128i& xmm0) | |
{ | |
return _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(xmm0), | |
_mm_castsi128_ps(xmm0))); | |
} | |
/* MSVC++10 does not loop unroll to my code...FUCK! */ | |
static void __stdcall | |
proc_8x8(const uint8_t* srcp, uint8_t* dstp, int width, int height, int src_pitch, | |
int dst_pitch) | |
{ | |
int hmod8 = (height / 8) * 8; | |
int sp = src_pitch / 16; | |
int dp = dst_pitch / 16; | |
for (int y = 0; y < hmod8; y += 8) { | |
__m128i* d = (__m128i*)(dstp + y); | |
for (int x = 0; x < width; x += 8) { | |
const __m128i* s = (__m128i*)(srcp + x); | |
__m128i xm0, xm1, xm2, xm3, xm4, xm5, xm6, xm7; | |
xm0 = _mm_loadl_epi64(s); | |
xm1 = _mm_loadl_epi64(s + sp); | |
xm2 = _mm_loadl_epi64(s + sp * 2); | |
xm3 = _mm_loadl_epi64(s + sp * 3); | |
xm4 = _mm_loadl_epi64(s + sp * 4); | |
xm5 = _mm_loadl_epi64(s + sp * 5); | |
xm6 = _mm_loadl_epi64(s + sp * 6); | |
xm7 = _mm_loadl_epi64(s + sp * 7); | |
xm0 = _mm_unpacklo_epi8(xm0, xm1); | |
xm1 = _mm_unpacklo_epi8(xm2, xm3); | |
xm2 = _mm_unpacklo_epi8(xm4, xm5); | |
xm3 = _mm_unpacklo_epi8(xm6, xm7); | |
xm4 = _mm_unpacklo_epi16(xm0, xm1); | |
xm5 = _mm_unpackhi_epi16(xm0, xm1); | |
xm6 = _mm_unpacklo_epi16(xm2, xm3); | |
xm7 = _mm_unpackhi_epi16(xm2, xm3); | |
xm0 = _mm_unpacklo_epi32(xm4, xm6); | |
xm1 = _mm_unpackhi_epi32(xm4, xm6); | |
xm2 = _mm_unpacklo_epi32(xm5, xm7); | |
xm3 = _mm_unpackhi_epi32(xm5, xm7); | |
xm4 = mm_movehl_si128(xm0); | |
xm5 = mm_movehl_si128(xm1); | |
xm6 = mm_movehl_si128(xm2); | |
xm7 = mm_movehl_si128(xm3); | |
_mm_storel_epi64(d, xm0); | |
d += dp; | |
_mm_storel_epi64(d, xm4); | |
d += dp; | |
_mm_storel_epi64(d, xm1); | |
d += dp; | |
_mm_storel_epi64(d, xm5); | |
d += dp; | |
_mm_storel_epi64(d, xm2); | |
d += dp; | |
_mm_storel_epi64(d, xm6); | |
d += dp; | |
_mm_storel_epi64(d, xm3); | |
d += dp; | |
_mm_storel_epi64(d, xm7); | |
d += dp; | |
} | |
srcp += src_pitch * 8; | |
} | |
for (int y = hmod8; y < height; y++) { | |
uint8_t* d = dstp + y; | |
for (int x = 0; x < width; x++) { | |
*d = srcp[x]; | |
d += dst_pitch; | |
} | |
srcp += src_pitch; | |
} | |
} | |
static __forceinline void | |
MM_TRANSPOSE4_SI128(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) | |
{ | |
__m128 f0 = _mm_castsi128_ps(x0); | |
__m128 f1 = _mm_castsi128_ps(x1); | |
__m128 f2 = _mm_castsi128_ps(x2); | |
__m128 f3 = _mm_castsi128_ps(x3); | |
_MM_TRANSPOSE4_PS(f0, f1, f2, f3); | |
x0 = _mm_castps_si128(f0); | |
x1 = _mm_castps_si128(f1); | |
x2 = _mm_castps_si128(f2); | |
x3 = _mm_castps_si128(f3); | |
} | |
static __forceinline void mm_uplh_epi16(__m128i& x0, __m128i& x1) | |
{ | |
__m128i t = _mm_unpackhi_epi16(x0, x1); | |
x0 = _mm_unpacklo_epi16(x0, x1); | |
x1 = t; | |
} | |
static __forceinline void mm_uplh_epi8(__m128i& x0, __m128i& x1) | |
{ | |
__m128i t = _mm_unpackhi_epi8(x0, x1); | |
x0 = _mm_unpacklo_epi8(x0, x1); | |
x1 = t; | |
} | |
static void __stdcall | |
proc_16x16(const uint8_t* srcp, uint8_t* dstp, int width, int height, | |
int src_pitch, int dst_pitch) | |
{ | |
int hmod16 = height / 16 * 16; | |
int dp = dst_pitch / 16; | |
int sp = src_pitch / 16; | |
for (int y = 0; y < hmod16; y += 16) { | |
__m128i* d = (__m128i*)(dstp + y); | |
for (int x = 0; x < width; x += 16) { | |
const __m128i* s = (__m128i*)(srcp + x); | |
__m128i xm0, xm1, xm2, xm3, xm4, xm5, xm6, xm7, | |
xm8, xm9, xm10, xm11, xm12, xm13, xm14, xm15; | |
xm0 = _mm_load_si128(s); | |
xm1 = _mm_load_si128(s + sp); | |
xm2 = _mm_load_si128(s + sp * 2); | |
xm3 = _mm_load_si128(s + sp * 3); | |
xm4 = _mm_load_si128(s + sp * 4); | |
xm5 = _mm_load_si128(s + sp * 5); | |
xm6 = _mm_load_si128(s + sp * 6); | |
xm7 = _mm_load_si128(s + sp * 7); | |
xm8 = _mm_load_si128(s + sp * 8); | |
xm9 = _mm_load_si128(s + sp * 9); | |
xm10 = _mm_load_si128(s + sp * 10); | |
xm11 = _mm_load_si128(s + sp * 11); | |
xm12 = _mm_load_si128(s + sp * 12); | |
xm13 = _mm_load_si128(s + sp * 13); | |
xm14 = _mm_load_si128(s + sp * 14); | |
xm15 = _mm_load_si128(s + sp * 15); | |
mm_uplh_epi8(xm0, xm1); | |
mm_uplh_epi8(xm2, xm3); | |
mm_uplh_epi8(xm4, xm5); | |
mm_uplh_epi8(xm6, xm7); | |
mm_uplh_epi8(xm8, xm9); | |
mm_uplh_epi8(xm10, xm11); | |
mm_uplh_epi8(xm12, xm13); | |
mm_uplh_epi8(xm14, xm15); | |
mm_uplh_epi16(xm0, xm2); | |
mm_uplh_epi16(xm1, xm3); | |
mm_uplh_epi16(xm4, xm6); | |
mm_uplh_epi16(xm5, xm7); | |
mm_uplh_epi16(xm8, xm10); | |
mm_uplh_epi16(xm9, xm11); | |
mm_uplh_epi16(xm12, xm14); | |
mm_uplh_epi16(xm13, xm15); | |
MM_TRANSPOSE4_SI128(xm0, xm4, xm8, xm12); | |
MM_TRANSPOSE4_SI128(xm1, xm5, xm9, xm13); | |
MM_TRANSPOSE4_SI128(xm2, xm6, xm10, xm14); | |
MM_TRANSPOSE4_SI128(xm3, xm7, xm11, xm15); | |
_mm_store_si128(d, xm0); | |
d += dp; | |
_mm_store_si128(d, xm4); | |
d += dp; | |
_mm_store_si128(d, xm8); | |
d += dp; | |
_mm_store_si128(d, xm12); | |
d += dp; | |
_mm_store_si128(d, xm2); | |
d += dp; | |
_mm_store_si128(d, xm6); | |
d += dp; | |
_mm_store_si128(d, xm10); | |
d += dp; | |
_mm_store_si128(d, xm14); | |
d += dp; | |
_mm_store_si128(d, xm1); | |
d += dp; | |
_mm_store_si128(d, xm5); | |
d += dp; | |
_mm_store_si128(d, xm9); | |
d += dp; | |
_mm_store_si128(d, xm13); | |
d += dp; | |
_mm_store_si128(d, xm3); | |
d += dp; | |
_mm_store_si128(d, xm7); | |
d += dp; | |
_mm_store_si128(d, xm11); | |
d += dp; | |
_mm_store_si128(d, xm15); | |
d += dp; | |
} | |
srcp += src_pitch * 16; | |
} | |
for (int y = hmod16; y < height; y++) { | |
uint8_t* d = dstp + y; | |
for (int x = 0; x < width; x++) { | |
*d = srcp[x]; | |
d += dst_pitch; | |
} | |
srcp += src_pitch; | |
} | |
} | |
class Transpose: public GenericVideoFilter { | |
int bwidth; | |
uint8_t *buff; | |
int num_planes; | |
void (_stdcall *proc)(const uint8_t* srcp, uint8_t* dstp, int width, | |
int height, int src_pitch, int dst_pitch); | |
public: | |
Transpose(PClip child, int pf, IScriptEnvironment* env); | |
~Transpose(); | |
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env); | |
}; | |
Transpose::Transpose(PClip c, int pf, IScriptEnvironment* env) | |
: GenericVideoFilter(c), num_planes(3) | |
{ | |
if (!vi.IsPlanar() || vi.IsYV16() || vi.IsYV411()) { | |
env->ThrowError("Transpose: unsupported format."); | |
} | |
if (vi.IsY8()) { | |
num_planes = 1; | |
} | |
int t = vi.width; | |
vi.width = vi.height; | |
vi.height = t; | |
bwidth = ((vi.width + 15) / 16) * 16; | |
int bheight = ((vi.height + 15) / 16) * 16; | |
buff = (uint8_t *)_aligned_malloc(bwidth * bheight, 16); | |
if (!buff) { | |
env->ThrowError("Transpose: failed to allocate buffer."); | |
} | |
proc = pf == 0 ? proc_16x16 : proc_8x8; | |
} | |
Transpose::~Transpose() | |
{ | |
_aligned_free(buff); | |
} | |
PVideoFrame __stdcall Transpose::GetFrame(int n, IScriptEnvironment* env) | |
{ | |
const int planes[] = {PLANAR_Y, PLANAR_U, PLANAR_V}; | |
PVideoFrame src = child->GetFrame(n, env); | |
PVideoFrame dst = env->NewVideoFrame(vi); | |
for (int i = 0; i < num_planes; i++) { | |
int p = planes[i]; | |
const uint8_t* srcp = src->GetReadPtr(p); | |
if ((intptr_t)srcp & 15) { | |
env->ThrowError("Transepose: invalid memory alignment."); | |
} | |
proc(srcp, buff, src->GetRowSize(p), src->GetHeight(p), | |
src->GetPitch(p), bwidth); | |
env->BitBlt(dst->GetWritePtr(p), dst->GetPitch(p), buff, bwidth, | |
dst->GetRowSize(p), dst->GetHeight(p)); | |
} | |
return dst; | |
} | |
static AVSValue __cdecl | |
create_transpose(AVSValue args, void* user_data, IScriptEnvironment* env) | |
{ | |
if (!(env->GetCPUFlags() & CPUF_SSE2)) { | |
env->ThrowError("Transpose: requires SSE2 capable CPU."); | |
} | |
int proc = args[1].AsInt(0); // 0:16x16 1:8x8 | |
if (proc < 0 || proc > 1) { | |
env->ThrowError("Transpose: proc must be set to 0 or 1."); | |
} | |
return new Transpose(args[0].AsClip(), proc, env); | |
} | |
extern "C" __declspec(dllexport) const char* __stdcall | |
AvisynthPluginInit3(IScriptEnvironment* env, const AVS_Linkage* const vectors) | |
{ | |
AVS_linkage = vectors; | |
env->AddFunction("Transpose", "c[proc]i", create_transpose, 0); | |
return "TRANSPOSE"; | |
} |
書いてみてわかったこと
1. SSE2を使う場合、8x8単位で処理するよりも16x16単位で処理するほうが速い
xmmレジスタの数的に32bit(8個)だと16x16はいろいろとペナルティが大きそうなのですが、それでも16x16のほうが速いようです。
もし64bitであれば、xmmレジスタの数は16個に増えるので、さらに有利になりそうな気がします。
2. VC++10は俺の書いたコードをろくにloop unrollしてくれない
このコードを書いた当初は配列とforループを使ったもうちょっとスッキリしたコードでした。でも、コンパイラがループアンロールしてくれないの...orz
手動アンロールしただけで数十fpsスピードが上がったのを目の当たりにしたときはもうなんというか力が抜けてしまいました。
こんな糞コンパイラが世にはびこっているせいでC99の普及が遅れたなんて、もうなんというかね...