As promised at the end of the video, here is the image resizing code as it currently exists:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | #include <tmmintrin.h> //requires SSSE3 for _mm_hadd_epi32 and _mm_shuffle_epi8 #include <stdint.h> typedef struct Pixel { uint8_t r, g, b, a; } Color; struct Image { Pixel * p; int w, h; inline Pixel * operator[] (int y) { return &p[y * w]; } }; Image downsize2x(Image in) { int w = in.w / 2, h = in.h / 2; Image out = { (Pixel *) malloc(w * h * sizeof(Pixel)), w, h }; for (int y = 0; y < h; ++y) { int x = 0; for (; x < w - 3; x += 4) { __m128i p1 = _mm_loadu_si128((__m128i *) &in.p[(y * 2 + 0) * in.w + (x * 2 + 0)]); __m128i p2 = _mm_loadu_si128((__m128i *) &in.p[(y * 2 + 0) * in.w + (x * 2 + 4)]); __m128i p3 = _mm_loadu_si128((__m128i *) &in.p[(y * 2 + 1) * in.w + (x * 2 + 0)]); __m128i p4 = _mm_loadu_si128((__m128i *) &in.p[(y * 2 + 1) * in.w + (x * 2 + 4)]); __m128i i1 = _mm_avg_epu8(p1, p3), i2 = _mm_avg_epu8(p2, p4); __m128i s1 = _mm_and_si128(_mm_srli_epi32(i1, 1), _mm_set1_epi32(0x7F7F7F7F)); __m128i s2 = _mm_and_si128(_mm_srli_epi32(i2, 1), _mm_set1_epi32(0x7F7F7F7F)); __m128i d = _mm_hadd_epi32(s1, s2); _mm_storeu_si128((__m128i *) &out.p[y * w + x], d); } for (; x < w; ++x) { Pixel p1 = in.p[(y * 2 + 0) * in.w + (x * 2 + 0)]; Pixel p2 = in.p[(y * 2 + 0) * in.w + (x * 2 + 1)]; Pixel p3 = in.p[(y * 2 + 1) * in.w + (x * 2 + 0)]; Pixel p4 = in.p[(y * 2 + 1) * in.w + (x * 2 + 1)]; out.p[y * w + x] = { (uint8_t) ((p1.r + p2.r + p3.r + p4.r) >> 2), (uint8_t) ((p1.g + p2.g + p3.g + p4.g) >> 2), (uint8_t) ((p1.b + p2.b + p3.b + p4.b) >> 2), (uint8_t) ((p1.a + p2.a + p3.a + p4.a) >> 2), }; } } return out; } Image downsize4x(Image in) { int w = in.w / 4, h = in.h / 4; Image out = { (Pixel *) malloc(w * h * sizeof(Pixel)), w, h }; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { __m128i p1 = _mm_load_si128((__m128i *) &in.p[(y * 4 + 0) * in.w + (x * 4)]); __m128i p2 = _mm_load_si128((__m128i *) &in.p[(y * 4 + 1) * in.w + (x * 4)]); __m128i p3 = _mm_load_si128((__m128i *) &in.p[(y * 4 + 2) * in.w + (x * 4)]); __m128i p4 = _mm_load_si128((__m128i *) &in.p[(y * 4 + 3) * in.w + (x * 4)]); __m128i r = _mm_avg_epu8(_mm_avg_epu8(p1, p2), _mm_avg_epu8(p3, p4)); __m128i m = _mm_and_si128(_mm_srli_epi32(r, 2), _mm_set1_epi32(0x3F3F3F3F)); __m128i h1 = _mm_hadd_epi32(m, m); __m128i h2 = _mm_hadd_epi32(h1, h1); _mm_storeu_si32(&out.p[y * w + x], h2); } } return out; } Image downsize8x(Image in) { int w = in.w / 8, h = in.h / 8; Image out = { (Pixel *) malloc(w * h * sizeof(Pixel)), w, h }; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { __m128i p01 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 0) * in.w + (x * 8 + 0)]); __m128i p02 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 0) * in.w + (x * 8 + 4)]); __m128i p03 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 1) * in.w + (x * 8 + 0)]); __m128i p04 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 1) * in.w + (x * 8 + 4)]); __m128i p05 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 2) * in.w + (x * 8 + 0)]); __m128i p06 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 2) * in.w + (x * 8 + 4)]); __m128i p07 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 3) * in.w + (x * 8 + 0)]); __m128i p08 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 3) * in.w + (x * 8 + 4)]); __m128i p09 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 4) * in.w + (x * 8 + 0)]); __m128i p10 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 4) * in.w + (x * 8 + 4)]); __m128i p11 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 5) * in.w + (x * 8 + 0)]); __m128i p12 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 5) * in.w + (x * 8 + 4)]); __m128i p13 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 6) * in.w + (x * 8 + 0)]); __m128i p14 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 6) * in.w + (x * 8 + 4)]); __m128i p15 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 7) * in.w + (x * 8 + 0)]); __m128i p16 = _mm_load_si128((__m128i *) &in.p[(y * 8 + 7) * in.w + (x * 8 + 4)]); __m128i r1 = _mm_avg_epu8(_mm_avg_epu8(p01, p02), _mm_avg_epu8(p03, p04)); __m128i r2 = _mm_avg_epu8(_mm_avg_epu8(p05, p06), _mm_avg_epu8(p07, p08)); __m128i r3 = _mm_avg_epu8(_mm_avg_epu8(p09, p10), _mm_avg_epu8(p11, p12)); __m128i r4 = _mm_avg_epu8(_mm_avg_epu8(p13, p14), _mm_avg_epu8(p15, p16)); __m128i r = _mm_avg_epu8(_mm_avg_epu8(r1, r2), _mm_avg_epu8(r3, r4)); __m128i m = _mm_and_si128(_mm_srli_epi32(r, 2), _mm_set1_epi32(0x3F3F3F3F)); __m128i h1 = _mm_hadd_epi32(m, m); __m128i h2 = _mm_hadd_epi32(h1, h1); _mm_storeu_si32(&out.p[y * w + x], h2); } } return out; } Image bilinear(Image in, float scale) { int w = in.w * scale, h = in.h * scale; Image out = { (Pixel *) malloc(w * h * sizeof(Pixel)), w, h }; float xs = in.w / (float) w, ys = in.h / (float) h; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { float x0 = (x + 0.5f) * xs, y0 = (y + 0.5f) * ys; int x1 = x0, y1 = y0; float xf = x0 - x1, yf = y0 - y1; Pixel p1 = in.p[(y1 + 0) * in.w + (x1 + 0)]; Pixel p2 = in.p[(y1 + 0) * in.w + (x1 + 1)]; Pixel p3 = in.p[(y1 + 1) * in.w + (x1 + 0)]; Pixel p4 = in.p[(y1 + 1) * in.w + (x1 + 1)]; __m128 v1 = _mm_cvtepi32_ps(_mm_set_epi32(p1.a, p1.b, p1.g, p1.r)); __m128 v2 = _mm_cvtepi32_ps(_mm_set_epi32(p2.a, p2.b, p2.g, p2.r)); __m128 v3 = _mm_cvtepi32_ps(_mm_set_epi32(p3.a, p3.b, p3.g, p3.r)); __m128 v4 = _mm_cvtepi32_ps(_mm_set_epi32(p4.a, p4.b, p4.g, p4.r)); __m128 r1 = _mm_add_ps(_mm_mul_ps(v1, _mm_set1_ps(1 - xf)), _mm_mul_ps(v2, _mm_set1_ps(xf))); __m128 r2 = _mm_add_ps(_mm_mul_ps(v3, _mm_set1_ps(1 - xf)), _mm_mul_ps(v4, _mm_set1_ps(xf))); __m128 c = _mm_add_ps(_mm_mul_ps(r1, _mm_set1_ps(1 - yf)), _mm_mul_ps(r2, _mm_set1_ps(yf))); __m128i i = _mm_cvtps_epi32(c); __m128i p = _mm_shuffle_epi8(i, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0)); _mm_storeu_si32(&out.p[y * w + x], p); } } return out; } |