#include <immintrin.h> // SSSE3 for _mm_hadd_epi32 and _mm_shuffle_epi8; this umbrella header also declares _mm_storeu_si32
#include <stdint.h>
#include <stdlib.h> // malloc, free
struct Pixel { uint8_t r, g, b, a; }; // one RGBA pixel, 4 bytes
struct Image { Pixel * p; int w, h; inline Pixel * operator[] (int y) { return &p[y * w]; } };
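// Halve the image: each output pixel is the average of a 2x2 source block.
// SIMD path: average rows with _mm_avg_epu8, halve every byte, then let
// _mm_hadd_epi32 sum horizontally adjacent pixels; bytes stay <= 0x7F, so the
// 32-bit adds never carry across channel boundaries.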
Image downsize2x(Image in) {
int w = in.w / 2, h = in.h / 2;
Image out = { (Pixel *) malloc(w * h * sizeof(Pixel)), w, h };
for (int y = 0; y < h; ++y) {
int x = 0;
for (; x < w - 3; x += 4) {
__m128i p1 = _mm_loadu_si128((__m128i *) &in.p[(y * 2 + 0) * in.w + (x * 2 + 0)]);
__m128i p2 = _mm_loadu_si128((__m128i *) &in.p[(y * 2 + 0) * in.w + (x * 2 + 4)]);
__m128i p3 = _mm_loadu_si128((__m128i *) &in.p[(y * 2 + 1) * in.w + (x * 2 + 0)]);
__m128i p4 = _mm_loadu_si128((__m128i *) &in.p[(y * 2 + 1) * in.w + (x * 2 + 4)]);
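// vertical byte average, then halve each byte so the horizontal sums below cannot carry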
__m128i i1 = _mm_avg_epu8(p1, p3), i2 = _mm_avg_epu8(p2, p4);
__m128i s1 = _mm_and_si128(_mm_srli_epi32(i1, 1), _mm_set1_epi32(0x7F7F7F7F));
__m128i s2 = _mm_and_si128(_mm_srli_epi32(i2, 1), _mm_set1_epi32(0x7F7F7F7F));
__m128i d = _mm_hadd_epi32(s1, s2);
_mm_storeu_si128((__m128i *) &out.p[y * w + x], d);
}
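// scalar tail: handle the last (w % 4) output pixels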
for (; x < w; ++x) {
Pixel p1 = in.p[(y * 2 + 0) * in.w + (x * 2 + 0)];
Pixel p2 = in.p[(y * 2 + 0) * in.w + (x * 2 + 1)];
Pixel p3 = in.p[(y * 2 + 1) * in.w + (x * 2 + 0)];
Pixel p4 = in.p[(y * 2 + 1) * in.w + (x * 2 + 1)];
out.p[y * w + x] = {
(uint8_t) ((p1.r + p2.r + p3.r + p4.r) >> 2),
(uint8_t) ((p1.g + p2.g + p3.g + p4.g) >> 2),
(uint8_t) ((p1.b + p2.b + p3.b + p4.b) >> 2),
(uint8_t) ((p1.a + p2.a + p3.a + p4.a) >> 2),
};
}
}
return out;
}
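// Shrink by 4x: average each 4x4 block. Remainder rows/columns when in.w or
// in.h is not a multiple of 4 are dropped.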
Image downsize4x(Image in) {
int w = in.w / 4, h = in.h / 4;
Image out = { (Pixel *) malloc(w * h * sizeof(Pixel)), w, h };
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
// unaligned loads: rows are only 16-byte aligned when in.w is a multiple of 4
__m128i p1 = _mm_loadu_si128((__m128i *) &in.p[(y * 4 + 0) * in.w + (x * 4)]);
__m128i p2 = _mm_loadu_si128((__m128i *) &in.p[(y * 4 + 1) * in.w + (x * 4)]);
__m128i p3 = _mm_loadu_si128((__m128i *) &in.p[(y * 4 + 2) * in.w + (x * 4)]);
__m128i p4 = _mm_loadu_si128((__m128i *) &in.p[(y * 4 + 3) * in.w + (x * 4)]);
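// average the four rows, quarter each byte, then let two hadds sum four
// pixels per channel without carries between bytes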
__m128i r = _mm_avg_epu8(_mm_avg_epu8(p1, p2), _mm_avg_epu8(p3, p4));
__m128i m = _mm_and_si128(_mm_srli_epi32(r, 2), _mm_set1_epi32(0x3F3F3F3F));
__m128i h1 = _mm_hadd_epi32(m, m);
__m128i h2 = _mm_hadd_epi32(h1, h1);
_mm_storeu_si32(&out.p[y * w + x], h2);
}
}
return out;
}
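// Shrink by 8x: average each 8x8 block of source pixels (width/height
// remainders are dropped, as in downsize4x).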
Image downsize8x(Image in) {
int w = in.w / 8, h = in.h / 8;
Image out = { (Pixel *) malloc(w * h * sizeof(Pixel)), w, h };
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
// unaligned loads: rows are only 16-byte aligned when in.w is a multiple of 4
__m128i p01 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 0) * in.w + (x * 8 + 0)]);
__m128i p02 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 0) * in.w + (x * 8 + 4)]);
__m128i p03 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 1) * in.w + (x * 8 + 0)]);
__m128i p04 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 1) * in.w + (x * 8 + 4)]);
__m128i p05 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 2) * in.w + (x * 8 + 0)]);
__m128i p06 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 2) * in.w + (x * 8 + 4)]);
__m128i p07 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 3) * in.w + (x * 8 + 0)]);
__m128i p08 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 3) * in.w + (x * 8 + 4)]);
__m128i p09 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 4) * in.w + (x * 8 + 0)]);
__m128i p10 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 4) * in.w + (x * 8 + 4)]);
__m128i p11 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 5) * in.w + (x * 8 + 0)]);
__m128i p12 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 5) * in.w + (x * 8 + 4)]);
__m128i p13 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 6) * in.w + (x * 8 + 0)]);
__m128i p14 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 6) * in.w + (x * 8 + 4)]);
__m128i p15 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 7) * in.w + (x * 8 + 0)]);
__m128i p16 = _mm_loadu_si128((__m128i *) &in.p[(y * 8 + 7) * in.w + (x * 8 + 4)]);
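// collapse the 8x8 block: pairwise byte averages reduce the 16 registers to
// one, then quarter and hadd as in downsize4x; repeated _mm_avg_epu8 rounding
// can drift slightly from an exact box mean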
__m128i r1 = _mm_avg_epu8(_mm_avg_epu8(p01, p02), _mm_avg_epu8(p03, p04));
__m128i r2 = _mm_avg_epu8(_mm_avg_epu8(p05, p06), _mm_avg_epu8(p07, p08));
__m128i r3 = _mm_avg_epu8(_mm_avg_epu8(p09, p10), _mm_avg_epu8(p11, p12));
__m128i r4 = _mm_avg_epu8(_mm_avg_epu8(p13, p14), _mm_avg_epu8(p15, p16));
__m128i r = _mm_avg_epu8(_mm_avg_epu8(r1, r2), _mm_avg_epu8(r3, r4));
__m128i m = _mm_and_si128(_mm_srli_epi32(r, 2), _mm_set1_epi32(0x3F3F3F3F));
__m128i h1 = _mm_hadd_epi32(m, m);
__m128i h2 = _mm_hadd_epi32(h1, h1);
_mm_storeu_si32(&out.p[y * w + x], h2);
}
}
return out;
}
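// Resample to an arbitrary scale with bilinear filtering: the four neighbor
// pixels are widened to float lanes and blended with SSE.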
Image bilinear(Image in, float scale) {
int w = in.w * scale, h = in.h * scale;
Image out = { (Pixel *) malloc(w * h * sizeof(Pixel)), w, h };
float xs = in.w / (float) w, ys = in.h / (float) h;
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
// map each output pixel center back into source coordinates (half-pixel centers)
float x0 = (x + 0.5f) * xs - 0.5f, y0 = (y + 0.5f) * ys - 0.5f;
if (x0 < 0) x0 = 0;
if (y0 < 0) y0 = 0;
int x1 = (int) x0, y1 = (int) y0;
// clamp the right/bottom neighbor so the last row and column stay in bounds
int x2 = x1 + 1 < in.w ? x1 + 1 : x1;
int y2 = y1 + 1 < in.h ? y1 + 1 : y1;
float xf = x0 - x1, yf = y0 - y1;
Pixel p1 = in.p[y1 * in.w + x1];
Pixel p2 = in.p[y1 * in.w + x2];
Pixel p3 = in.p[y2 * in.w + x1];
Pixel p4 = in.p[y2 * in.w + x2];
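// widen each channel into its own 32-bit float lane (r in lane 0, a in lane 3)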
__m128 v1 = _mm_cvtepi32_ps(_mm_set_epi32(p1.a, p1.b, p1.g, p1.r));
__m128 v2 = _mm_cvtepi32_ps(_mm_set_epi32(p2.a, p2.b, p2.g, p2.r));
__m128 v3 = _mm_cvtepi32_ps(_mm_set_epi32(p3.a, p3.b, p3.g, p3.r));
__m128 v4 = _mm_cvtepi32_ps(_mm_set_epi32(p4.a, p4.b, p4.g, p4.r));
__m128 r1 = _mm_add_ps(_mm_mul_ps(v1, _mm_set1_ps(1 - xf)), _mm_mul_ps(v2, _mm_set1_ps(xf)));
__m128 r2 = _mm_add_ps(_mm_mul_ps(v3, _mm_set1_ps(1 - xf)), _mm_mul_ps(v4, _mm_set1_ps(xf)));
__m128 c = _mm_add_ps(_mm_mul_ps(r1, _mm_set1_ps(1 - yf)), _mm_mul_ps(r2, _mm_set1_ps(yf)));
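// round to integers, pack the low byte of each lane into the low 32 bits, store one pixel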
__m128i i = _mm_cvtps_epi32(c);
__m128i p = _mm_shuffle_epi8(i, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0));
_mm_storeu_si32(&out.p[y * w + x], p);
}
}
return out;
}
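// A minimal usage sketch, assuming a synthetic gradient source rather than real
// image I/O; sides are multiples of 8 so every fixed-ratio path divides evenly.
int main() {
Image src = { (Pixel *) malloc(256 * 256 * sizeof(Pixel)), 256, 256 };
for (int y = 0; y < src.h; ++y)
for (int x = 0; x < src.w; ++x)
src[y][x] = { (uint8_t) x, (uint8_t) y, (uint8_t) (x ^ y), 255 };
Image half = downsize2x(src); // 128x128
Image quarter = downsize4x(src); // 64x64
Image eighth = downsize8x(src); // 32x32
Image scaled = bilinear(src, 0.75f); // 192x192
free(src.p); free(half.p); free(quarter.p); free(eighth.p); free(scaled.p);
return 0;
}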