ManaPlus
dyepalette_replaceacolor.cpp
Go to the documentation of this file.
1 /*
2  * The ManaPlus Client
3  * Copyright (C) 2007-2009 The Mana World Development Team
4  * Copyright (C) 2009-2010 The Mana Developers
5  * Copyright (C) 2011-2019 The ManaPlus Developers
6  * Copyright (C) 2019-2021 Andrei Karas
7  *
8  * This file is part of The ManaPlus Client.
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program. If not, see <http://www.gnu.org/licenses/>.
22  */
23 
25 
26 PRAGMA48(GCC diagnostic push)
27 PRAGMA48(GCC diagnostic ignored "-Wshadow")
28 #ifndef SDL_BIG_ENDIAN
29 #include <SDL_endian.h>
30 #endif // SDL_BIG_ENDIAN
31 PRAGMA48(GCC diagnostic pop)
32 
33 #ifdef SIMD_SUPPORTED
34 // avx2
35 #include <immintrin.h>
36 #endif // SIMD_SUPPORTED
37 
38 #include "debug.h"
39 
41  const int bufSize) const restrict2
42 {
43  STD_VECTOR<DyeColor>::const_iterator it_end = mColors.end();
44  const size_t sz = mColors.size();
45  if ((sz == 0U) || (pixels == nullptr))
46  return;
47  if ((sz % 2) != 0U)
48  -- it_end;
49 
50  for (const uint32_t *const p_end = pixels + CAST_SIZE(bufSize);
51  pixels != p_end;
52  ++pixels)
53  {
54  uint8_t *const p = reinterpret_cast<uint8_t *>(pixels);
55  const unsigned int data = *pixels;
56 
57  STD_VECTOR<DyeColor>::const_iterator it = mColors.begin();
58  while (it != it_end)
59  {
60  const DyeColor &col = *it;
61  ++ it;
62  const DyeColor &col2 = *it;
63 
64 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
65  const unsigned int coldata = (col.value[3] << 24U)
66  | (col.value[2] << 16U)
67  | (col.value[1] << 8U)
68  | (col.value[0]);
69 #else // SDL_BYTEORDER == SDL_BIG_ENDIAN
70  const unsigned int coldata = (col.value[3])
71  | (col.value[2] << 8U)
72  | (col.value[1] << 16U) |
73  (col.value[0] << 24U);
74 #endif // SDL_BYTEORDER == SDL_BIG_ENDIAN
75 
76  if (data == coldata)
77  {
78  p[3] = col2.value[0];
79  p[2] = col2.value[1];
80  p[1] = col2.value[2];
81  p[0] = col2.value[3];
82  break;
83  }
84 
85  ++ it;
86  }
87  }
88 }
89 
90 #ifdef SIMD_SUPPORTED
91 /*
92 static void print256(const char *const text, const __m256i &val);
93 static void print256(const char *const text, const __m256i &val)
94 {
95  printf("%s 0x%016llx%016llx%016llx%016llx\n", text, val[0], val[1], val[2], val[3]);
96 }
97 */
98 
99 __attribute__ ((target ("sse2")))
100 void DyePalette::replaceAColorSse2(uint32_t *restrict pixels,
101  const int bufSize) const restrict2
102 {
103  STD_VECTOR<DyeColor>::const_iterator it_end = mColors.end();
104  const size_t sz = mColors.size();
105  if ((sz == 0U) || (pixels == nullptr))
106  return;
107  if ((sz % 2) != 0U)
108  -- it_end;
109  const int mod = bufSize % 4;
110  const int bufEnd = bufSize - mod;
111 
112  for (int ptr = 0; ptr < bufEnd; ptr += 4)
113  {
114 // __m128i base = _mm_load_si128(reinterpret_cast<__m128i*>(pixels));
115  __m128i base = _mm_loadu_si128(reinterpret_cast<__m128i*>(
116  &pixels[ptr]));
117 
118  STD_VECTOR<DyeColor>::const_iterator it = mColors.begin();
119  while (it != it_end)
120  {
121  const DyeColor &col = *it;
122  ++ it;
123  const DyeColor &col2 = *it;
124 
125  __m128i newMask = _mm_set1_epi32(col2.valueA);
126  __m128i cmpMask = _mm_set1_epi32(col.valueA);
127  __m128i cmpRes = _mm_cmpeq_epi32(base, cmpMask);
128  __m128i srcAnd = _mm_andnot_si128(cmpRes, base);
129  __m128i dstAnd = _mm_and_si128(cmpRes, newMask);
130  base = _mm_or_si128(srcAnd, dstAnd);
131 
132  ++ it;
133  }
134 // _mm_store_si128(reinterpret_cast<__m128i*>(pixels), base);
135  _mm_storeu_si128(reinterpret_cast<__m128i*>(&pixels[ptr]), base);
136  }
137 
138  // complete end without simd
139  for (int ptr = bufSize - mod; ptr < bufSize; ptr ++)
140  {
141  uint8_t *const p = reinterpret_cast<uint8_t *>(&pixels[ptr]);
142  const unsigned int data = pixels[ptr];
143 
144  STD_VECTOR<DyeColor>::const_iterator it = mColors.begin();
145  while (it != it_end)
146  {
147  const DyeColor &col = *it;
148  ++ it;
149  const DyeColor &col2 = *it;
150 
151  const unsigned int coldata = (col.value[3]) |
152  (col.value[2] << 8U) |
153  (col.value[1] << 16U) |
154  (col.value[0] << 24U);
155 
156  if (data == coldata)
157  {
158  p[3] = col2.value[0];
159  p[2] = col2.value[1];
160  p[1] = col2.value[2];
161  p[0] = col2.value[3];
162  break;
163  }
164 
165  ++ it;
166  }
167  }
168 }
169 
170 __attribute__ ((target ("avx2")))
171 void DyePalette::replaceAColorAvx2(uint32_t *restrict pixels,
172  const int bufSize) const restrict2
173 {
174  STD_VECTOR<DyeColor>::const_iterator it_end = mColors.end();
175  const size_t sz = mColors.size();
176  if ((sz == 0U) || (pixels == nullptr))
177  return;
178  if ((sz % 2) != 0U)
179  -- it_end;
180  const int mod = bufSize % 8;
181  const int bufEnd = bufSize - mod;
182 
183  for (int ptr = 0; ptr < bufEnd; ptr += 8)
184  {
185 // __m256i base = _mm256_load_si256(reinterpret_cast<__m256i*>(pixels));
186  __m256i base = _mm256_loadu_si256(reinterpret_cast<__m256i*>(
187  &pixels[ptr]));
188 
189  STD_VECTOR<DyeColor>::const_iterator it = mColors.begin();
190  while (it != it_end)
191  {
192  const DyeColor &col = *it;
193  ++ it;
194  const DyeColor &col2 = *it;
195 
196  __m256i newMask = _mm256_set1_epi32(col2.valueA);
197  __m256i cmpMask = _mm256_set1_epi32(col.valueA);
198  __m256i cmpRes = _mm256_cmpeq_epi32(base, cmpMask);
199  __m256i srcAnd = _mm256_andnot_si256(cmpRes, base);
200  __m256i dstAnd = _mm256_and_si256(cmpRes, newMask);
201  base = _mm256_or_si256(srcAnd, dstAnd);
202 
203  ++ it;
204  }
205 // _mm256_store_si256(reinterpret_cast<__m256i*>(pixels), base);
206  _mm256_storeu_si256(reinterpret_cast<__m256i*>(&pixels[ptr]), base);
207  }
208 
209  // complete end without simd
210  for (int ptr = bufSize - mod; ptr < bufSize; ptr ++)
211  {
212  uint8_t *const p = reinterpret_cast<uint8_t *>(&pixels[ptr]);
213  const unsigned int data = pixels[ptr];
214 
215  STD_VECTOR<DyeColor>::const_iterator it = mColors.begin();
216  while (it != it_end)
217  {
218  const DyeColor &col = *it;
219  ++ it;
220  const DyeColor &col2 = *it;
221 
222  const unsigned int coldata = (col.value[3]) |
223  (col.value[2] << 8U) |
224  (col.value[1] << 16U) |
225  (col.value[0] << 24U);
226 
227  if (data == coldata)
228  {
229  p[3] = col2.value[0];
230  p[2] = col2.value[1];
231  p[1] = col2.value[2];
232  p[0] = col2.value[3];
233  break;
234  }
235 
236  ++ it;
237  }
238  }
239 }
240 
241 #endif // SIMD_SUPPORTED
#define CAST_SIZE
Definition: cast.h:34
void replaceAColorDefault(uint32_t *pixels, const int bufSize) const
#define restrict
Definition: localconsts.h:165
#define restrict2
Definition: localconsts.h:166
#define PRAGMA48(str)
Definition: localconsts.h:199
uint32_t data
union EAthena::ItemFlags __attribute__((packed))
std::map< std::string, DyeColor > mColors
Definition: palettedb.cpp:37
uint8_t value[4]
Definition: dyecolor.h:77
uint32_t valueA
Definition: dyecolor.h:80