xoreos  0.0.5
encoding.cpp
Go to the documentation of this file.
1 /* xoreos - A reimplementation of BioWare's Aurora engine
2  *
3  * xoreos is the legal property of its developers, whose names
4  * can be found in the AUTHORS file distributed with this source
5  * distribution.
6  *
7  * xoreos is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 3
10  * of the License, or (at your option) any later version.
11  *
12  * xoreos is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with xoreos. If not, see <http://www.gnu.org/licenses/>.
19  */
20 
25 #include <cstring>
26 #include <cerrno>
27 
28 #include <iconv.h>
29 
30 #include <vector>
31 
32 #include "src/common/encoding.h"
33 #include "src/common/error.h"
34 #include "src/common/scopedptr.h"
35 #include "src/common/singleton.h"
36 #include "src/common/ustring.h"
38 #include "src/common/writestream.h"
39 
40 namespace Common {
41 
42 static const char * const kEncodingName[kEncodingMAX] = {
43  "ASCII", "UTF-8", "UTF-16LE", "UTF-16BE", "ISO-8859-15", "WINDOWS-1250", "WINDOWS-1251",
44  "WINDOWS-1252", "CP932", "CP936", "CP949", "CP950"
45 };
46 
47 static const size_t kEncodingGrowthFrom[kEncodingMAX] = {
48  1, 1, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4
49 };
50 
51 static const size_t kEncodingGrowthTo [kEncodingMAX] = {
52  1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1
53 };
54 
55 static const size_t kTerminatorLength [kEncodingMAX] = {
56  1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1
57 };
58 
60 class ConversionManager : public Singleton<ConversionManager> {
61 public:
63  for (size_t i = 0; i < kEncodingMAX; i++) {
64  _contextFrom[i] = (iconv_t) -1;
65  _contextTo [i] = (iconv_t) -1;
66  }
67 
68  for (size_t i = 0; i < kEncodingMAX; i++)
69  if ((_contextFrom[i] = iconv_open("UTF-8", kEncodingName[i])) == ((iconv_t) -1))
70  warning("Failed to initialize %s -> UTF-8 conversion: %s", kEncodingName[i], strerror(errno));
71 
72  for (size_t i = 0; i < kEncodingMAX; i++)
73  if ((_contextTo [i] = iconv_open(kEncodingName[i], "UTF-8")) == ((iconv_t) -1))
74  warning("Failed to initialize UTF-8 -> %s conversion: %s", kEncodingName[i], strerror(errno));
75  }
76 
78  for (size_t i = 0; i < kEncodingMAX; i++) {
79  if (_contextFrom[i] != ((iconv_t) -1))
80  iconv_close(_contextFrom[i]);
81  if (_contextTo [i] != ((iconv_t) -1))
82  iconv_close(_contextTo [i]);
83  }
84  }
85 
87  if ((((size_t) from) >= kEncodingMAX) ||
88  (((size_t) to ) >= kEncodingMAX))
89  return false;
90 
91  if (from == kEncodingUTF8)
92  return _contextTo[to] != ((iconv_t) -1);
93 
94  if (to == kEncodingUTF8)
95  return _contextFrom[from] != ((iconv_t) -1);
96 
97  return false;
98  }
99 
100  UString convert(Encoding encoding, byte *data, size_t n) {
101  if (((size_t) encoding) >= kEncodingMAX)
102  throw Exception("Invalid encoding %d", encoding);
103 
104  return convert(_contextFrom[encoding], data, n, kEncodingGrowthFrom[encoding], 1);
105  }
106 
107  MemoryReadStream *convert(Encoding encoding, const UString &str, bool terminate = true) {
108  if (((size_t) encoding) >= kEncodingMAX)
109  throw Exception("Invalid encoding %d", encoding);
110 
111  return convert(_contextTo[encoding], str, kEncodingGrowthTo[encoding],
112  terminate ? kTerminatorLength[encoding] : 0);
113  }
114 
115 private:
118 
119  byte *doConvert(iconv_t &ctx, byte *data, size_t nIn, size_t nOut, size_t &size) {
120  size_t inBytes = nIn;
121  size_t outBytes = nOut;
122 
123  ScopedArray<byte> convData(new byte[outBytes]);
124 
125  byte *outBuf = convData.get();
126 
127  // Reset the converter's state
128  iconv(ctx, 0, 0, 0, 0);
129 
130  // Convert
131  if (iconv(ctx, const_cast<ICONV_CONST char **>(reinterpret_cast<char **>(&data)), &inBytes,
132  reinterpret_cast<char **>(&outBuf), &outBytes) == ((size_t) -1)) {
133 
134  warning("iconv() failed: %s", strerror(errno));
135  return 0;
136  }
137 
138  size = nOut - outBytes;
139 
140  return convData.release();
141  }
142 
143  UString convert(iconv_t &ctx, byte *data, size_t n, size_t growth, size_t termSize) {
144  if (ctx == ((iconv_t) -1))
145  return "[!!!]";
146 
147  size_t size;
148  ScopedArray<byte> dataOut(doConvert(ctx, data, n, n * growth + termSize, size));
149  if (!dataOut)
150  return "[!?!]";
151 
152  while (termSize-- > 0)
153  dataOut[size++] = '\0';
154 
155  return UString(reinterpret_cast<const char *>(dataOut.get()));
156  }
157 
158  MemoryReadStream *convert(iconv_t &ctx, const UString &str, size_t growth, size_t termSize) {
159  if (ctx == ((iconv_t) -1))
160  return 0;
161 
162  byte *dataIn = const_cast<byte *>(reinterpret_cast<const byte *>(str.c_str()));
163  size_t nIn = std::strlen(str.c_str());
164  size_t nOut = nIn * growth + termSize;
165 
166  size_t size;
167  ScopedArray<byte> dataOut(doConvert(ctx, dataIn, nIn, nOut, size));
168  if (!dataOut)
169  return 0;
170 
171  while (termSize-- > 0)
172  dataOut[size++] = '\0';
173 
174  return new MemoryReadStream(dataOut.release(), size, true);
175  }
176 };
177 
178 }
179 
180 #define ConvMan Common::ConversionManager::instance()
181 
183 
184 namespace Common {
185 
187  if (((size_t) encoding) >= kEncodingMAX)
188  return "Invalid";
189 
190  return kEncodingName[encoding];
191 }
192 
193 bool hasSupportEncoding(Encoding encoding) {
194  return ConvMan.hasSupportTranscode(Common::kEncodingUTF8, encoding ) &&
195  ConvMan.hasSupportTranscode(encoding , Common::kEncodingUTF8);
196 }
197 
198 static uint32 readFakeChar(SeekableReadStream &stream, Encoding encoding) {
199  byte data[2];
200 
201  switch (encoding) {
202  case kEncodingASCII:
203  case kEncodingLatin9:
204  case kEncodingUTF8:
205  case kEncodingCP1250:
206  case kEncodingCP1251:
207  case kEncodingCP1252:
208  case kEncodingCP932:
209  case kEncodingCP936:
210  case kEncodingCP949:
211  case kEncodingCP950:
212  if (stream.read(data, 1) != 1)
213  return 0;
214 
215  return data[0];
216 
217  case kEncodingUTF16LE:
218  if (stream.read(data, 2) != 2)
219  return 0;
220 
221  return READ_LE_UINT16(data);
222 
223  case kEncodingUTF16BE:
224  if (stream.read(data, 2) != 2)
225  return 0;
226 
227  return READ_BE_UINT16(data);
228 
229  default:
230  break;
231  }
232 
233  return 0;
234 }
235 
236 static void writeFakeChar(std::vector<byte> &output, uint32 c, Encoding encoding) {
237  byte data[2];
238 
239  switch (encoding) {
240  case kEncodingASCII:
241  case kEncodingLatin9:
242  case kEncodingUTF8:
243  case kEncodingCP1250:
244  case kEncodingCP1251:
245  case kEncodingCP1252:
246  case kEncodingCP932:
247  case kEncodingCP936:
248  case kEncodingCP949:
249  case kEncodingCP950:
250  output.push_back(c);
251  break;
252 
253  case kEncodingUTF16LE:
254  WRITE_LE_UINT16(data, c);
255  output.push_back(data[0]);
256  output.push_back(data[1]);
257  break;
258 
259  case kEncodingUTF16BE:
260  WRITE_BE_UINT16(data, c);
261  output.push_back(data[0]);
262  output.push_back(data[1]);
263  break;
264 
265  default:
266  break;
267  }
268 }
269 
270 static UString createString(std::vector<byte> &output, Encoding encoding) {
271  if (output.empty())
272  return "";
273 
274  switch (encoding) {
275  case kEncodingASCII:
276  case kEncodingUTF8:
277  output.push_back('\0');
278  return UString(reinterpret_cast<const char *>(&output[0]));
279 
280  default:
281  return ConvMan.convert(encoding, &output[0], output.size());
282  }
283 
284  return "";
285 }
286 
288  std::vector<byte> output;
289 
290  uint32 c;
291  while (((c = readFakeChar(stream, encoding)) != '\0') && !stream.eos())
292  writeFakeChar(output, c, encoding);
293 
294  return createString(output, encoding);
295 }
296 
297 UString readStringFixed(SeekableReadStream &stream, Encoding encoding, size_t length) {
298  if (length == 0)
299  return "";
300 
301  std::vector<byte> output;
302  output.resize(length);
303 
304  length = stream.read(&output[0], length);
305  output.resize(length);
306 
307  return createString(output, encoding);
308 }
309 
311  std::vector<byte> output;
312 
313  uint32 c;
314  while (((c = readFakeChar(stream, encoding)) != '\0') && !stream.eos()) {
315  if (c == '\n')
316  break;
317 
318  if (c == '\r')
319  continue;
320 
321  writeFakeChar(output, c, encoding);
322  }
323 
324  return createString(output, encoding);
325 }
326 
327 UString readString(const byte *data, size_t size, Encoding encoding) {
328  if (size == 0)
329  return "";
330 
331  std::vector<byte> output;
332  output.resize(size);
333 
334  std::memcpy(&output[0], data, size);
335 
336  return createString(output, encoding);
337 }
338 
339 size_t writeString(WriteStream &stream, const Common::UString &str, Encoding encoding, bool terminate) {
340  ScopedPtr<MemoryReadStream> data(convertString(str, encoding, terminate));
341 
342  const size_t n = stream.writeStream(*data);
343 
344  return n;
345 }
346 
347 void writeStringFixed(WriteStream &stream, const Common::UString &str, Encoding encoding, size_t length) {
348  if (length == 0)
349  return;
350 
351  ScopedPtr<MemoryReadStream> data(convertString(str, encoding, false));
352 
353  size_t n = stream.writeStream(*data, length);
354  while (n++ < length)
355  stream.writeByte(0);
356 }
357 
358 MemoryReadStream *convertString(const UString &str, Encoding encoding, bool terminateString) {
359  if (encoding == kEncodingUTF8)
360  return new MemoryReadStream(reinterpret_cast<const byte *>(str.c_str()),
361  std::strlen(str.c_str()) + (terminateString ? 1 : 0));
362 
363  return ConvMan.convert(encoding, str, terminateString);
364 }
365 
366 size_t getBytesPerCodepoint(Encoding encoding) {
367  switch (encoding) {
368  case kEncodingASCII:
369  case kEncodingLatin9:
370  case kEncodingCP1250:
371  case kEncodingCP1251:
372  case kEncodingCP1252:
373  return 1;
374 
375  case kEncodingUTF16LE:
376  case kEncodingUTF16BE:
377  return 2;
378 
379  case kEncodingUTF8:
380  case kEncodingCP932:
381  case kEncodingCP936:
382  case kEncodingCP949:
383  case kEncodingCP950:
384  throw Exception("getBytesPerCodepoint(): Encoding with variable number of bytes per codepoint");
385 
386  default:
387  break;
388  }
389 
390  throw Exception("getBytesPerCodepoint(): Invalid encoding (%d)", (int)encoding);
391 }
392 
393 bool isValidCodepoint(Encoding encoding, uint32 cp) {
394  switch (encoding) {
395  case kEncodingInvalid:
396  return false;
397 
398  case kEncodingASCII:
399  return cp <= 127;
400 
401  case kEncodingLatin9:
402  return (cp <= 0x7F) || (cp >= 0xA0);
403 
404  case kEncodingCP1250:
405  return (cp != 0x81) && (cp != 0x83) && (cp != 0x88) &&
406  (cp != 0x90) && (cp != 0x98);
407 
408  case kEncodingCP1251:
409  return cp != 0x98;
410 
411  case kEncodingCP1252:
412  return (cp != 0x81) && (cp != 0x8D) && (cp != 0x8F) &&
413  (cp != 0x90) && (cp != 0x9D);
414 
415  case kEncodingUTF8: // TODO
416  case kEncodingUTF16LE: // TODO
417  case kEncodingUTF16BE: // TODO
418  case kEncodingCP932: // TODO
419  case kEncodingCP936: // TODO
420  case kEncodingCP949: // TODO
421  case kEncodingCP950: // TODO
422  default:
423  return true;
424  }
425 
426  return false;
427 }
428 
429 } // End of namespace Common
A manager handling string encoding conversions.
Definition: encoding.cpp:60
Class and macro for implementing singletons.
UString convert(Encoding encoding, byte *data, size_t n)
Definition: encoding.cpp:100
static const size_t kTerminatorLength[kEncodingMAX]
Definition: encoding.cpp:55
iconv_t _contextFrom[kEncodingMAX]
Definition: encoding.cpp:116
Definition: 2dafile.h:39
bool hasSupportEncoding(Encoding encoding)
Do we have support for this encoding?
Definition: encoding.cpp:193
A class holding a UTF-8 string.
Definition: ustring.h:48
UString getEncodingName(Encoding encoding)
Return the human readable name of an encoding.
Definition: encoding.cpp:186
PointerType release()
Returns the plain pointer value and releases ScopedPtr.
Definition: scopedptr.h:103
virtual bool eos() const =0
Returns true if a read failed because the end of the stream has been reached.
Implementing the reading stream interfaces for plain memory blocks.
size_t writeString(WriteStream &stream, const Common::UString &str, Encoding encoding, bool terminate)
Write a string into a stream with a given encoding.
Definition: encoding.cpp:339
UTF-16 LE (little endian).
Definition: encoding.h:44
MemoryReadStream * convert(Encoding encoding, const UString &str, bool terminate=true)
Definition: encoding.cpp:107
#define ConvMan
Definition: encoding.cpp:180
A simple scoped smart pointer template.
UString convert(iconv_t &ctx, byte *data, size_t n, size_t growth, size_t termSize)
Definition: encoding.cpp:143
static void writeFakeChar(std::vector< byte > &output, uint32 c, Encoding encoding)
Definition: encoding.cpp:236
UTF-16 BE (big endian).
Definition: encoding.h:45
size_t getBytesPerCodepoint(Encoding encoding)
Return the number of bytes per codepoint in this encoding.
Definition: encoding.cpp:366
byte * doConvert(iconv_t &ctx, byte *data, size_t nIn, size_t nOut, size_t &size)
Definition: encoding.cpp:119
Basic exceptions to throw.
MemoryReadStream * convert(iconv_t &ctx, const UString &str, size_t growth, size_t termSize)
Definition: encoding.cpp:158
UString readStringLine(SeekableReadStream &stream, Encoding encoding)
Read a line with the given encoding out of a stream.
Definition: encoding.cpp:310
const char * c_str() const
Return the (utf8 encoded) string data.
Definition: ustring.cpp:249
Generic template base class for implementing the singleton design pattern.
Definition: singleton.h:61
ISO-8859-15 (Latin-9).
Definition: encoding.h:47
Windows codepage 950 (Traditional Chinese, similar to Big5).
Definition: encoding.h:56
#define DECLARE_SINGLETON(T)
Note that you need to use this macro from the global namespace.
Definition: singleton.h:122
static const char *const kEncodingName[kEncodingMAX]
Definition: encoding.cpp:42
Basic writing stream interfaces.
bool hasSupportTranscode(Encoding from, Encoding to)
Definition: encoding.cpp:86
virtual size_t read(void *dataPtr, size_t dataSize)=0
Read data from the stream.
Simple memory based 'stream', which implements the ReadStream interface for a plain memory block...
Definition: memreadstream.h:66
static UString createString(std::vector< byte > &output, Encoding encoding)
Definition: encoding.cpp:270
Utility functions for working with differing string encodings.
Encoding
Definition: encoding.h:37
static const size_t kEncodingGrowthTo[kEncodingMAX]
Definition: encoding.cpp:51
For range checks.
Definition: encoding.h:58
StackException Exception
Definition: error.h:59
A scoped plain pointer, allowing pointer-y access and normal deletion.
Definition: scopedptr.h:120
void writeByte(byte value)
Definition: writestream.h:88
void warning(const char *s,...)
Definition: util.cpp:33
Generic interface for a writable data stream.
Definition: writestream.h:64
Unicode string handling.
Plain, unextended ASCII (7bit clean).
Definition: encoding.h:40
PointerType get() const
Returns the plain pointer value.
Definition: scopedptr.h:96
Windows codepage 932 (Japanese, extended Shift-JIS).
Definition: encoding.h:53
uint32_t uint32
Definition: types.h:204
size_t writeStream(ReadStream &stream, size_t n)
Copy n bytes of the given stream into the stream.
Definition: writestream.cpp:72
Windows codepage 1250 (Eastern European, Latin alphabet).
Definition: encoding.h:49
UString readString(SeekableReadStream &stream, Encoding encoding)
Read a string with the given encoding out of a stream.
Definition: encoding.cpp:287
Windows codepage 1251 (Eastern European, Cyrillic alphabet).
Definition: encoding.h:50
UString readStringFixed(SeekableReadStream &stream, Encoding encoding, size_t length)
Read length bytes as a string with the given encoding out of a stream.
Definition: encoding.cpp:297
Windows codepage 1252 (Western European, Latin alphabet).
Definition: encoding.h:51
static const size_t kEncodingGrowthFrom[kEncodingMAX]
Definition: encoding.cpp:47
iconv_t _contextTo[kEncodingMAX]
Definition: encoding.cpp:117
bool isValidCodepoint(Encoding encoding, uint32 cp)
Return whether the given codepoint is valid in this encoding.
Definition: encoding.cpp:393
MemoryReadStream * convertString(const UString &str, Encoding encoding, bool terminateString)
Convert a string into the given encoding.
Definition: encoding.cpp:358
Windows codepage 949 (Korean, similar to EUC-KR).
Definition: encoding.h:55
Windows codepage 936 (Simplified Chinese, extended GB2312 with GBK codepoints).
Definition: encoding.h:54
Interface for a seekable & readable data stream.
Definition: readstream.h:265
static uint32 readFakeChar(SeekableReadStream &stream, Encoding encoding)
Definition: encoding.cpp:198
uint8 byte
Definition: types.h:209
void writeStringFixed(WriteStream &stream, const Common::UString &str, Encoding encoding, size_t length)
Write a string into a stream with a given encoding and fixed length in bytes.
Definition: encoding.cpp:347