283 lines
5.9 KiB
C++
283 lines
5.9 KiB
C++
|
// Copyright (c) 1994 James Clark
|
||
|
// See the file COPYING for copying permission.
|
||
|
|
||
|
#include "splib.h"
|
||
|
|
||
|
#ifdef SP_MULTI_BYTE
|
||
|
|
||
|
#include "UTF8CodingSystem.h"
|
||
|
#include "constant.h"
|
||
|
|
||
|
#ifdef SP_NAMESPACE
|
||
|
namespace SP_NAMESPACE {
|
||
|
#endif
|
||
|
|
||
|
enum {
|
||
|
// cmaskN is mask for first byte to test for N byte sequence
|
||
|
cmask1 = 0x80,
|
||
|
cmask2 = 0xe0,
|
||
|
cmask3 = 0xf0,
|
||
|
cmask4 = 0xf8,
|
||
|
cmask5 = 0xfc,
|
||
|
cmask6 = 0xfe,
|
||
|
// cvalN is value of masked first byte of N byte sequence
|
||
|
cval1 = 0x00,
|
||
|
cval2 = 0xc0,
|
||
|
cval3 = 0xe0,
|
||
|
cval4 = 0xf0,
|
||
|
cval5 = 0xf8,
|
||
|
cval6 = 0xfc,
|
||
|
// vmaskN is mask to get value from first byte in N byte sequence
|
||
|
vmask2 = 0x1f,
|
||
|
vmask3 = 0xf,
|
||
|
vmask4 = 0x7,
|
||
|
vmask5 = 0x3,
|
||
|
vmask6 = 0x1,
|
||
|
// minN is minimum legal resulting value for N byte sequence
|
||
|
min2 = 0x80,
|
||
|
min3 = 0x800,
|
||
|
min4 = 0x10000,
|
||
|
min5 = 0x200000,
|
||
|
min6 = 0x4000000,
|
||
|
max6 = 0x7fffffff
|
||
|
};
|
||
|
|
||
|
class UTF8Decoder : public Decoder {
|
||
|
public:
|
||
|
UTF8Decoder();
|
||
|
size_t decode(Char *, const char *, size_t, const char **);
|
||
|
Boolean convertOffset(unsigned long &offset) const;
|
||
|
private:
|
||
|
// value for encoding error
|
||
|
enum { invalid = 0xfffd };
|
||
|
Boolean recovering_;
|
||
|
PackedBoolean hadFirstChar_;
|
||
|
PackedBoolean hadByteOrderMark_;
|
||
|
};
|
||
|
|
||
|
class UTF8Encoder : public Encoder {
|
||
|
public:
|
||
|
UTF8Encoder();
|
||
|
void output(const Char *, size_t, OutputByteStream *);
|
||
|
};
|
||
|
|
||
|
Decoder *UTF8CodingSystem::makeDecoder() const
|
||
|
{
|
||
|
return new UTF8Decoder;
|
||
|
}
|
||
|
|
||
|
Encoder *UTF8CodingSystem::makeEncoder() const
|
||
|
{
|
||
|
return new UTF8Encoder;
|
||
|
}
|
||
|
|
||
|
|
||
|
UTF8Decoder::UTF8Decoder()
|
||
|
: recovering_(0), hadFirstChar_(0), hadByteOrderMark_(0)
|
||
|
{
|
||
|
}
|
||
|
|
||
|
size_t UTF8Decoder::decode(Char *to, const char *s,
|
||
|
size_t slen, const char **result)
|
||
|
{
|
||
|
// Check for byte-order mark
|
||
|
if (!hadFirstChar_ && slen >= 3) {
|
||
|
hadFirstChar_ = 1;
|
||
|
|
||
|
if ((unsigned char)s[0] == 0xEF &&
|
||
|
(unsigned char)s[1] == 0xBB &&
|
||
|
(unsigned char)s[2] == 0xBF) {
|
||
|
s += 3;
|
||
|
slen -= 3;
|
||
|
hadByteOrderMark_ = 1;
|
||
|
}
|
||
|
}
|
||
|
Char *start = to;
|
||
|
const unsigned char *us = (const unsigned char *)s;
|
||
|
if (recovering_) {
|
||
|
recovering_ = 0;
|
||
|
goto recover;
|
||
|
}
|
||
|
while (slen > 0) {
|
||
|
unsigned c0;
|
||
|
c0 = us[0];
|
||
|
if ((c0 & cmask1) == cval1) {
|
||
|
*to++ = c0;
|
||
|
us++;
|
||
|
slen--;
|
||
|
}
|
||
|
else if ((c0 & cmask2) == cval2) {
|
||
|
if (slen < 2)
|
||
|
goto done;
|
||
|
unsigned c1 = us[1] ^ 0x80;
|
||
|
if (c1 & 0xc0)
|
||
|
goto error;
|
||
|
unsigned c = ((c0 & vmask2) << 6) | c1;
|
||
|
if (c < min2)
|
||
|
c = invalid;
|
||
|
*to++ = c;
|
||
|
slen -= 2;
|
||
|
us += 2;
|
||
|
}
|
||
|
else if ((c0 & cmask3) == cval3) {
|
||
|
if (slen < 3)
|
||
|
goto done;
|
||
|
unsigned c1 = us[1] ^ 0x80;
|
||
|
unsigned c2 = us[2] ^ 0x80;
|
||
|
if ((c1 | c2) & 0xc0)
|
||
|
goto error;
|
||
|
unsigned c = ((((c0 & vmask3) << 6) | c1) << 6) | c2;
|
||
|
if (c < min3)
|
||
|
c = invalid;
|
||
|
*to++ = c;
|
||
|
slen -= 3;
|
||
|
us += 3;
|
||
|
}
|
||
|
else if ((c0 & cmask4) == cval4) {
|
||
|
if (slen < 4)
|
||
|
goto done;
|
||
|
unsigned c1 = us[1] ^ 0x80;
|
||
|
unsigned c2 = us[2] ^ 0x80;
|
||
|
unsigned c3 = us[3] ^ 0x80;
|
||
|
if ((c1 | c2 | c3) & 0xc0)
|
||
|
goto error;
|
||
|
if (charMax < min5 - 1)
|
||
|
*to++ = invalid;
|
||
|
else {
|
||
|
unsigned long c = ((((c0 & vmask4) << 6) | c1) << 6) | c2;
|
||
|
c = (c << 6) | c3;
|
||
|
if (c < min4)
|
||
|
c = invalid;
|
||
|
*to++ = c;
|
||
|
}
|
||
|
slen -= 4;
|
||
|
us += 4;
|
||
|
}
|
||
|
else if ((c0 & cmask5) == cval5) {
|
||
|
if (slen < 5)
|
||
|
goto done;
|
||
|
unsigned c1 = us[1] ^ 0x80;
|
||
|
unsigned c2 = us[2] ^ 0x80;
|
||
|
unsigned c3 = us[3] ^ 0x80;
|
||
|
unsigned c4 = us[4] ^ 0x80;
|
||
|
if ((c1 | c2 | c3 | c4) & 0xc0)
|
||
|
goto error;
|
||
|
if (charMax < min6 - 1)
|
||
|
*to++ = invalid;
|
||
|
else {
|
||
|
unsigned long c = ((((c0 & vmask5) << 6) | c1) << 6) | c2;
|
||
|
c = (((c << 6) | c3) << 6) | c4;
|
||
|
if (c < min5)
|
||
|
c = invalid;
|
||
|
*to++ = c;
|
||
|
}
|
||
|
slen -= 5;
|
||
|
us += 5;
|
||
|
}
|
||
|
else if ((c0 & cmask6) == cval6) {
|
||
|
if (slen < 6)
|
||
|
goto done;
|
||
|
unsigned c1 = us[1] ^ 0x80;
|
||
|
unsigned c2 = us[2] ^ 0x80;
|
||
|
unsigned c3 = us[3] ^ 0x80;
|
||
|
unsigned c4 = us[4] ^ 0x80;
|
||
|
unsigned c5 = us[5] ^ 0x80;
|
||
|
if ((c1 | c2 | c3 | c4 | c5) & 0xc0)
|
||
|
goto error;
|
||
|
if (charMax < max6)
|
||
|
*to++ = invalid;
|
||
|
else {
|
||
|
unsigned long c = ((((c0 & vmask6) << 6) | c1) << 6) | c2;
|
||
|
c = (((((c << 6) | c3) << 6) | c4) << 6) | c5;
|
||
|
if (c < min6)
|
||
|
c = invalid;
|
||
|
*to++ = c;
|
||
|
}
|
||
|
slen -= 6;
|
||
|
us += 6;
|
||
|
}
|
||
|
else {
|
||
|
error:
|
||
|
us++;
|
||
|
slen--;
|
||
|
*to++ = invalid;
|
||
|
recover:
|
||
|
for (;;) {
|
||
|
if (slen == 0) {
|
||
|
recovering_ = 1;
|
||
|
goto done;
|
||
|
}
|
||
|
if ((*us & 0xc0) != 0x80)
|
||
|
break;
|
||
|
us++;
|
||
|
slen--;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
done:
|
||
|
*result = (char *)us;
|
||
|
return to - start;
|
||
|
}
|
||
|
|
||
|
Boolean UTF8Decoder::convertOffset(unsigned long &n) const
|
||
|
{
|
||
|
if (hadByteOrderMark_)
|
||
|
n += 3;
|
||
|
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
UTF8Encoder::UTF8Encoder()
|
||
|
{
|
||
|
}
|
||
|
|
||
|
void UTF8Encoder::output(const Char *s, size_t n, OutputByteStream *sb)
|
||
|
{
|
||
|
for (; n > 0; s++, n--) {
|
||
|
Char c = *s;
|
||
|
if (c < min2)
|
||
|
sb->sputc((unsigned char)c);
|
||
|
else if (c < min3) {
|
||
|
sb->sputc((c >> 6) | cval2);
|
||
|
sb->sputc((c & 0x3f) | 0x80);
|
||
|
}
|
||
|
else if (c < min4) {
|
||
|
sb->sputc((c >> 12) | cval3);
|
||
|
sb->sputc(((c >> 6) & 0x3f) | 0x80);
|
||
|
sb->sputc((c & 0x3f) | 0x80);
|
||
|
}
|
||
|
else if (c < min5) {
|
||
|
sb->sputc((c >> 18) | cval4);
|
||
|
sb->sputc(((c >> 12) & 0x3f) | 0x80);
|
||
|
sb->sputc(((c >> 6) & 0x3f) | 0x80);
|
||
|
sb->sputc((c & 0x3f) | 0x80);
|
||
|
}
|
||
|
else if (c < min6) {
|
||
|
sb->sputc((c >> 24) | cval5);
|
||
|
sb->sputc(((c >> 18) & 0x3f) | 0x80);
|
||
|
sb->sputc(((c >> 12) & 0x3f) | 0x80);
|
||
|
sb->sputc(((c >> 6) & 0x3f) | 0x80);
|
||
|
sb->sputc((c & 0x3f) | 0x80);
|
||
|
}
|
||
|
else if (c <= max6) {
|
||
|
sb->sputc((c >> 30) | cval6);
|
||
|
sb->sputc(((c >> 24) & 0x3f) | 0x80);
|
||
|
sb->sputc(((c >> 18) & 0x3f) | 0x80);
|
||
|
sb->sputc(((c >> 12) & 0x3f) | 0x80);
|
||
|
sb->sputc(((c >> 6) & 0x3f) | 0x80);
|
||
|
sb->sputc((c & 0x3f) | 0x80);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
#ifdef SP_NAMESPACE
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
#else /* not SP_MULTI_BYTE */
|
||
|
|
||
|
#ifndef __GNUG__
|
||
|
static char non_empty_translation_unit; // sigh
|
||
|
#endif
|
||
|
|
||
|
#endif /* not SP_MULTI_BYTE */
|