289 lines
7.8 KiB
C++
289 lines
7.8 KiB
C++
|
// Copyright (c) 1994, 1997 James Clark
|
||
|
// See the file COPYING for copying permission.
|
||
|
|
||
|
#ifdef __GNUG__
|
||
|
#pragma implementation
|
||
|
#endif
|
||
|
#include "splib.h"
|
||
|
#include "CharsetRegistry.h"
|
||
|
#include "CharsetInfo.h"
|
||
|
#include "StringC.h"
|
||
|
#include "types.h"
|
||
|
#include "macros.h"
|
||
|
|
||
|
#ifdef SP_NAMESPACE
|
||
|
namespace SP_NAMESPACE {
|
||
|
#endif
|
||
|
|
||
|
// Out-of-line destructor for the iterator base class used by the
// concrete iterators in this file; it has nothing to release.
CharsetRegistry::Iter::~Iter() {}
|
||
|
|
||
|
class CharsetRegistryRangeIter : public CharsetRegistry::Iter {
|
||
|
public:
|
||
|
CharsetRegistryRangeIter(const UnivCharsetDesc::Range *p, size_t n)
|
||
|
: p_(p), n_(n) { }
|
||
|
Boolean next(WideChar &min, WideChar &max, UnivChar &univ) {
|
||
|
if (n_) {
|
||
|
min = p_->descMin;
|
||
|
max = p_->descMin + (p_->count - 1);
|
||
|
univ = p_->univMin;
|
||
|
p_++;
|
||
|
n_--;
|
||
|
return 1;
|
||
|
}
|
||
|
else
|
||
|
return 0;
|
||
|
}
|
||
|
private:
|
||
|
const UnivCharsetDesc::Range *p_;
|
||
|
size_t n_;
|
||
|
};
|
||
|
|
||
|
class CharsetRegistryDescIter : public CharsetRegistry::Iter {
|
||
|
public:
|
||
|
CharsetRegistryDescIter(const unsigned short *p)
|
||
|
: p_(p + 2), n_(p[0]), c_(p[1]) { }
|
||
|
Boolean next(WideChar &min, WideChar &max, UnivChar &univ) {
|
||
|
if (n_ == 0) {
|
||
|
n_ = *p_;
|
||
|
if (n_ == 0)
|
||
|
return 0;
|
||
|
p_++;
|
||
|
c_ = *p_++;
|
||
|
}
|
||
|
int i = 1;
|
||
|
for (; i < n_; i++)
|
||
|
if (p_[i] != p_[i - 1] + 1)
|
||
|
break;
|
||
|
min = c_;
|
||
|
max = min + (i - 1);
|
||
|
univ = p_[0];
|
||
|
p_ += i;
|
||
|
c_ += i;
|
||
|
n_ -= i;
|
||
|
return 1;
|
||
|
}
|
||
|
private:
|
||
|
const unsigned short *p_;
|
||
|
size_t n_;
|
||
|
WideChar c_;
|
||
|
};
|
||
|
|
||
|
static struct {
|
||
|
const char *esc;
|
||
|
CharsetRegistry::ISORegistrationNumber number;
|
||
|
} escTable[] = {
|
||
|
{ "\x1B\x25\x40", CharsetRegistry::ISO646_ASCII_G0 },
|
||
|
{ "\x1B\x28\x40", CharsetRegistry::ISO646_ASCII_G0 },
|
||
|
{ "\x1B\x28\x42", CharsetRegistry::ISO646_ASCII_G0 }, // ASCII
|
||
|
{ "\x1B\x21\x40", CharsetRegistry::ISO646_C0 },
|
||
|
{ "\x1B\x2D\x41", CharsetRegistry::ISO8859_1 },
|
||
|
{ "\x1B\x2D\x42", CharsetRegistry::ISO8859_2 },
|
||
|
{ "\x1B\x2D\x43", CharsetRegistry::ISO8859_3 },
|
||
|
{ "\x1B\x2D\x44", CharsetRegistry::ISO8859_4 },
|
||
|
{ "\x1B\x2D\x4C", CharsetRegistry::ISO8859_5 },
|
||
|
{ "\x1B\x2D\x47", CharsetRegistry::ISO8859_6 },
|
||
|
{ "\x1B\x2D\x46", CharsetRegistry::ISO8859_7 },
|
||
|
{ "\x1B\x2D\x48", CharsetRegistry::ISO8859_8 },
|
||
|
{ "\x1B\x2D\x4D", CharsetRegistry::ISO8859_9 },
|
||
|
{ "\x1B\x28\x4A", CharsetRegistry::ISO646_JIS_G0 },
|
||
|
{ "\x1B\x28\x49", CharsetRegistry::JIS0201 },
|
||
|
{ "\x1B\x24\x42", CharsetRegistry::JIS0208 },
|
||
|
{ "\x1B\x26\x40\x1B\x24\x42", CharsetRegistry::JIS0208 },
|
||
|
{ "\x1B\x24\x28\x44", CharsetRegistry::JIS0212 },
|
||
|
{ "\x1B\x24\x41", CharsetRegistry::GB2312 },
|
||
|
{ "\x1B\x24\x28\x43", CharsetRegistry::KSC5601 },
|
||
|
{ "\x1B\x25\x2F\x40", CharsetRegistry::ISO10646_UCS2 },
|
||
|
{ "\x1B\x25\x2F\x41", CharsetRegistry::ISO10646_UCS4 },
|
||
|
{ "\x1B\x25\x2F\x43", CharsetRegistry::ISO10646_UCS2 },
|
||
|
{ "\x1B\x25\x2F\x44", CharsetRegistry::ISO10646_UCS4 },
|
||
|
{ "\x1B\x25\x2F\x45", CharsetRegistry::ISO10646_UCS2 },
|
||
|
{ "\x1B\x25\x2F\x46", CharsetRegistry::ISO10646_UCS4 },
|
||
|
};
|
||
|
|
||
|
// Character sets that can be described by a handful of contiguous ranges.
// NOTE(review): the initializers presumably follow the member order
// { descMin, count, univMin } of UnivCharsetDesc::Range -- confirm against
// its declaration.

static const UnivCharsetDesc::Range iso646_ascii[] = { { 0, 128, 0 } };

static const UnivCharsetDesc::Range iso646_C0[] = {
  { 0, 32, 0 },
  { 127, 1, 127 }
};

static const UnivCharsetDesc::Range iso6429[] = { { 0, 32, 128 } };

static const UnivCharsetDesc::Range iso8859_1[] = { { 32, 96, 160 } };

static const UnivCharsetDesc::Range iso10646_ucs2[] = { { 0, 65536, 0 } };

static const UnivCharsetDesc::Range iso10646_ucs4[] = { { 0, 0x80000000, 0 } };
|
||
|
|
||
|
static struct {
|
||
|
CharsetRegistry::ISORegistrationNumber number;
|
||
|
const UnivCharsetDesc::Range *ranges;
|
||
|
size_t nRanges;
|
||
|
} rangeTable[] = {
|
||
|
{ CharsetRegistry::ISO646_ASCII_G0, iso646_ascii, SIZEOF(iso646_ascii) },
|
||
|
{ CharsetRegistry::ISO646_C0, iso646_C0, SIZEOF(iso646_C0) },
|
||
|
{ CharsetRegistry::ISO6429, iso6429, SIZEOF(iso6429) },
|
||
|
{ CharsetRegistry::ISO8859_1, iso8859_1, SIZEOF(iso8859_1) },
|
||
|
{ CharsetRegistry::ISO10646_UCS2, iso10646_ucs2, SIZEOF(iso10646_ucs2) },
|
||
|
{ CharsetRegistry::ISO10646_UCS4, iso10646_ucs4, SIZEOF(iso10646_ucs4) },
|
||
|
};
|
||
|
|
||
|
// Character set descriptions whose contents are pulled in from generated
// headers.  Each array is consumed by CharsetRegistryDescIter via descTable.

static const unsigned short iso8859_2[] = {
#include "iso8859-2.h"
};

static const unsigned short iso8859_3[] = {
#include "iso8859-3.h"
};

static const unsigned short iso8859_4[] = {
#include "iso8859-4.h"
};

static const unsigned short iso8859_5[] = {
#include "iso8859-5.h"
};

static const unsigned short iso8859_6[] = {
#include "iso8859-6.h"
};

static const unsigned short iso8859_7[] = {
#include "iso8859-7.h"
};

static const unsigned short iso8859_8[] = {
#include "iso8859-8.h"
};

static const unsigned short iso8859_9[] = {
#include "iso8859-9.h"
};

static const unsigned short koi8_r[] = {
#include "koi8-r.h"
};

static const unsigned short iso646_jis_G0[] = {
#include "iso646-jis.h"
};

static const unsigned short jis0201[] = {
#include "jis0201.h"
};
|
||
|
|
||
|
#ifdef SP_MULTI_BYTE

// Additional descriptions that are only compiled in when SP_MULTI_BYTE
// is defined.

static const unsigned short jis0208[] = {
#include "jis0208.h"
};

static const unsigned short jis0212[] = {
#include "jis0212.h"
};

static const unsigned short gb2312[] = {
#include "gb2312.h"
};

static const unsigned short ksc5601[] = {
#include "ksc5601.h"
};

static const unsigned short big5[] = {
#include "big5.h"
};

#endif /* SP_MULTI_BYTE */
|
||
|
|
||
|
static const struct {
|
||
|
CharsetRegistry::ISORegistrationNumber number;
|
||
|
const unsigned short *desc;
|
||
|
} descTable[] = {
|
||
|
{ CharsetRegistry::ISO8859_2, iso8859_2 },
|
||
|
{ CharsetRegistry::ISO8859_3, iso8859_3 },
|
||
|
{ CharsetRegistry::ISO8859_4, iso8859_4 },
|
||
|
{ CharsetRegistry::ISO8859_5, iso8859_5 },
|
||
|
{ CharsetRegistry::ISO8859_6, iso8859_6 },
|
||
|
{ CharsetRegistry::ISO8859_7, iso8859_7 },
|
||
|
{ CharsetRegistry::ISO8859_8, iso8859_8 },
|
||
|
{ CharsetRegistry::ISO8859_9, iso8859_9 },
|
||
|
{ CharsetRegistry::KOI8_R, koi8_r },
|
||
|
{ CharsetRegistry::ISO646_JIS_G0, iso646_jis_G0 },
|
||
|
{ CharsetRegistry::JIS0201, jis0201 },
|
||
|
#ifdef SP_MULTI_BYTE
|
||
|
{ CharsetRegistry::JIS0208, jis0208 },
|
||
|
{ CharsetRegistry::JIS0212, jis0212 },
|
||
|
{ CharsetRegistry::GB2312, gb2312 },
|
||
|
{ CharsetRegistry::KSC5601, ksc5601 },
|
||
|
{ CharsetRegistry::BIG5, big5 },
|
||
|
#endif
|
||
|
};
|
||
|
|
||
|
|
||
|
// Look up the registration number for an escape sequence given in textual
// form, i.e. the word ESC and byte values written as "high/low" decimal
// pairs separated by single spaces (for example "ESC 2/5 4/0").
//
// sequence  the textual escape sequence, in the character set `charset`.
// charset   used to translate the execution characters and digits that
//           appear in the textual form.
//
// Returns the number of the first escTable entry whose textual form equals
// the canonicalized sequence, or UNREGISTERED if none matches.
CharsetRegistry::ISORegistrationNumber
CharsetRegistry::getRegistrationNumber(const StringC &sequence,
                                       const CharsetInfo &charset)
{
  // These translations do not depend on the loop, so do them once.
  const Char lowerE = charset.execToDesc('e');
  const Char lowerS = charset.execToDesc('s');
  const Char lowerC = charset.execToDesc('c');
  const Char upperE = charset.execToDesc('E');
  const Char upperS = charset.execToDesc('S');
  const Char upperC = charset.execToDesc('C');
  const Char zero = charset.execToDesc('0');
  const Char space = charset.execToDesc(' ');

  // Canonicalize the escape sequence by mapping esc -> ESC,
  // removing leading zeros from escape sequences, and removing
  // initial spaces.
  StringC s;
  for (size_t i = 0; i < sequence.size(); i++) {
    Char c = sequence[i];
    if (c == lowerE)
      s += upperE;
    else if (c == lowerS)
      s += upperS;
    else if (c == lowerC)
      s += upperC;
    else if (charset.digitWeight(c) >= 0
             && s.size() > 0
             && s[s.size() - 1] == zero
             && (s.size() == 1
                 || charset.digitWeight(s[s.size() - 2]) < 0))
      // The previous character is a '0' that starts a number (it is at
      // the very beginning or follows a non-digit) and another digit
      // follows it, so that '0' is a leading zero: overwrite it.  A '0'
      // that follows a digit, as in "10", is significant and is kept.
      s[s.size() - 1] = c;
    else if (c != space || s.size() > 0)
      s += c;
  }

  // Render each table entry in the same textual form and compare.
  for (size_t i = 0; i < SIZEOF(escTable); i++) {
    StringC esc;
    for (const char *p = escTable[i].esc; *p; p++) {
      if (*p == 0x1B)
        esc += charset.execToDesc("ESC");
      else {
        // Write the byte as <high nibble>/<low nibble>, each nibble in
        // decimal (0..15) without leading zeros.
        static const char digits[] = "0123456789";
        int c = (unsigned char)*p >> 4;
        if (c >= 10)
          esc += charset.execToDesc('1');
        esc += charset.execToDesc(digits[c % 10]);
        esc += charset.execToDesc('/');
        c = (*p & 0xf);
        if (c >= 10)
          esc += charset.execToDesc('1');
        esc += charset.execToDesc(digits[c % 10]);
      }
      // Single space between elements, none after the last one.
      if (p[1])
        esc += charset.execToDesc(' ');
    }
    if (s == esc)
      return escTable[i].number;
  }
  return UNREGISTERED;
}
|
||
|
|
||
|
// Create an iterator over the character set registered under `number`.
// rangeTable is searched first, then descTable; the first matching entry
// wins.  Returns 0 when the number appears in neither table.
// The iterator is allocated with new.
// NOTE(review): presumably the caller is responsible for deleting it --
// confirm at the call sites.
CharsetRegistry::Iter *CharsetRegistry::makeIter(ISORegistrationNumber number)
{
  size_t idx;

  for (idx = 0; idx < SIZEOF(rangeTable); ++idx)
    if (rangeTable[idx].number == number)
      return new CharsetRegistryRangeIter(rangeTable[idx].ranges,
                                          rangeTable[idx].nRanges);

  for (idx = 0; idx < SIZEOF(descTable); ++idx)
    if (descTable[idx].number == number)
      return new CharsetRegistryDescIter(descTable[idx].desc);

  return 0;
}
|
||
|
|
||
|
#ifdef SP_NAMESPACE
|
||
|
}
|
||
|
#endif
|