425 lines
9.4 KiB
C++
425 lines
9.4 KiB
C++
|
// Copyright (c) 1994, 1997 James Clark, 2000 Matthias Clasen
|
||
|
// See the file COPYING for copying permission.
|
||
|
|
||
|
#ifdef __GNUG__
|
||
|
#pragma implementation
|
||
|
#endif
|
||
|
#include "splib.h"
|
||
|
|
||
|
#ifdef SP_MULTI_BYTE
|
||
|
|
||
|
#include "XMLCodingSystem.h"
|
||
|
#include "UTF8CodingSystem.h"
|
||
|
#include "UTF16CodingSystem.h"
|
||
|
#include "Fixed4CodingSystem.h"
|
||
|
#include "CodingSystemKit.h"
|
||
|
#include "Boolean.h"
|
||
|
#include "Owner.h"
|
||
|
#include "macros.h"
|
||
|
#include <stddef.h>
|
||
|
#include <string.h>
|
||
|
|
||
|
#ifdef SP_DECLARE_MEMMOVE
|
||
|
extern "C" {
|
||
|
void *memmove(void *, const void *, size_t);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
#ifdef SP_NAMESPACE
|
||
|
namespace SP_NAMESPACE {
|
||
|
#endif
|
||
|
|
||
|
const Char ISO646_TAB = 0x9;
|
||
|
const Char ISO646_LF = 0xA;
|
||
|
const Char ISO646_CR = 0xD;
|
||
|
const Char ISO646_SPACE = 0x20;
|
||
|
const Char ISO646_QUOT = 0x22;
|
||
|
const Char ISO646_APOS = 0x27;
|
||
|
const Char ISO646_LT = 0x3C;
|
||
|
const Char ISO646_EQUAL = 0x3D;
|
||
|
const Char ISO646_GT = 0x3E;
|
||
|
const Char ISO646_QUEST = 0x3F;
|
||
|
const Char ISO646_LETTER_a = 0x61;
|
||
|
const Char ISO646_LETTER_c = 0x63;
|
||
|
const Char ISO646_LETTER_d = 0x64;
|
||
|
const Char ISO646_LETTER_e = 0x65;
|
||
|
const Char ISO646_LETTER_g = 0x67;
|
||
|
const Char ISO646_LETTER_i = 0x69;
|
||
|
const Char ISO646_LETTER_l = 0x6C;
|
||
|
const Char ISO646_LETTER_m = 0x6D;
|
||
|
const Char ISO646_LETTER_n = 0x6E;
|
||
|
const Char ISO646_LETTER_o = 0x6F;
|
||
|
const Char ISO646_LETTER_x = 0x78;
|
||
|
|
||
|
class XMLDecoder : public Decoder {
|
||
|
public:
|
||
|
XMLDecoder(const InputCodingSystemKit *);
|
||
|
size_t decode(Char *to, const char *from, size_t fromLen,
|
||
|
const char **rest);
|
||
|
Boolean convertOffset(unsigned long &offset) const;
|
||
|
private:
|
||
|
|
||
|
// Don't keep parsing a PI longer than this.
|
||
|
// We want to avoid reading some enormous file into memory just because
|
||
|
// some quote was left off.
|
||
|
enum { piMaxSize = 1024*32 };
|
||
|
|
||
|
void initDecoderDefault();
|
||
|
void initDecoderPI();
|
||
|
Boolean extractEncoding(StringC &name);
|
||
|
static Boolean isWS(Char);
|
||
|
|
||
|
enum DetectPhase {
|
||
|
phaseInit,
|
||
|
phasePI,
|
||
|
phaseFinish
|
||
|
};
|
||
|
DetectPhase phase_;
|
||
|
Boolean byteOrderMark_;
|
||
|
Boolean lsbFirst_;
|
||
|
Boolean lswFirst_;
|
||
|
int guessBytesPerChar_;
|
||
|
Owner<Decoder> subDecoder_;
|
||
|
// Contains all the characters passed to caller that were
|
||
|
// not produced by subDecoder_.
|
||
|
StringC pi_;
|
||
|
Char piLiteral_;
|
||
|
const InputCodingSystemKit *kit_;
|
||
|
};
|
||
|
|
||
|
XMLCodingSystem::XMLCodingSystem(const InputCodingSystemKit *kit)
|
||
|
: kit_(kit)
|
||
|
{
|
||
|
}
|
||
|
|
||
|
Decoder *XMLCodingSystem::makeDecoder() const
|
||
|
{
|
||
|
return new XMLDecoder(kit_);
|
||
|
}
|
||
|
|
||
|
Encoder *XMLCodingSystem::makeEncoder() const
|
||
|
{
|
||
|
UTF8CodingSystem utf8;
|
||
|
return utf8.makeEncoder();
|
||
|
}
|
||
|
|
||
|
XMLDecoder::XMLDecoder(const InputCodingSystemKit *kit)
|
||
|
: Decoder(1),
|
||
|
kit_(kit),
|
||
|
phase_(phaseInit),
|
||
|
byteOrderMark_(0),
|
||
|
lsbFirst_(0),
|
||
|
lswFirst_(0),
|
||
|
guessBytesPerChar_(1),
|
||
|
piLiteral_(0)
|
||
|
{
|
||
|
}
|
||
|
|
||
|
size_t XMLDecoder::decode(Char *to, const char *from, size_t fromLen,
|
||
|
const char **rest)
|
||
|
{
|
||
|
if (phase_ == phaseFinish)
|
||
|
return subDecoder_->decode(to, from, fromLen, rest);
|
||
|
if (phase_ == phaseInit) {
|
||
|
if (fromLen == 0) {
|
||
|
*rest = from;
|
||
|
return 0;
|
||
|
}
|
||
|
switch ((unsigned char)*from) {
|
||
|
case 0x00:
|
||
|
case 0x3C:
|
||
|
case 0xFF:
|
||
|
case 0xFE:
|
||
|
if (fromLen < 2) {
|
||
|
*rest = from;
|
||
|
return 0;
|
||
|
}
|
||
|
switch (((unsigned char)from[0] << 8) | (unsigned char)from[1]) {
|
||
|
case 0xFEFF:
|
||
|
phase_ = phasePI;
|
||
|
byteOrderMark_ = 1;
|
||
|
guessBytesPerChar_ = 2;
|
||
|
from += 2;
|
||
|
fromLen -= 2;
|
||
|
break;
|
||
|
case 0xFFFE:
|
||
|
lsbFirst_ = 1;
|
||
|
phase_ = phasePI;
|
||
|
byteOrderMark_ = 1;
|
||
|
guessBytesPerChar_ = 2;
|
||
|
from += 2;
|
||
|
fromLen -= 2;
|
||
|
break;
|
||
|
case 0x3C3F:
|
||
|
phase_ = phasePI;
|
||
|
break;
|
||
|
case 0x0000:
|
||
|
case 0x3C00:
|
||
|
case 0x003C:
|
||
|
if (fromLen < 4) {
|
||
|
*rest = from;
|
||
|
return 0;
|
||
|
}
|
||
|
switch (((unsigned char)from[0] << 24)
|
||
|
| ((unsigned char)from[1] << 16)
|
||
|
| ((unsigned char)from[2] << 8)
|
||
|
| (unsigned char)from[3]) {
|
||
|
case 0x0000003C:
|
||
|
lsbFirst_ = 0;
|
||
|
lswFirst_ = 0;
|
||
|
phase_ = phasePI;
|
||
|
guessBytesPerChar_ = 4;
|
||
|
break;
|
||
|
case 0x00003C00:
|
||
|
lsbFirst_ = 1;
|
||
|
lswFirst_ = 0;
|
||
|
phase_ = phasePI;
|
||
|
guessBytesPerChar_ = 4;
|
||
|
break;
|
||
|
case 0x003C0000:
|
||
|
lsbFirst_ = 0;
|
||
|
lswFirst_ = 1;
|
||
|
phase_ = phasePI;
|
||
|
guessBytesPerChar_ = 4;
|
||
|
break;
|
||
|
case 0x3C000000:
|
||
|
lsbFirst_ = 1;
|
||
|
lswFirst_ = 1;
|
||
|
phase_ = phasePI;
|
||
|
guessBytesPerChar_ = 4;
|
||
|
break;
|
||
|
case 0x003C003F:
|
||
|
lsbFirst_ = 1;
|
||
|
phase_ = phasePI;
|
||
|
guessBytesPerChar_ = 2;
|
||
|
break;
|
||
|
case 0x3C003F00:
|
||
|
lsbFirst_ = 0;
|
||
|
phase_ = phasePI;
|
||
|
guessBytesPerChar_ = 2;
|
||
|
break;
|
||
|
default:
|
||
|
break;
|
||
|
}
|
||
|
default:
|
||
|
break;
|
||
|
}
|
||
|
if (phase_ == phasePI)
|
||
|
break;
|
||
|
// fall through
|
||
|
default:
|
||
|
phase_ = phaseFinish;
|
||
|
guessBytesPerChar_ = 1;
|
||
|
initDecoderDefault();
|
||
|
return subDecoder_->decode(to, from, fromLen, rest);
|
||
|
}
|
||
|
}
|
||
|
ASSERT(phase_ == phasePI);
|
||
|
Char *p = to;
|
||
|
for (; fromLen > guessBytesPerChar_;
|
||
|
fromLen -= guessBytesPerChar_, from += guessBytesPerChar_) {
|
||
|
if (!piLiteral_ && pi_.size() > 0 && pi_[pi_.size() - 1] == ISO646_GT) {
|
||
|
initDecoderPI();
|
||
|
phase_ = phaseFinish;
|
||
|
return (p - to) + subDecoder_->decode(p, from, fromLen, rest);
|
||
|
}
|
||
|
Char c;
|
||
|
switch (guessBytesPerChar_) {
|
||
|
case 1:
|
||
|
c = (unsigned char)from[0];
|
||
|
break;
|
||
|
case 2:
|
||
|
c = lsbFirst_ ? ((unsigned char)from[1] << 8) | (unsigned char)from[0]
|
||
|
: ((unsigned char)from[0] << 8) | (unsigned char)from[1];
|
||
|
break;
|
||
|
case 4:
|
||
|
{
|
||
|
size_t shift0 = 8*(!lsbFirst_ + 2*!lswFirst_);
|
||
|
size_t shift1 = 8*(lsbFirst_ + 2*!lswFirst_);
|
||
|
size_t shift2 = 8*(!lsbFirst_ + 2*lswFirst_);
|
||
|
size_t shift3 = 8*(lsbFirst_ + 2*lswFirst_);
|
||
|
c = ((unsigned char)from[0] << shift0)
|
||
|
| ((unsigned char)from[1] << shift1)
|
||
|
| ((unsigned char)from[2] << shift2)
|
||
|
| ((unsigned char)from[3] << shift3);
|
||
|
}
|
||
|
break;
|
||
|
default:
|
||
|
CANNOT_HAPPEN();
|
||
|
}
|
||
|
static const Char startBytes[] = {
|
||
|
ISO646_LT, ISO646_QUEST, ISO646_LETTER_x, ISO646_LETTER_m, ISO646_LETTER_l
|
||
|
};
|
||
|
// Stop accumulating the PI if we get characters that are illegal in the PI.
|
||
|
if (c == 0
|
||
|
|| c >= 0x7F
|
||
|
|| (pi_.size() > 0 && c == ISO646_LT)
|
||
|
|| pi_.size() > piMaxSize
|
||
|
|| (pi_.size() < 5 && c != startBytes[pi_.size()])
|
||
|
|| (pi_.size() == 5 && !isWS(c))) {
|
||
|
initDecoderDefault();
|
||
|
phase_ = phaseFinish;
|
||
|
break;
|
||
|
}
|
||
|
*p++ = c;
|
||
|
pi_ += c;
|
||
|
if (piLiteral_) {
|
||
|
if (c == piLiteral_)
|
||
|
piLiteral_ = 0;
|
||
|
}
|
||
|
else if (c == ISO646_QUOT || c == ISO646_APOS)
|
||
|
piLiteral_ = c;
|
||
|
}
|
||
|
size_t n = p - to;
|
||
|
if (phase_ == phaseFinish && fromLen > 0)
|
||
|
n += subDecoder_->decode(p, from, fromLen, rest);
|
||
|
else
|
||
|
*rest = from;
|
||
|
return n;
|
||
|
}
|
||
|
|
||
|
Boolean XMLDecoder::convertOffset(unsigned long &n) const
|
||
|
{
|
||
|
if (n <= pi_.size())
|
||
|
n *= guessBytesPerChar_;
|
||
|
else {
|
||
|
if (!subDecoder_)
|
||
|
return 0;
|
||
|
unsigned long tem = n - pi_.size();
|
||
|
if (!subDecoder_->convertOffset(tem))
|
||
|
return 0;
|
||
|
n = tem + pi_.size() * guessBytesPerChar_;
|
||
|
}
|
||
|
if (byteOrderMark_)
|
||
|
n += 2;
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
void XMLDecoder::initDecoderDefault()
|
||
|
{
|
||
|
switch (guessBytesPerChar_) {
|
||
|
case 1:
|
||
|
{
|
||
|
UTF8CodingSystem utf8;
|
||
|
subDecoder_ = utf8.makeDecoder();
|
||
|
}
|
||
|
break;
|
||
|
case 2:
|
||
|
{
|
||
|
UTF16CodingSystem utf16;
|
||
|
subDecoder_ = utf16.makeDecoder(lsbFirst_);
|
||
|
}
|
||
|
break;
|
||
|
case 4:
|
||
|
{
|
||
|
Fixed4CodingSystem utf32;
|
||
|
subDecoder_ = utf32.makeDecoder(lsbFirst_, lswFirst_);
|
||
|
}
|
||
|
break;
|
||
|
default:
|
||
|
CANNOT_HAPPEN();
|
||
|
}
|
||
|
minBytesPerChar_ = subDecoder_->minBytesPerChar();
|
||
|
}
|
||
|
|
||
|
void XMLDecoder::initDecoderPI()
|
||
|
{
|
||
|
StringC name;
|
||
|
if (!extractEncoding(name))
|
||
|
initDecoderDefault();
|
||
|
const char *dummy;
|
||
|
static const UnivCharsetDesc::Range range = { 0, 128, 0 };
|
||
|
CharsetInfo piCharset(UnivCharsetDesc(&range, 1));
|
||
|
const InputCodingSystem *ics
|
||
|
= kit_->makeInputCodingSystem(name,
|
||
|
piCharset,
|
||
|
0,
|
||
|
dummy);
|
||
|
if (ics) {
|
||
|
subDecoder_ = ics->makeDecoder(lsbFirst_, lswFirst_);
|
||
|
minBytesPerChar_ = subDecoder_->minBytesPerChar();
|
||
|
}
|
||
|
if (!subDecoder_)
|
||
|
initDecoderDefault();
|
||
|
}
|
||
|
|
||
|
Boolean XMLDecoder::isWS(Char c)
|
||
|
{
|
||
|
switch (c) {
|
||
|
case ISO646_CR:
|
||
|
case ISO646_LF:
|
||
|
case ISO646_SPACE:
|
||
|
case ISO646_TAB:
|
||
|
return 1;
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
Boolean XMLDecoder::extractEncoding(StringC &name)
|
||
|
{
|
||
|
Char lit = 0;
|
||
|
for (size_t i = 5; i < pi_.size(); i++) {
|
||
|
if (!lit) {
|
||
|
if (pi_[i] == ISO646_APOS || pi_[i] == ISO646_QUOT)
|
||
|
lit = pi_[i];
|
||
|
else if (pi_[i] == ISO646_EQUAL) {
|
||
|
size_t j = i;
|
||
|
for (; j > 0; j--) {
|
||
|
if (!isWS(pi_[j - 1]))
|
||
|
break;
|
||
|
}
|
||
|
size_t nameEnd = j;
|
||
|
for (; j > 0; j--) {
|
||
|
if (isWS(pi_[j - 1]) || pi_[j - 1] == ISO646_QUOT || pi_[j - 1] == ISO646_APOS)
|
||
|
break;
|
||
|
}
|
||
|
static const Char encodingName[] = {
|
||
|
ISO646_LETTER_e, ISO646_LETTER_n, ISO646_LETTER_c, ISO646_LETTER_o,
|
||
|
ISO646_LETTER_d, ISO646_LETTER_i, ISO646_LETTER_n, ISO646_LETTER_g,
|
||
|
0
|
||
|
};
|
||
|
const Char *s = encodingName;
|
||
|
for (; *s && j < nameEnd; j++, s++)
|
||
|
if (pi_[j] != *s)
|
||
|
break;
|
||
|
if (j == nameEnd && *s == 0) {
|
||
|
size_t j = i + 1;
|
||
|
for (; j < pi_.size(); j++) {
|
||
|
if (!isWS(pi_[j]))
|
||
|
break;
|
||
|
}
|
||
|
if (pi_[j] == ISO646_QUOT || pi_[j] == ISO646_APOS) {
|
||
|
Char lit = pi_[j];
|
||
|
size_t nameStart = j + 1;
|
||
|
for (++j; j < pi_.size(); j++) {
|
||
|
if (pi_[j] == lit) {
|
||
|
if (j > nameStart) {
|
||
|
name.assign(&pi_[nameStart], j - nameStart);
|
||
|
return 1;
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
else if (pi_[i] == lit)
|
||
|
lit = 0;
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
#ifdef SP_NAMESPACE
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
#else /* not SP_MULTI_BYTE */
|
||
|
|
||
|
#ifndef __GNUG__
|
||
|
static char non_empty_translation_unit; // sigh
|
||
|
#endif
|
||
|
|
||
|
#endif /* not SP_MULTI_BYTE */
|