// Copyright (c) 1994, 1997 James Clark, 2000 Matthias Clasen // See the file COPYING for copying permission. #ifdef __GNUG__ #pragma implementation #endif #include "splib.h" #ifdef SP_MULTI_BYTE #include "XMLCodingSystem.h" #include "UTF8CodingSystem.h" #include "UTF16CodingSystem.h" #include "Fixed4CodingSystem.h" #include "CodingSystemKit.h" #include "Boolean.h" #include "Owner.h" #include "macros.h" #include #include #ifdef SP_DECLARE_MEMMOVE extern "C" { void *memmove(void *, const void *, size_t); } #endif #ifdef SP_NAMESPACE namespace SP_NAMESPACE { #endif const Char ISO646_TAB = 0x9; const Char ISO646_LF = 0xA; const Char ISO646_CR = 0xD; const Char ISO646_SPACE = 0x20; const Char ISO646_QUOT = 0x22; const Char ISO646_APOS = 0x27; const Char ISO646_LT = 0x3C; const Char ISO646_EQUAL = 0x3D; const Char ISO646_GT = 0x3E; const Char ISO646_QUEST = 0x3F; const Char ISO646_LETTER_a = 0x61; const Char ISO646_LETTER_c = 0x63; const Char ISO646_LETTER_d = 0x64; const Char ISO646_LETTER_e = 0x65; const Char ISO646_LETTER_g = 0x67; const Char ISO646_LETTER_i = 0x69; const Char ISO646_LETTER_l = 0x6C; const Char ISO646_LETTER_m = 0x6D; const Char ISO646_LETTER_n = 0x6E; const Char ISO646_LETTER_o = 0x6F; const Char ISO646_LETTER_x = 0x78; class XMLDecoder : public Decoder { public: XMLDecoder(const InputCodingSystemKit *); size_t decode(Char *to, const char *from, size_t fromLen, const char **rest); Boolean convertOffset(unsigned long &offset) const; private: // Don't keep parsing a PI longer than this. // We want to avoid reading some enormous file into memory just because // some quote was left off. enum { piMaxSize = 1024*32 }; void initDecoderDefault(); void initDecoderPI(); Boolean extractEncoding(StringC &name); static Boolean isWS(Char); enum DetectPhase { phaseInit, phasePI, phaseFinish }; DetectPhase phase_; Boolean byteOrderMark_; Boolean lsbFirst_; Boolean lswFirst_; int guessBytesPerChar_; Owner subDecoder_; // Contains all the characters passed to caller that were // not produced by subDecoder_. StringC pi_; Char piLiteral_; const InputCodingSystemKit *kit_; }; XMLCodingSystem::XMLCodingSystem(const InputCodingSystemKit *kit) : kit_(kit) { } Decoder *XMLCodingSystem::makeDecoder() const { return new XMLDecoder(kit_); } Encoder *XMLCodingSystem::makeEncoder() const { UTF8CodingSystem utf8; return utf8.makeEncoder(); } XMLDecoder::XMLDecoder(const InputCodingSystemKit *kit) : Decoder(1), kit_(kit), phase_(phaseInit), byteOrderMark_(0), lsbFirst_(0), lswFirst_(0), guessBytesPerChar_(1), piLiteral_(0) { } size_t XMLDecoder::decode(Char *to, const char *from, size_t fromLen, const char **rest) { if (phase_ == phaseFinish) return subDecoder_->decode(to, from, fromLen, rest); if (phase_ == phaseInit) { if (fromLen == 0) { *rest = from; return 0; } switch ((unsigned char)*from) { case 0x00: case 0x3C: case 0xFF: case 0xFE: if (fromLen < 2) { *rest = from; return 0; } switch (((unsigned char)from[0] << 8) | (unsigned char)from[1]) { case 0xFEFF: phase_ = phasePI; byteOrderMark_ = 1; guessBytesPerChar_ = 2; from += 2; fromLen -= 2; break; case 0xFFFE: lsbFirst_ = 1; phase_ = phasePI; byteOrderMark_ = 1; guessBytesPerChar_ = 2; from += 2; fromLen -= 2; break; case 0x3C3F: phase_ = phasePI; break; case 0x0000: case 0x3C00: case 0x003C: if (fromLen < 4) { *rest = from; return 0; } switch (((unsigned char)from[0] << 24) | ((unsigned char)from[1] << 16) | ((unsigned char)from[2] << 8) | (unsigned char)from[3]) { case 0x0000003C: lsbFirst_ = 0; lswFirst_ = 0; phase_ = phasePI; guessBytesPerChar_ = 4; break; case 0x00003C00: lsbFirst_ = 1; lswFirst_ = 0; phase_ = phasePI; guessBytesPerChar_ = 4; break; case 0x003C0000: lsbFirst_ = 0; lswFirst_ = 1; phase_ = phasePI; guessBytesPerChar_ = 4; break; case 0x3C000000: lsbFirst_ = 1; lswFirst_ = 1; phase_ = phasePI; guessBytesPerChar_ = 4; break; case 0x003C003F: lsbFirst_ = 1; phase_ = phasePI; guessBytesPerChar_ = 2; break; case 0x3C003F00: lsbFirst_ = 0; phase_ = phasePI; guessBytesPerChar_ = 2; break; default: break; } default: break; } if (phase_ == phasePI) break; // fall through default: phase_ = phaseFinish; guessBytesPerChar_ = 1; initDecoderDefault(); return subDecoder_->decode(to, from, fromLen, rest); } } ASSERT(phase_ == phasePI); Char *p = to; for (; fromLen > guessBytesPerChar_; fromLen -= guessBytesPerChar_, from += guessBytesPerChar_) { if (!piLiteral_ && pi_.size() > 0 && pi_[pi_.size() - 1] == ISO646_GT) { initDecoderPI(); phase_ = phaseFinish; return (p - to) + subDecoder_->decode(p, from, fromLen, rest); } Char c; switch (guessBytesPerChar_) { case 1: c = (unsigned char)from[0]; break; case 2: c = lsbFirst_ ? ((unsigned char)from[1] << 8) | (unsigned char)from[0] : ((unsigned char)from[0] << 8) | (unsigned char)from[1]; break; case 4: { size_t shift0 = 8*(!lsbFirst_ + 2*!lswFirst_); size_t shift1 = 8*(lsbFirst_ + 2*!lswFirst_); size_t shift2 = 8*(!lsbFirst_ + 2*lswFirst_); size_t shift3 = 8*(lsbFirst_ + 2*lswFirst_); c = ((unsigned char)from[0] << shift0) | ((unsigned char)from[1] << shift1) | ((unsigned char)from[2] << shift2) | ((unsigned char)from[3] << shift3); } break; default: CANNOT_HAPPEN(); } static const Char startBytes[] = { ISO646_LT, ISO646_QUEST, ISO646_LETTER_x, ISO646_LETTER_m, ISO646_LETTER_l }; // Stop accumulating the PI if we get characters that are illegal in the PI. if (c == 0 || c >= 0x7F || (pi_.size() > 0 && c == ISO646_LT) || pi_.size() > piMaxSize || (pi_.size() < 5 && c != startBytes[pi_.size()]) || (pi_.size() == 5 && !isWS(c))) { initDecoderDefault(); phase_ = phaseFinish; break; } *p++ = c; pi_ += c; if (piLiteral_) { if (c == piLiteral_) piLiteral_ = 0; } else if (c == ISO646_QUOT || c == ISO646_APOS) piLiteral_ = c; } size_t n = p - to; if (phase_ == phaseFinish && fromLen > 0) n += subDecoder_->decode(p, from, fromLen, rest); else *rest = from; return n; } Boolean XMLDecoder::convertOffset(unsigned long &n) const { if (n <= pi_.size()) n *= guessBytesPerChar_; else { if (!subDecoder_) return 0; unsigned long tem = n - pi_.size(); if (!subDecoder_->convertOffset(tem)) return 0; n = tem + pi_.size() * guessBytesPerChar_; } if (byteOrderMark_) n += 2; return 1; } void XMLDecoder::initDecoderDefault() { switch (guessBytesPerChar_) { case 1: { UTF8CodingSystem utf8; subDecoder_ = utf8.makeDecoder(); } break; case 2: { UTF16CodingSystem utf16; subDecoder_ = utf16.makeDecoder(lsbFirst_); } break; case 4: { Fixed4CodingSystem utf32; subDecoder_ = utf32.makeDecoder(lsbFirst_, lswFirst_); } break; default: CANNOT_HAPPEN(); } minBytesPerChar_ = subDecoder_->minBytesPerChar(); } void XMLDecoder::initDecoderPI() { StringC name; if (!extractEncoding(name)) initDecoderDefault(); const char *dummy; static const UnivCharsetDesc::Range range = { 0, 128, 0 }; CharsetInfo piCharset(UnivCharsetDesc(&range, 1)); const InputCodingSystem *ics = kit_->makeInputCodingSystem(name, piCharset, 0, dummy); if (ics) { subDecoder_ = ics->makeDecoder(lsbFirst_, lswFirst_); minBytesPerChar_ = subDecoder_->minBytesPerChar(); } if (!subDecoder_) initDecoderDefault(); } Boolean XMLDecoder::isWS(Char c) { switch (c) { case ISO646_CR: case ISO646_LF: case ISO646_SPACE: case ISO646_TAB: return 1; } return 0; } Boolean XMLDecoder::extractEncoding(StringC &name) { Char lit = 0; for (size_t i = 5; i < pi_.size(); i++) { if (!lit) { if (pi_[i] == ISO646_APOS || pi_[i] == ISO646_QUOT) lit = pi_[i]; else if (pi_[i] == ISO646_EQUAL) { size_t j = i; for (; j > 0; j--) { if (!isWS(pi_[j - 1])) break; } size_t nameEnd = j; for (; j > 0; j--) { if (isWS(pi_[j - 1]) || pi_[j - 1] == ISO646_QUOT || pi_[j - 1] == ISO646_APOS) break; } static const Char encodingName[] = { ISO646_LETTER_e, ISO646_LETTER_n, ISO646_LETTER_c, ISO646_LETTER_o, ISO646_LETTER_d, ISO646_LETTER_i, ISO646_LETTER_n, ISO646_LETTER_g, 0 }; const Char *s = encodingName; for (; *s && j < nameEnd; j++, s++) if (pi_[j] != *s) break; if (j == nameEnd && *s == 0) { size_t j = i + 1; for (; j < pi_.size(); j++) { if (!isWS(pi_[j])) break; } if (pi_[j] == ISO646_QUOT || pi_[j] == ISO646_APOS) { Char lit = pi_[j]; size_t nameStart = j + 1; for (++j; j < pi_.size(); j++) { if (pi_[j] == lit) { if (j > nameStart) { name.assign(&pi_[nameStart], j - nameStart); return 1; } break; } } } return 0; } } } else if (pi_[i] == lit) lit = 0; } return 0; } #ifdef SP_NAMESPACE } #endif #else /* not SP_MULTI_BYTE */ #ifndef __GNUG__ static char non_empty_translation_unit; // sigh #endif #endif /* not SP_MULTI_BYTE */