// Copyright (c) 1994 James Clark // See the file COPYING for copying permission. #ifdef __GNUG__ #pragma implementation #endif #include "splib.h" #include #include #include "macros.h" #include "types.h" #include "Syntax.h" #include "token.h" #include "Sd.h" #include "Mode.h" #include "ModeInfo.h" #ifdef SP_NAMESPACE namespace SP_NAMESPACE { #endif const unsigned REQUIRE_EMPTY_STARTTAG = 01; const unsigned REQUIRE_EMPTY_ENDTAG = 02; const unsigned REQUIRE_CONCUR = 04; const unsigned REQUIRE_LINK_OR_CONCUR = 010; const unsigned REQUIRE_NOT_KEEPRSRE = 020; const unsigned REQUIRE_FLAGS = 037; #define ULONG_BIT (CHAR_BIT * sizeof(unsigned long)) struct PackedTokenInfo { Token token; // token to be returned unsigned flags; unsigned char contents[2]; // components of the delimiter or d-i-c unsigned char modes[25]; // list of modes in which it is recognized, // terminated by EOM // a bit vector computed from modes (lo to hi) unsigned long modeBits[(nModes + ULONG_BIT - 1)/ULONG_BIT]; void computeModeBits(); Boolean inMode(Mode mode) const; }; const unsigned char SET = Syntax::nDelimGeneral; const unsigned char FUNCTION = SET + Syntax::nSet; const unsigned char NOTHING = UCHAR_MAX; const unsigned char EOM = 255; // end of modes static PackedTokenInfo tokenTable[] = { // Delimiters and delimiters in context { tokenAnd, 0, { Syntax::dAND, NOTHING }, { grpMode, EOM }}, { tokenCom, 0, { Syntax::dCOM, NOTHING }, { mdMode, mdMinusMode, mdPeroMode, sdMode, comMode, sdcomMode, piPasMode, EOM }}, { tokenCroDigit, 0, { Syntax::dCRO, SET + Syntax::digit }, { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode, rcconeMode, plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode, alitMode, alitaMode, aliteMode, talitMode, talitaMode, taliteMode, rcmsMode, EOM }}, { tokenCroNameStart, 0, { Syntax::dCRO, SET + Syntax::nameStart }, { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode, rcconeMode, plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode, alitMode, alitaMode, aliteMode, talitMode, talitaMode, taliteMode, rcmsMode, EOM }}, { tokenDsc, 0, { Syntax::dDSC, NOTHING }, { /* mdMode, */ asMode, dsMode, EOM }}, { tokenDso, 0, { Syntax::dDSO, NOTHING }, { mdMode, EOM }}, { tokenDtgc, 0, { Syntax::dDTGC, NOTHING }, { grpMode, EOM }}, { tokenDtgo, 0, { Syntax::dDTGO, NOTHING }, { grpMode, EOM }}, { tokenEroNameStart, 0, { Syntax::dERO, SET + Syntax::nameStart }, { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode, rcconeMode, alitMode, alitaMode, aliteMode, talitMode, talitaMode, taliteMode, rcmsMode, EOM }}, { tokenEroGrpo, REQUIRE_LINK_OR_CONCUR, { Syntax::dERO, Syntax::dGRPO }, { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode, rcconeMode, alitMode, alitaMode, aliteMode, talitMode, talitaMode, taliteMode, rcmsMode, EOM }}, { tokenEtago, 0, { Syntax::dETAGO, NOTHING }, { tagMode, EOM }}, { tokenEtagoNameStart, 0, { Syntax::dETAGO, SET + Syntax::nameStart }, { econMode, mconMode, cconMode, rcconMode, econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }}, { tokenEtagoTagc, REQUIRE_EMPTY_ENDTAG, { Syntax::dETAGO, Syntax::dTAGC }, { econMode, mconMode, cconMode, rcconMode, econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }}, { tokenEtagoGrpo, REQUIRE_CONCUR, { Syntax::dETAGO, Syntax::dGRPO }, { econMode, mconMode, cconMode, rcconMode, econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }}, { tokenGrpc, 0, { Syntax::dGRPC, NOTHING }, { grpMode, EOM }}, { tokenGrpo, 0, { Syntax::dGRPO, NOTHING }, { mdMode, mdMinusMode, grpMode, EOM }}, { tokenHcroHexDigit, 0, { Syntax::dHCRO, SET + Syntax::hexDigit }, { econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode, rcconeMode, plitMode, plitaMode, pliteMode, alitMode, alitaMode, aliteMode, talitMode, talitaMode, taliteMode, rcmsMode, EOM }}, { tokenLit, 0, { Syntax::dLIT, NOTHING }, { alitMode, talitMode, plitMode, sdplitMode, mlitMode, slitMode, sdslitMode, asMode, piPasMode, tagMode, mdMode, sdMode, grpMode, EOM }}, { tokenLita, 0, { Syntax::dLITA, NOTHING }, { alitaMode, talitaMode, plitaMode, sdplitaMode, mlitaMode, slitaMode, sdslitaMode, asMode, piPasMode, tagMode, mdMode, sdMode, grpMode, EOM }}, { tokenMdc, 0, { Syntax::dMDC, NOTHING }, { mdMode, sdMode, EOM }}, { tokenMdoNameStart, 0, { Syntax::dMDO, SET + Syntax::nameStart }, { econMode, mconMode, econnetMode, mconnetMode, proMode, dsMode, dsiMode, EOM }}, { tokenMdoMdc, 0, { Syntax::dMDO, Syntax::dMDC }, { econMode, mconMode, econnetMode, mconnetMode, proMode, dsMode, dsiMode, EOM }}, { tokenMdoCom, 0, { Syntax::dMDO, Syntax::dCOM }, { econMode, mconMode, econnetMode, mconnetMode, proMode, dsMode, dsiMode, EOM }}, { tokenMdoDso, 0, { Syntax::dMDO, Syntax::dDSO }, { econMode, mconMode, econnetMode, mconnetMode, dsMode, dsiMode, imsMode, EOM }}, { tokenMinus, 0, { Syntax::dMINUS, NOTHING }, { mdMinusMode, sdMode, EOM }}, { tokenMinusGrpo, 0, { Syntax::dMINUS, Syntax::dGRPO }, { mdMode, EOM }}, { tokenMscMdc, 0, { Syntax::dMSC, Syntax::dMDC}, { imsMode, cmsMode, rcmsMode, econMode, mconMode, econnetMode, mconnetMode, dsMode, dsiMode, EOM }}, { tokenNestc, 0, { Syntax::dNESTC, NOTHING }, { tagMode, EOM }}, { tokenNet, 0, { Syntax::dNET, NOTHING }, { econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }}, { tokenOpt, 0, { Syntax::dOPT, NOTHING }, { grpMode, grpsufMode, EOM }}, { tokenOr, 0, { Syntax::dOR, NOTHING }, { grpMode, EOM }}, { tokenPero, 0, { Syntax::dPERO, NOTHING }, { mdPeroMode, EOM }}, { tokenPeroNameStart, 0, { Syntax::dPERO, SET + Syntax::nameStart }, { mdMode, mdMinusMode, mdPeroMode, dsMode, dsiMode, grpMode, plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode, EOM }}, { tokenPeroGrpo, REQUIRE_LINK_OR_CONCUR, { Syntax::dPERO, Syntax::dGRPO }, { mdMode, mdMinusMode, mdPeroMode, dsMode, dsiMode, grpMode, plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode, EOM }}, { tokenPic, 0, { Syntax::dPIC, NOTHING }, { piMode, EOM }}, { tokenPio, 0, { Syntax::dPIO, NOTHING }, { econMode, mconMode, econnetMode, mconnetMode, proMode, dsMode, dsiMode, EOM }}, { tokenPlus, 0, { Syntax::dPLUS, NOTHING }, { grpMode, grpsufMode, EOM }}, { tokenPlusGrpo, 0, { Syntax::dPLUS, Syntax::dGRPO }, { mdMode, EOM }}, { tokenRefc, 0, { Syntax::dREFC, NOTHING }, { refMode, EOM }}, { tokenRep, 0, { Syntax::dREP, NOTHING }, { grpMode, grpsufMode, EOM }}, { tokenRni, 0, { Syntax::dRNI, NOTHING }, { grpMode, mdMode, mdPeroMode, EOM }}, { tokenSeq, 0, { Syntax::dSEQ, NOTHING }, { grpMode, EOM }}, { tokenStago, 0, { Syntax::dSTAGO, NOTHING }, { tagMode, EOM }}, { tokenStagoNameStart, 0, { Syntax::dSTAGO, SET + Syntax::nameStart }, { econMode, mconMode, econnetMode, mconnetMode, EOM }}, { tokenStagoTagc, REQUIRE_EMPTY_STARTTAG, { Syntax::dSTAGO, Syntax::dTAGC }, { econMode, mconMode, econnetMode, mconnetMode, EOM }}, { tokenStagoGrpo, REQUIRE_CONCUR, { Syntax::dSTAGO, Syntax::dGRPO }, { econMode, mconMode, econnetMode, mconnetMode, EOM }}, { tokenTagc, 0, { Syntax::dTAGC, NOTHING }, { tagMode, EOM }}, { tokenVi, 0, { Syntax::dVI, NOTHING }, { tagMode, asMode, piPasMode, EOM }}, // Other tokens { tokenRe, REQUIRE_NOT_KEEPRSRE, { FUNCTION + Syntax::fRE, NOTHING }, { mconMode, cconMode, rcconMode, mconnetMode, cconnetMode, rcconnetMode, rcconeMode, cmsMode, rcmsMode, EOM }}, { tokenRe, 0, { FUNCTION + Syntax::fRE, NOTHING }, { refMode, mlitMode, mlitaMode, alitMode, alitaMode, aliteMode, talitMode, talitaMode, taliteMode, EOM }}, { tokenRs, REQUIRE_NOT_KEEPRSRE, { FUNCTION + Syntax::fRS, NOTHING }, { mconMode, cconMode, rcconMode, mconnetMode, cconnetMode, rcconnetMode, rcconeMode, cmsMode, rcmsMode, EOM }}, { tokenRs, 0, { FUNCTION + Syntax::fRS, NOTHING }, { mlitMode, mlitaMode, alitMode, alitaMode, aliteMode, talitMode, talitaMode, taliteMode, EOM }}, { tokenSpace, 0, { FUNCTION + Syntax::fSPACE, NOTHING }, { mlitMode, mlitaMode, talitMode, talitaMode, taliteMode, EOM }}, { tokenSepchar, 0, { SET + Syntax::sepchar, NOTHING }, { alitMode, alitaMode, aliteMode, talitMode, talitaMode, taliteMode, EOM }}, { tokenS, 0, { SET + Syntax::s, NOTHING }, { econMode, econnetMode, grpMode, mdMode, mdMinusMode, mdPeroMode, sdMode, proMode, dsMode, dsiMode, asMode, piPasMode, tagMode, EOM }}, { tokenNameStart, 0, { SET + Syntax::nameStart, NOTHING }, { grpMode, mdMode, mdMinusMode, mdPeroMode, sdMode, asMode, piPasMode, tagMode, EOM }}, { tokenDigit, 0, { SET + Syntax::digit, NOTHING }, { grpMode, mdMode, mdMinusMode, sdMode, asMode, piPasMode, tagMode, EOM }}, { tokenLcUcNmchar, 0, { SET + Syntax::nmchar, NOTHING }, { grpMode, mdMode, asMode, piPasMode, tagMode, EOM }}, { tokenIgnoredChar, 0, { SET + Syntax::sgmlChar, NOTHING }, { imsMode, EOM }}, { tokenChar, 0, { SET + Syntax::sgmlChar, NOTHING }, // Note that character data is recognized in element content, // and will cause #PCDATA to begin. { alitMode, alitaMode, aliteMode, talitMode, talitaMode, taliteMode, comMode, piMode, cmsMode, rcmsMode, plitMode, plitaMode, pliteMode, slitMode, slitaMode, econMode, mconMode, cconMode, rcconMode, econnetMode, mconnetMode, cconnetMode, rcconnetMode, rcconeMode, EOM }}, { tokenChar, 0, { SET + Syntax::minimumData, NOTHING }, { mlitMode, mlitaMode, EOM }}, { tokenChar, 0, { SET + Syntax::significant, NOTHING }, { sdplitMode, sdplitaMode, sdslitMode, sdslitaMode, sdcomMode, EOM }}, }; inline Boolean PackedTokenInfo::inMode(Mode mode) const { return ((modeBits[unsigned(mode) / ULONG_BIT] & ((unsigned long)1 << (unsigned(mode) % ULONG_BIT))) != 0); } void PackedTokenInfo::computeModeBits() { for (unsigned char *p = modes; *p != EOM; p++) modeBits[*p / ULONG_BIT] |= (unsigned long)1 << (*p % ULONG_BIT); } struct TokenTableIniter { TokenTableIniter(); }; static TokenTableIniter tokenTableIniter; TokenTableIniter::TokenTableIniter() { for (size_t i = 0; i < SIZEOF(tokenTable); i++) tokenTable[i].computeModeBits(); } ModeInfo::ModeInfo(Mode mode, const Sd &sd) : mode_(mode), p_(tokenTable), count_(SIZEOF(tokenTable)), missingRequirements_(REQUIRE_FLAGS) { if (sd.startTagEmpty()) missingRequirements_ &= ~REQUIRE_EMPTY_STARTTAG; if (sd.endTagEmpty()) missingRequirements_ &= ~REQUIRE_EMPTY_ENDTAG; if (sd.concur()) missingRequirements_ &= ~(REQUIRE_CONCUR|REQUIRE_LINK_OR_CONCUR); if (sd.link()) missingRequirements_ &= ~REQUIRE_LINK_OR_CONCUR; if (!sd.keeprsre()) missingRequirements_ &= ~REQUIRE_NOT_KEEPRSRE; } Boolean ModeInfo::nextToken(TokenInfo *t) { for (; count_ > 0; --count_, ++p_) if (p_->inMode(mode_) && (p_->flags & missingRequirements_) == 0) { t->token = p_->token; t->priority = Priority::delim; const unsigned char *contents = p_->contents; --count_; ++p_; unsigned char c = contents[0]; if (c < SET) t->delim1 = Syntax::DelimGeneral(c); else if (c < SET + Syntax::nSet) { t->set = Syntax::Set(c - SET); t->type = TokenInfo::setType; switch (t->set) { case Syntax::sepchar: case Syntax::s: case Syntax::blank: t->priority = Priority::function; break; default: t->priority = Priority::data; break; } return 1; } else { t->function = Syntax::StandardFunction(c - FUNCTION); t->priority = Priority::function; t->type = TokenInfo::functionType; return 1; } c = contents[1]; if (c == NOTHING) { t->type = TokenInfo::delimType; return 1; } if (c < SET) { t->delim2 = Syntax::DelimGeneral(c); t->type = TokenInfo::delimDelimType; return 1; } if (c < SET + Syntax::nSet) { t->set = Syntax::Set(c - SET); t->type = TokenInfo::delimSetType; return 1; } abort(); } return 0; } #ifdef SP_NAMESPACE } #endif