300 lines
12 KiB
C++
300 lines
12 KiB
C++
|
// Copyright (c) 1994 James Clark
|
||
|
// See the file COPYING for copying permission.
|
||
|
|
||
|
#ifdef __GNUG__
|
||
|
#pragma implementation
|
||
|
#endif
|
||
|
#include "splib.h"
|
||
|
#include <limits.h>
|
||
|
#include <stdlib.h>
|
||
|
#include "macros.h"
|
||
|
#include "types.h"
|
||
|
#include "Syntax.h"
|
||
|
#include "token.h"
|
||
|
#include "Sd.h"
|
||
|
#include "Mode.h"
|
||
|
#include "ModeInfo.h"
|
||
|
|
||
|
#ifdef SP_NAMESPACE
|
||
|
namespace SP_NAMESPACE {
|
||
|
#endif
|
||
|
|
||
|
const unsigned REQUIRE_EMPTY_STARTTAG = 01;
|
||
|
const unsigned REQUIRE_EMPTY_ENDTAG = 02;
|
||
|
const unsigned REQUIRE_CONCUR = 04;
|
||
|
const unsigned REQUIRE_LINK_OR_CONCUR = 010;
|
||
|
const unsigned REQUIRE_NOT_KEEPRSRE = 020;
|
||
|
const unsigned REQUIRE_FLAGS = 037;
|
||
|
|
||
|
#define ULONG_BIT (CHAR_BIT * sizeof(unsigned long))
|
||
|
|
||
|
struct PackedTokenInfo {
|
||
|
Token token; // token to be returned
|
||
|
unsigned flags;
|
||
|
unsigned char contents[2]; // components of the delimiter or d-i-c
|
||
|
unsigned char modes[25]; // list of modes in which it is recognized,
|
||
|
// terminated by EOM
|
||
|
// a bit vector computed from modes (lo to hi)
|
||
|
unsigned long modeBits[(nModes + ULONG_BIT - 1)/ULONG_BIT];
|
||
|
void computeModeBits();
|
||
|
Boolean inMode(Mode mode) const;
|
||
|
};
|
||
|
|
||
|
const unsigned char SET = Syntax::nDelimGeneral;
|
||
|
const unsigned char FUNCTION = SET + Syntax::nSet;
|
||
|
const unsigned char NOTHING = UCHAR_MAX;
|
||
|
|
||
|
const unsigned char EOM = 255; // end of modes
|
||
|
|
||
|
static PackedTokenInfo tokenTable[] = {
|
||
|
// Delimiters and delimiters in context
|
||
|
{ tokenAnd, 0, { Syntax::dAND, NOTHING }, { grpMode, EOM }},
|
||
|
{ tokenCom, 0, { Syntax::dCOM, NOTHING },
|
||
|
{ mdMode, mdMinusMode, mdPeroMode, sdMode, comMode, sdcomMode, piPasMode, EOM }},
|
||
|
{ tokenCroDigit, 0, { Syntax::dCRO, SET + Syntax::digit },
|
||
|
{ econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
|
||
|
rcconeMode, plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode,
|
||
|
alitMode, alitaMode, aliteMode,
|
||
|
talitMode, talitaMode, taliteMode, rcmsMode, EOM }},
|
||
|
{ tokenCroNameStart, 0, { Syntax::dCRO, SET + Syntax::nameStart },
|
||
|
{ econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
|
||
|
rcconeMode, plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode,
|
||
|
alitMode, alitaMode, aliteMode,
|
||
|
talitMode, talitaMode, taliteMode, rcmsMode, EOM }},
|
||
|
{ tokenDsc, 0, { Syntax::dDSC, NOTHING },
|
||
|
{ /* mdMode, */ asMode, dsMode, EOM }},
|
||
|
{ tokenDso, 0, { Syntax::dDSO, NOTHING }, { mdMode, EOM }},
|
||
|
{ tokenDtgc, 0, { Syntax::dDTGC, NOTHING }, { grpMode, EOM }},
|
||
|
{ tokenDtgo, 0, { Syntax::dDTGO, NOTHING }, { grpMode, EOM }},
|
||
|
{ tokenEroNameStart, 0, { Syntax::dERO, SET + Syntax::nameStart },
|
||
|
{ econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
|
||
|
rcconeMode, alitMode, alitaMode, aliteMode, talitMode, talitaMode,
|
||
|
taliteMode, rcmsMode, EOM }},
|
||
|
{ tokenEroGrpo, REQUIRE_LINK_OR_CONCUR, { Syntax::dERO, Syntax::dGRPO },
|
||
|
{ econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
|
||
|
rcconeMode, alitMode, alitaMode, aliteMode, talitMode, talitaMode,
|
||
|
taliteMode, rcmsMode, EOM }},
|
||
|
{ tokenEtago, 0, { Syntax::dETAGO, NOTHING }, { tagMode, EOM }},
|
||
|
{ tokenEtagoNameStart, 0, { Syntax::dETAGO, SET + Syntax::nameStart },
|
||
|
{ econMode, mconMode, cconMode, rcconMode,
|
||
|
econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }},
|
||
|
{ tokenEtagoTagc, REQUIRE_EMPTY_ENDTAG, { Syntax::dETAGO, Syntax::dTAGC },
|
||
|
{ econMode, mconMode, cconMode, rcconMode,
|
||
|
econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }},
|
||
|
{ tokenEtagoGrpo, REQUIRE_CONCUR, { Syntax::dETAGO, Syntax::dGRPO },
|
||
|
{ econMode, mconMode, cconMode, rcconMode,
|
||
|
econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }},
|
||
|
{ tokenGrpc, 0, { Syntax::dGRPC, NOTHING }, { grpMode, EOM }},
|
||
|
{ tokenGrpo, 0, { Syntax::dGRPO, NOTHING },
|
||
|
{ mdMode, mdMinusMode, grpMode, EOM }},
|
||
|
{ tokenHcroHexDigit, 0, { Syntax::dHCRO, SET + Syntax::hexDigit },
|
||
|
{ econMode, mconMode, rcconMode, econnetMode, mconnetMode, rcconnetMode,
|
||
|
rcconeMode, plitMode, plitaMode, pliteMode,
|
||
|
alitMode, alitaMode, aliteMode,
|
||
|
talitMode, talitaMode, taliteMode, rcmsMode, EOM }},
|
||
|
{ tokenLit, 0, { Syntax::dLIT, NOTHING },
|
||
|
{ alitMode, talitMode, plitMode, sdplitMode, mlitMode, slitMode, sdslitMode,
|
||
|
asMode, piPasMode, tagMode, mdMode, sdMode, grpMode, EOM }},
|
||
|
{ tokenLita, 0, { Syntax::dLITA, NOTHING },
|
||
|
{ alitaMode, talitaMode, plitaMode, sdplitaMode, mlitaMode, slitaMode, sdslitaMode,
|
||
|
asMode, piPasMode, tagMode, mdMode, sdMode, grpMode, EOM }},
|
||
|
{ tokenMdc, 0, { Syntax::dMDC, NOTHING }, { mdMode, sdMode, EOM }},
|
||
|
{ tokenMdoNameStart, 0, { Syntax::dMDO, SET + Syntax::nameStart },
|
||
|
{ econMode, mconMode, econnetMode, mconnetMode,
|
||
|
proMode, dsMode, dsiMode, EOM }},
|
||
|
{ tokenMdoMdc, 0, { Syntax::dMDO, Syntax::dMDC },
|
||
|
{ econMode, mconMode, econnetMode, mconnetMode,
|
||
|
proMode, dsMode, dsiMode, EOM }},
|
||
|
{ tokenMdoCom, 0, { Syntax::dMDO, Syntax::dCOM },
|
||
|
{ econMode, mconMode, econnetMode, mconnetMode,
|
||
|
proMode, dsMode, dsiMode, EOM }},
|
||
|
{ tokenMdoDso, 0, { Syntax::dMDO, Syntax::dDSO },
|
||
|
{ econMode, mconMode, econnetMode, mconnetMode,
|
||
|
dsMode, dsiMode, imsMode, EOM }},
|
||
|
{ tokenMinus, 0, { Syntax::dMINUS, NOTHING }, { mdMinusMode, sdMode, EOM }},
|
||
|
{ tokenMinusGrpo, 0, { Syntax::dMINUS, Syntax::dGRPO }, { mdMode, EOM }},
|
||
|
{ tokenMscMdc, 0, { Syntax::dMSC, Syntax::dMDC},
|
||
|
{ imsMode, cmsMode, rcmsMode,
|
||
|
econMode, mconMode, econnetMode, mconnetMode, dsMode, dsiMode, EOM }},
|
||
|
{ tokenNestc, 0, { Syntax::dNESTC, NOTHING }, { tagMode, EOM }},
|
||
|
{ tokenNet, 0, { Syntax::dNET, NOTHING },
|
||
|
{ econnetMode, mconnetMode, cconnetMode, rcconnetMode, EOM }},
|
||
|
{ tokenOpt, 0, { Syntax::dOPT, NOTHING }, { grpMode, grpsufMode, EOM }},
|
||
|
{ tokenOr, 0, { Syntax::dOR, NOTHING }, { grpMode, EOM }},
|
||
|
{ tokenPero, 0, { Syntax::dPERO, NOTHING }, { mdPeroMode, EOM }},
|
||
|
{ tokenPeroNameStart, 0, { Syntax::dPERO, SET + Syntax::nameStart }, {
|
||
|
mdMode, mdMinusMode, mdPeroMode, dsMode, dsiMode, grpMode,
|
||
|
plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode, EOM }},
|
||
|
{ tokenPeroGrpo, REQUIRE_LINK_OR_CONCUR, { Syntax::dPERO, Syntax::dGRPO },
|
||
|
{ mdMode, mdMinusMode, mdPeroMode, dsMode, dsiMode, grpMode,
|
||
|
plitMode, plitaMode, pliteMode, sdplitMode, sdplitaMode, EOM }},
|
||
|
{ tokenPic, 0, { Syntax::dPIC, NOTHING }, { piMode, EOM }},
|
||
|
{ tokenPio, 0, { Syntax::dPIO, NOTHING },
|
||
|
{ econMode, mconMode, econnetMode, mconnetMode, proMode,
|
||
|
dsMode, dsiMode, EOM }},
|
||
|
{ tokenPlus, 0, { Syntax::dPLUS, NOTHING }, { grpMode, grpsufMode, EOM }},
|
||
|
{ tokenPlusGrpo, 0, { Syntax::dPLUS, Syntax::dGRPO }, { mdMode, EOM }},
|
||
|
{ tokenRefc, 0, { Syntax::dREFC, NOTHING }, { refMode, EOM }},
|
||
|
{ tokenRep, 0, { Syntax::dREP, NOTHING }, { grpMode, grpsufMode, EOM }},
|
||
|
{ tokenRni, 0, { Syntax::dRNI, NOTHING },
|
||
|
{ grpMode, mdMode, mdPeroMode, EOM }},
|
||
|
{ tokenSeq, 0, { Syntax::dSEQ, NOTHING }, { grpMode, EOM }},
|
||
|
{ tokenStago, 0, { Syntax::dSTAGO, NOTHING }, { tagMode, EOM }},
|
||
|
{ tokenStagoNameStart, 0, { Syntax::dSTAGO, SET + Syntax::nameStart },
|
||
|
{ econMode, mconMode, econnetMode, mconnetMode, EOM }},
|
||
|
{ tokenStagoTagc, REQUIRE_EMPTY_STARTTAG, { Syntax::dSTAGO, Syntax::dTAGC },
|
||
|
{ econMode, mconMode, econnetMode, mconnetMode, EOM }},
|
||
|
{ tokenStagoGrpo, REQUIRE_CONCUR, { Syntax::dSTAGO, Syntax::dGRPO },
|
||
|
{ econMode, mconMode, econnetMode, mconnetMode, EOM }},
|
||
|
{ tokenTagc, 0, { Syntax::dTAGC, NOTHING }, { tagMode, EOM }},
|
||
|
{ tokenVi, 0, { Syntax::dVI, NOTHING }, { tagMode, asMode, piPasMode, EOM }},
|
||
|
// Other tokens
|
||
|
{ tokenRe, REQUIRE_NOT_KEEPRSRE, { FUNCTION + Syntax::fRE, NOTHING },
|
||
|
{ mconMode, cconMode, rcconMode,
|
||
|
mconnetMode, cconnetMode, rcconnetMode,
|
||
|
rcconeMode, cmsMode, rcmsMode, EOM }},
|
||
|
{ tokenRe, 0, { FUNCTION + Syntax::fRE, NOTHING },
|
||
|
{ refMode,
|
||
|
mlitMode, mlitaMode, alitMode, alitaMode, aliteMode,
|
||
|
talitMode, talitaMode, taliteMode,
|
||
|
EOM }},
|
||
|
{ tokenRs, REQUIRE_NOT_KEEPRSRE, { FUNCTION + Syntax::fRS, NOTHING },
|
||
|
{ mconMode, cconMode, rcconMode,
|
||
|
mconnetMode, cconnetMode, rcconnetMode,
|
||
|
rcconeMode, cmsMode, rcmsMode, EOM }},
|
||
|
{ tokenRs, 0, { FUNCTION + Syntax::fRS, NOTHING },
|
||
|
{ mlitMode, mlitaMode, alitMode, alitaMode, aliteMode,
|
||
|
talitMode, talitaMode, taliteMode,
|
||
|
EOM }},
|
||
|
{ tokenSpace, 0, { FUNCTION + Syntax::fSPACE, NOTHING },
|
||
|
{ mlitMode, mlitaMode, talitMode, talitaMode, taliteMode, EOM }},
|
||
|
{ tokenSepchar, 0, { SET + Syntax::sepchar, NOTHING },
|
||
|
{ alitMode, alitaMode, aliteMode,
|
||
|
talitMode, talitaMode, taliteMode, EOM }},
|
||
|
{ tokenS, 0, { SET + Syntax::s, NOTHING },
|
||
|
{ econMode, econnetMode, grpMode, mdMode, mdMinusMode, mdPeroMode, sdMode,
|
||
|
proMode, dsMode, dsiMode, asMode, piPasMode, tagMode, EOM }},
|
||
|
{ tokenNameStart, 0, { SET + Syntax::nameStart, NOTHING },
|
||
|
{ grpMode, mdMode, mdMinusMode, mdPeroMode, sdMode,
|
||
|
asMode, piPasMode, tagMode, EOM }},
|
||
|
{ tokenDigit, 0, { SET + Syntax::digit, NOTHING },
|
||
|
{ grpMode, mdMode, mdMinusMode, sdMode, asMode, piPasMode, tagMode, EOM }},
|
||
|
{ tokenLcUcNmchar, 0, { SET + Syntax::nmchar, NOTHING },
|
||
|
{ grpMode, mdMode, asMode, piPasMode, tagMode, EOM }},
|
||
|
{ tokenIgnoredChar, 0, { SET + Syntax::sgmlChar, NOTHING },
|
||
|
{ imsMode, EOM }},
|
||
|
{ tokenChar, 0, { SET + Syntax::sgmlChar, NOTHING },
|
||
|
// Note that character data is recognized in element content,
|
||
|
// and will cause #PCDATA to begin.
|
||
|
{ alitMode, alitaMode, aliteMode,
|
||
|
talitMode, talitaMode, taliteMode,
|
||
|
comMode, piMode,
|
||
|
cmsMode, rcmsMode,
|
||
|
plitMode, plitaMode, pliteMode,
|
||
|
slitMode, slitaMode,
|
||
|
econMode, mconMode, cconMode, rcconMode,
|
||
|
econnetMode, mconnetMode, cconnetMode, rcconnetMode, rcconeMode, EOM }},
|
||
|
{ tokenChar, 0, { SET + Syntax::minimumData, NOTHING },
|
||
|
{ mlitMode, mlitaMode, EOM }},
|
||
|
{ tokenChar, 0, { SET + Syntax::significant, NOTHING },
|
||
|
{ sdplitMode, sdplitaMode, sdslitMode, sdslitaMode, sdcomMode, EOM }},
|
||
|
};
|
||
|
|
||
|
inline Boolean PackedTokenInfo::inMode(Mode mode) const
|
||
|
{
|
||
|
return ((modeBits[unsigned(mode) / ULONG_BIT]
|
||
|
& ((unsigned long)1 << (unsigned(mode) % ULONG_BIT)))
|
||
|
!= 0);
|
||
|
}
|
||
|
|
||
|
void PackedTokenInfo::computeModeBits()
|
||
|
{
|
||
|
for (unsigned char *p = modes; *p != EOM; p++)
|
||
|
modeBits[*p / ULONG_BIT] |= (unsigned long)1 << (*p % ULONG_BIT);
|
||
|
}
|
||
|
|
||
|
struct TokenTableIniter {
|
||
|
TokenTableIniter();
|
||
|
};
|
||
|
|
||
|
static TokenTableIniter tokenTableIniter;
|
||
|
|
||
|
TokenTableIniter::TokenTableIniter()
|
||
|
{
|
||
|
for (size_t i = 0; i < SIZEOF(tokenTable); i++)
|
||
|
tokenTable[i].computeModeBits();
|
||
|
}
|
||
|
|
||
|
ModeInfo::ModeInfo(Mode mode, const Sd &sd)
|
||
|
: mode_(mode), p_(tokenTable), count_(SIZEOF(tokenTable)),
|
||
|
missingRequirements_(REQUIRE_FLAGS)
|
||
|
{
|
||
|
if (sd.startTagEmpty())
|
||
|
missingRequirements_ &= ~REQUIRE_EMPTY_STARTTAG;
|
||
|
if (sd.endTagEmpty())
|
||
|
missingRequirements_ &= ~REQUIRE_EMPTY_ENDTAG;
|
||
|
if (sd.concur())
|
||
|
missingRequirements_ &= ~(REQUIRE_CONCUR|REQUIRE_LINK_OR_CONCUR);
|
||
|
if (sd.link())
|
||
|
missingRequirements_ &= ~REQUIRE_LINK_OR_CONCUR;
|
||
|
if (!sd.keeprsre())
|
||
|
missingRequirements_ &= ~REQUIRE_NOT_KEEPRSRE;
|
||
|
}
|
||
|
|
||
|
Boolean ModeInfo::nextToken(TokenInfo *t)
|
||
|
{
|
||
|
for (; count_ > 0; --count_, ++p_)
|
||
|
if (p_->inMode(mode_) && (p_->flags & missingRequirements_) == 0) {
|
||
|
t->token = p_->token;
|
||
|
t->priority = Priority::delim;
|
||
|
const unsigned char *contents = p_->contents;
|
||
|
--count_;
|
||
|
++p_;
|
||
|
unsigned char c = contents[0];
|
||
|
if (c < SET)
|
||
|
t->delim1 = Syntax::DelimGeneral(c);
|
||
|
else if (c < SET + Syntax::nSet) {
|
||
|
t->set = Syntax::Set(c - SET);
|
||
|
t->type = TokenInfo::setType;
|
||
|
switch (t->set) {
|
||
|
case Syntax::sepchar:
|
||
|
case Syntax::s:
|
||
|
case Syntax::blank:
|
||
|
t->priority = Priority::function;
|
||
|
break;
|
||
|
default:
|
||
|
t->priority = Priority::data;
|
||
|
break;
|
||
|
}
|
||
|
return 1;
|
||
|
}
|
||
|
else {
|
||
|
t->function = Syntax::StandardFunction(c - FUNCTION);
|
||
|
t->priority = Priority::function;
|
||
|
t->type = TokenInfo::functionType;
|
||
|
return 1;
|
||
|
}
|
||
|
c = contents[1];
|
||
|
if (c == NOTHING) {
|
||
|
t->type = TokenInfo::delimType;
|
||
|
return 1;
|
||
|
}
|
||
|
if (c < SET) {
|
||
|
t->delim2 = Syntax::DelimGeneral(c);
|
||
|
t->type = TokenInfo::delimDelimType;
|
||
|
return 1;
|
||
|
}
|
||
|
if (c < SET + Syntax::nSet) {
|
||
|
t->set = Syntax::Set(c - SET);
|
||
|
t->type = TokenInfo::delimSetType;
|
||
|
return 1;
|
||
|
}
|
||
|
abort();
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
#ifdef SP_NAMESPACE
|
||
|
}
|
||
|
#endif
|