// Package html minifies HTML5 following the specifications at http://www.w3.org/TR/html5/syntax.html. package html import ( "bytes" "io" "github.com/tdewolff/minify/v2" "github.com/tdewolff/parse/v2" "github.com/tdewolff/parse/v2/buffer" "github.com/tdewolff/parse/v2/html" ) var ( gtBytes = []byte(">") isBytes = []byte("=") spaceBytes = []byte(" ") doctypeBytes = []byte("<!doctype html>") jsMimeBytes = []byte("application/javascript") cssMimeBytes = []byte("text/css") htmlMimeBytes = []byte("text/html") svgMimeBytes = []byte("image/svg+xml") formMimeBytes = []byte("application/x-www-form-urlencoded") mathMimeBytes = []byte("application/mathml+xml") dataSchemeBytes = []byte("data:") jsSchemeBytes = []byte("javascript:") httpBytes = []byte("http") radioBytes = []byte("radio") onBytes = []byte("on") textBytes = []byte("text") noneBytes = []byte("none") submitBytes = []byte("submit") allBytes = []byte("all") rectBytes = []byte("rect") dataBytes = []byte("data") getBytes = []byte("get") autoBytes = []byte("auto") oneBytes = []byte("one") inlineParams = map[string]string{"inline": "1"} ) //////////////////////////////////////////////////////////////// // Minifier is an HTML minifier. type Minifier struct { KeepComments bool KeepConditionalComments bool KeepDefaultAttrVals bool KeepDocumentTags bool KeepEndTags bool KeepQuotes bool KeepWhitespace bool } // Minify minifies HTML data, it reads from r and writes to w. func Minify(m *minify.M, w io.Writer, r io.Reader, params map[string]string) error { return (&Minifier{}).Minify(m, w, r, params) } // Minify minifies HTML data, it reads from r and writes to w. func (o *Minifier) Minify(m *minify.M, w io.Writer, r io.Reader, _ map[string]string) error { var rawTagHash Hash var rawTagMediatype []byte omitSpace := true // if true the next leading space is omitted inPre := false attrMinifyBuffer := buffer.NewWriter(make([]byte, 0, 64)) attrByteBuffer := make([]byte, 0, 64) z := parse.NewInput(r) defer z.Restore() l := html.NewLexer(z) tb := NewTokenBuffer(z, l) for { t := *tb.Shift() switch t.TokenType { case html.ErrorToken: if _, err := w.Write(nil); err != nil { return err } if l.Err() == io.EOF { return nil } return l.Err() case html.DoctypeToken: w.Write(doctypeBytes) case html.CommentToken: if o.KeepComments { w.Write(t.Data) } else if o.KeepConditionalComments && 6 < len(t.Text) && (bytes.HasPrefix(t.Text, []byte("[if ")) || bytes.HasSuffix(t.Text, []byte("[endif]")) || bytes.HasSuffix(t.Text, []byte("[endif]--"))) { // [if ...] is always 7 or more characters, [endif] is only encountered for downlevel-revealed // see https://msdn.microsoft.com/en-us/library/ms537512(v=vs.85).aspx#syntax if bytes.HasPrefix(t.Data, []byte("<!--[if ")) && bytes.HasSuffix(t.Data, []byte("<![endif]-->")) { // downlevel-hidden begin := bytes.IndexByte(t.Data, '>') + 1 end := len(t.Data) - len("<![endif]-->") w.Write(t.Data[:begin]) if err := o.Minify(m, w, buffer.NewReader(t.Data[begin:end]), nil); err != nil { return minify.UpdateErrorPosition(err, z, t.Offset) } w.Write(t.Data[end:]) } else { w.Write(t.Data) // downlevel-revealed or short downlevel-hidden } } else if 1 < len(t.Text) && t.Text[0] == '#' { // SSI tags w.Write(t.Data) } case html.SvgToken: if err := m.MinifyMimetype(svgMimeBytes, w, buffer.NewReader(t.Data), nil); err != nil { if err != minify.ErrNotExist { return minify.UpdateErrorPosition(err, z, t.Offset) } w.Write(t.Data) } case html.MathToken: if err := m.MinifyMimetype(mathMimeBytes, w, buffer.NewReader(t.Data), nil); err != nil { if err != minify.ErrNotExist { return minify.UpdateErrorPosition(err, z, t.Offset) } w.Write(t.Data) } case html.TextToken: // CSS and JS minifiers for inline code if rawTagHash != 0 { if rawTagHash == Style || rawTagHash == Script || rawTagHash == Iframe { var mimetype []byte var params map[string]string if rawTagHash == Iframe { mimetype = htmlMimeBytes } else if len(rawTagMediatype) > 0 { mimetype, params = parse.Mediatype(rawTagMediatype) } else if rawTagHash == Script { mimetype = jsMimeBytes } else if rawTagHash == Style { mimetype = cssMimeBytes } if err := m.MinifyMimetype(mimetype, w, buffer.NewReader(t.Data), params); err != nil { if err != minify.ErrNotExist { return minify.UpdateErrorPosition(err, z, t.Offset) } w.Write(t.Data) } } else { w.Write(t.Data) } } else if inPre { w.Write(t.Data) } else { t.Data = parse.ReplaceMultipleWhitespaceAndEntities(t.Data, EntitiesMap, TextRevEntitiesMap) // whitespace removal; trim left if omitSpace && parse.IsWhitespace(t.Data[0]) { t.Data = t.Data[1:] } // whitespace removal; trim right omitSpace = false if len(t.Data) == 0 { omitSpace = true } else if parse.IsWhitespace(t.Data[len(t.Data)-1]) { omitSpace = true i := 0 for { next := tb.Peek(i) // trim if EOF, text token with leading whitespace or block token if next.TokenType == html.ErrorToken { t.Data = t.Data[:len(t.Data)-1] omitSpace = false break } else if next.TokenType == html.TextToken { // this only happens when a comment, doctype or phrasing end tag (only for !o.KeepWhitespace) was in between // remove if the text token starts with a whitespace if len(next.Data) > 0 && parse.IsWhitespace(next.Data[0]) { t.Data = t.Data[:len(t.Data)-1] omitSpace = false } break } else if next.TokenType == html.StartTagToken || next.TokenType == html.EndTagToken { if o.KeepWhitespace { break } // remove when followed up by a block tag if next.Traits&nonPhrasingTag != 0 { t.Data = t.Data[:len(t.Data)-1] omitSpace = false break } else if next.TokenType == html.StartTagToken { break } } i++ } } w.Write(t.Data) } case html.StartTagToken, html.EndTagToken: rawTagHash = 0 hasAttributes := false if t.TokenType == html.StartTagToken { if next := tb.Peek(0); next.TokenType == html.AttributeToken { hasAttributes = true } if t.Traits&rawTag != 0 { // ignore empty script and style tags if !hasAttributes && (t.Hash == Script || t.Hash == Style) { if next := tb.Peek(1); next.TokenType == html.EndTagToken { tb.Shift() tb.Shift() break } } rawTagHash = t.Hash rawTagMediatype = nil // do not minify content of <style amp-boilerplate> if hasAttributes && t.Hash == Style { if attrs := tb.Attributes(Amp_Boilerplate); attrs[0] != nil { rawTagHash = 0 } } } } else if t.Hash == Template { omitSpace = true // EndTagToken } if t.Hash == Pre { inPre = t.TokenType == html.StartTagToken } // remove superfluous tags, except for html, head and body tags when KeepDocumentTags is set if !hasAttributes && (!o.KeepDocumentTags && (t.Hash == Html || t.Hash == Head || t.Hash == Body) || t.Hash == Colgroup) { break } else if t.TokenType == html.EndTagToken { omitEndTag := false if !o.KeepEndTags { if t.Hash == Thead || t.Hash == Tbody || t.Hash == Tfoot || t.Hash == Tr || t.Hash == Th || t.Hash == Td || t.Hash == Option || t.Hash == Dd || t.Hash == Dt || t.Hash == Li || t.Hash == Rb || t.Hash == Rt || t.Hash == Rtc || t.Hash == Rp { omitEndTag = true // omit end tags } else if t.Hash == P { i := 0 for { next := tb.Peek(i) i++ // continue if text token is empty or whitespace if next.TokenType == html.TextToken && parse.IsAllWhitespace(next.Data) { continue } if next.TokenType == html.ErrorToken || next.TokenType == html.EndTagToken && next.Traits&keepPTag == 0 || next.TokenType == html.StartTagToken && next.Traits&omitPTag != 0 { omitEndTag = true // omit p end tag } break } } else if t.Hash == Optgroup { i := 0 for { next := tb.Peek(i) i++ // continue if text token if next.TokenType == html.TextToken { continue } if next.TokenType == html.ErrorToken || next.Hash != Option { omitEndTag = true // omit optgroup end tag } break } } } if t.Traits&nonPhrasingTag != 0 { omitSpace = true // omit spaces after block elements } else if o.KeepWhitespace || t.Traits&objectTag != 0 { omitSpace = false } if !omitEndTag { if len(t.Data) > 3+len(t.Text) { t.Data[2+len(t.Text)] = '>' t.Data = t.Data[:3+len(t.Text)] } w.Write(t.Data) } // skip text in select and optgroup tags if t.Hash == Option || t.Hash == Optgroup { if next := tb.Peek(0); next.TokenType == html.TextToken { tb.Shift() } } break } if o.KeepWhitespace || t.Traits&objectTag != 0 { omitSpace = false } else if t.Traits&nonPhrasingTag != 0 { omitSpace = true // omit spaces after block elements } w.Write(t.Data) if hasAttributes { if t.Hash == Meta { attrs := tb.Attributes(Content, Http_Equiv, Charset, Name) if content := attrs[0]; content != nil { if httpEquiv := attrs[1]; httpEquiv != nil { httpEquiv.AttrVal = parse.TrimWhitespace(httpEquiv.AttrVal) if charset := attrs[2]; charset == nil && parse.EqualFold(httpEquiv.AttrVal, []byte("content-type")) { content.AttrVal = minify.Mediatype(content.AttrVal) if bytes.Equal(content.AttrVal, []byte("text/html;charset=utf-8")) { httpEquiv.Text = nil content.Text = []byte("charset") content.Hash = Charset content.AttrVal = []byte("utf-8") } } } if name := attrs[3]; name != nil { name.AttrVal = parse.TrimWhitespace(name.AttrVal) if parse.EqualFold(name.AttrVal, []byte("keywords")) { content.AttrVal = bytes.ReplaceAll(content.AttrVal, []byte(", "), []byte(",")) } else if parse.EqualFold(name.AttrVal, []byte("viewport")) { content.AttrVal = bytes.ReplaceAll(content.AttrVal, []byte(" "), []byte("")) for i := 0; i < len(content.AttrVal); i++ { if content.AttrVal[i] == '=' && i+2 < len(content.AttrVal) { i++ if n := parse.Number(content.AttrVal[i:]); n > 0 { minNum := minify.Number(content.AttrVal[i:i+n], -1) if len(minNum) < n { copy(content.AttrVal[i:i+len(minNum)], minNum) copy(content.AttrVal[i+len(minNum):], content.AttrVal[i+n:]) content.AttrVal = content.AttrVal[:len(content.AttrVal)+len(minNum)-n] } i += len(minNum) } i-- // mitigate for-loop increase } } } } } } else if t.Hash == Script { attrs := tb.Attributes(Src, Charset) if attrs[0] != nil && attrs[1] != nil { attrs[1].Text = nil } } else if t.Hash == Input { attrs := tb.Attributes(Type, Value) if t, value := attrs[0], attrs[1]; t != nil && value != nil { isRadio := parse.EqualFold(t.AttrVal, radioBytes) if !isRadio && len(value.AttrVal) == 0 { value.Text = nil } else if isRadio && parse.EqualFold(value.AttrVal, onBytes) { value.Text = nil } } } else if t.Hash == A { attrs := tb.Attributes(Id, Name) if id, name := attrs[0], attrs[1]; id != nil && name != nil { if bytes.Equal(id.AttrVal, name.AttrVal) { name.Text = nil } } } // write attributes for { attr := *tb.Shift() if attr.TokenType != html.AttributeToken { break } else if attr.Text == nil { continue // removed attribute } val := attr.AttrVal if attr.Traits&trimAttr != 0 { val = parse.ReplaceMultipleWhitespaceAndEntities(val, EntitiesMap, nil) val = parse.TrimWhitespace(val) } else { val = parse.ReplaceEntities(val, EntitiesMap, nil) } if t.Traits != 0 { if len(val) == 0 && (attr.Hash == Class || attr.Hash == Dir || attr.Hash == Id || attr.Hash == Name || attr.Hash == Action && t.Hash == Form) { continue // omit empty attribute values } if attr.Traits&caselessAttr != 0 { val = parse.ToLower(val) if attr.Hash == Enctype || attr.Hash == Codetype || attr.Hash == Accept || attr.Hash == Type && (t.Hash == A || t.Hash == Link || t.Hash == Embed || t.Hash == Object || t.Hash == Source || t.Hash == Script || t.Hash == Style) { val = minify.Mediatype(val) } } if rawTagHash != 0 && attr.Hash == Type { rawTagMediatype = parse.Copy(val) } // default attribute values can be omitted if !o.KeepDefaultAttrVals && (attr.Hash == Type && (t.Hash == Script && jsMimetypes[string(val)] || t.Hash == Style && bytes.Equal(val, cssMimeBytes) || t.Hash == Link && bytes.Equal(val, cssMimeBytes) || t.Hash == Input && bytes.Equal(val, textBytes) || t.Hash == Button && bytes.Equal(val, submitBytes)) || attr.Hash == Language && t.Hash == Script || attr.Hash == Method && bytes.Equal(val, getBytes) || attr.Hash == Enctype && bytes.Equal(val, formMimeBytes) || attr.Hash == Colspan && bytes.Equal(val, oneBytes) || attr.Hash == Rowspan && bytes.Equal(val, oneBytes) || attr.Hash == Shape && bytes.Equal(val, rectBytes) || attr.Hash == Span && bytes.Equal(val, oneBytes) || attr.Hash == Clear && bytes.Equal(val, noneBytes) || attr.Hash == Frameborder && bytes.Equal(val, oneBytes) || attr.Hash == Scrolling && bytes.Equal(val, autoBytes) || attr.Hash == Valuetype && bytes.Equal(val, dataBytes) || attr.Hash == Media && t.Hash == Style && bytes.Equal(val, allBytes)) { continue } if attr.Hash == Style { // CSS minifier for attribute inline code val = parse.TrimWhitespace(val) attrMinifyBuffer.Reset() if err := m.MinifyMimetype(cssMimeBytes, attrMinifyBuffer, buffer.NewReader(val), inlineParams); err == nil { val = attrMinifyBuffer.Bytes() } else if err != minify.ErrNotExist { return minify.UpdateErrorPosition(err, z, attr.Offset) } if len(val) == 0 { continue } } else if len(attr.Text) > 2 && attr.Text[0] == 'o' && attr.Text[1] == 'n' { // JS minifier for attribute inline code val = parse.TrimWhitespace(val) if len(val) >= 11 && parse.EqualFold(val[:11], jsSchemeBytes) { val = val[11:] } attrMinifyBuffer.Reset() if err := m.MinifyMimetype(jsMimeBytes, attrMinifyBuffer, buffer.NewReader(val), nil); err == nil { val = attrMinifyBuffer.Bytes() } else if err != minify.ErrNotExist { return minify.UpdateErrorPosition(err, z, attr.Offset) } if len(val) == 0 { continue } } else if attr.Traits&urlAttr != 0 { // anchors are already handled val = parse.TrimWhitespace(val) if 5 < len(val) { if parse.EqualFold(val[:4], httpBytes) { if val[4] == ':' { if m.URL != nil && m.URL.Scheme == "http" { val = val[5:] } else { parse.ToLower(val[:4]) } } else if (val[4] == 's' || val[4] == 'S') && val[5] == ':' { if m.URL != nil && m.URL.Scheme == "https" { val = val[6:] } else { parse.ToLower(val[:5]) } } } else if parse.EqualFold(val[:5], dataSchemeBytes) { val = minify.DataURI(m, val) } } } } w.Write(spaceBytes) w.Write(attr.Text) if len(val) > 0 && attr.Traits&booleanAttr == 0 { w.Write(isBytes) // use double quotes for RDFa attributes isXML := attr.Hash == Vocab || attr.Hash == Typeof || attr.Hash == Property || attr.Hash == Resource || attr.Hash == Prefix || attr.Hash == Content || attr.Hash == About || attr.Hash == Rev || attr.Hash == Datatype || attr.Hash == Inlist // no quotes if possible, else prefer single or double depending on which occurs more often in value var quote byte if 0 < len(attr.Data) && (attr.Data[len(attr.Data)-1] == '\'' || attr.Data[len(attr.Data)-1] == '"') { quote = attr.Data[len(attr.Data)-1] } val = html.EscapeAttrVal(&attrByteBuffer, val, quote, o.KeepQuotes, isXML) w.Write(val) } } } else { _ = tb.Shift() // StartTagClose } w.Write(gtBytes) // skip text in select and optgroup tags if t.Hash == Select || t.Hash == Optgroup { if next := tb.Peek(0); next.TokenType == html.TextToken { tb.Shift() } } // keep space after phrasing tags (<i>, <span>, ...) FontAwesome etc. if t.TokenType == html.StartTagToken && t.Traits&nonPhrasingTag == 0 { if next := tb.Peek(0); next.Hash == t.Hash && next.TokenType == html.EndTagToken { omitSpace = false } } } } }