gotosocial/vendor/github.com/h2non/filetype/matchers/document.go
Tobi Smethurst 98263a7de6
Grand test fixup (#138)
* start fixing up tests

* fix up tests + automate with drone

* fiddle with linting

* messing about with drone.yml

* some more fiddling

* hmmm

* add cache

* add vendor directory

* verbose

* ci updates

* update some little things

* update sig
2021-08-12 21:03:24 +02:00

198 lines
4.4 KiB
Go

package matchers
import (
"bytes"
"encoding/binary"
)
// Document file types handled by the matchers in this file. Each value is
// built by newType from a file extension and a MIME type string.
// NOTE(review): newType is defined elsewhere in the package; the meaning of
// its two arguments is inferred from the literals passed here — confirm.
var (
// Legacy binary Microsoft Office formats.
TypeDoc = newType("doc", "application/msword")
// Office Open XML word-processing document.
TypeDocx = newType("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
TypeXls = newType("xls", "application/vnd.ms-excel")
// Office Open XML spreadsheet.
TypeXlsx = newType("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
TypePpt = newType("ppt", "application/vnd.ms-powerpoint")
// Office Open XML presentation.
TypePptx = newType("pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation")
)
// Document associates every document type declared in this file with the
// function that decides whether a byte buffer is of that type.
// NOTE(review): Map is declared elsewhere in the package; presumably it is a
// map from type to matcher function — confirm.
var Document = Map{
TypeDoc: Doc,
TypeDocx: Docx,
TypeXls: Xls,
TypeXlsx: Xlsx,
TypePpt: Ppt,
TypePptx: Pptx,
}
// docType is the internal classification produced while sniffing a document
// buffer.
type docType int
// The values are assigned sequentially from zero via iota, so TYPE_DOC is the
// zero value of docType. Reordering the list changes every value.
const (
TYPE_DOC docType = iota
TYPE_DOCX
TYPE_XLS
TYPE_XLSX
TYPE_PPT
TYPE_PPTX
// TYPE_OOXML is what msooxml reports when the buffer looks like an OOXML
// archive but none of the "word/", "ppt/" or "xl/" entries was recognised.
TYPE_OOXML
)
// Doc reports whether buf looks like a legacy Word (.doc) file.
//
// The buffer must begin with the bytes D0 CF 11 E0. When at least 514 bytes
// are available, bytes 512 and 513 must additionally be EC A5. With a shorter
// buffer only the four-byte prefix is checked, which is the same prefix the
// Xls and Ppt matchers test for.
//
// reference: https://bz.apache.org/ooo/show_bug.cgi?id=111457
func Doc(buf []byte) bool {
	if len(buf) < 4 {
		return false
	}
	if buf[0] != 0xD0 || buf[1] != 0xCF || buf[2] != 0x11 || buf[3] != 0xE0 {
		return false
	}
	// Too short to inspect offset 512: accept on the prefix alone.
	if len(buf) < 514 {
		return true
	}
	return buf[512] == 0xEC && buf[513] == 0xA5
}
// Docx reports whether buf is recognised by msooxml as a word-processing
// (TYPE_DOCX) Office Open XML archive.
func Docx(buf []byte) bool {
	if kind, found := msooxml(buf); found {
		return kind == TYPE_DOCX
	}
	return false
}
// Xls reports whether buf looks like a legacy Excel (.xls) file.
//
// The buffer must begin with the bytes D0 CF 11 E0. When at least 514 bytes
// are available, bytes 512 and 513 must additionally be 09 08. With a shorter
// buffer only the four-byte prefix is checked, which is the same prefix the
// Doc and Ppt matchers test for.
func Xls(buf []byte) bool {
	if len(buf) < 4 {
		return false
	}
	if buf[0] != 0xD0 || buf[1] != 0xCF || buf[2] != 0x11 || buf[3] != 0xE0 {
		return false
	}
	// Too short to inspect offset 512: accept on the prefix alone.
	if len(buf) < 514 {
		return true
	}
	return buf[512] == 0x09 && buf[513] == 0x08
}
// Xlsx reports whether buf is recognised by msooxml as a spreadsheet
// (TYPE_XLSX) Office Open XML archive.
func Xlsx(buf []byte) bool {
	if kind, found := msooxml(buf); found {
		return kind == TYPE_XLSX
	}
	return false
}
// Ppt reports whether buf looks like a legacy PowerPoint (.ppt) file.
//
// The buffer must begin with the bytes D0 CF 11 E0. When at least 514 bytes
// are available, bytes 512 and 513 must additionally be A0 46. With a shorter
// buffer only the four-byte prefix is checked, which is the same prefix the
// Doc and Xls matchers test for.
func Ppt(buf []byte) bool {
	if len(buf) < 4 {
		return false
	}
	if buf[0] != 0xD0 || buf[1] != 0xCF || buf[2] != 0x11 || buf[3] != 0xE0 {
		return false
	}
	// Too short to inspect offset 512: accept on the prefix alone.
	if len(buf) < 514 {
		return true
	}
	return buf[512] == 0xA0 && buf[513] == 0x46
}
// Pptx reports whether buf is recognised by msooxml as a presentation
// (TYPE_PPTX) Office Open XML archive.
func Pptx(buf []byte) bool {
	if kind, found := msooxml(buf); found {
		return kind == TYPE_PPTX
	}
	return false
}
// msooxml inspects buf for the layout of an Office Open XML package (a ZIP
// archive) and reports which document family it belongs to.
//
// found is false when buf does not start with a ZIP local file header, when
// the first entry name is not one of the names accepted below, or when a
// second or third local file header cannot be located. Once a third header
// has been located the archive is treated as OOXML: the specific type is
// returned when a "word/", "ppt/" or "xl/" entry name is seen at the third
// or fourth entry, and TYPE_OOXML otherwise.
func msooxml(buf []byte) (typ docType, found bool) {
	signature := []byte{'P', 'K', 0x03, 0x04}

	// start by checking for ZIP local file header signature
	if !compareBytes(buf, signature, 0) {
		return typ, false
	}

	// make sure the first file is correct; 0x1E is where the entry's file
	// name begins, directly after the 30-byte fixed local file header
	if v, ok := checkMSOoml(buf, 0x1E); ok {
		return v, true
	}

	if !compareBytes(buf, []byte("[Content_Types].xml"), 0x1E) &&
		!compareBytes(buf, []byte("_rels/.rels"), 0x1E) &&
		!compareBytes(buf, []byte("docProps"), 0x1E) {
		return typ, false
	}

	// skip to the second local file header
	// since some documents include a 520-byte extra field following the file
	// header, we need to scan for the next header
	//
	// Bytes 18..21 hold the first entry's compressed size. The name checks
	// above only succeed when buf is at least 38 bytes long, so this slice
	// expression is in range. The bound is evaluated in uint64 so that an
	// attacker-chosen size can neither wrap around in 32-bit arithmetic nor
	// turn into a negative int on platforms where int is 32 bits wide.
	// NOTE(review): 49 is presumably the 30-byte header plus the 19-byte
	// "[Content_Types].xml" name — confirm.
	compressedSize := uint64(binary.LittleEndian.Uint32(buf[18:22]))
	if compressedSize+49 >= uint64(len(buf)) {
		// Nothing left to scan; search would have reported no match.
		return typ, false
	}
	startOffset := int(compressedSize) + 49
	idx := search(buf, startOffset, 6000)
	if idx == -1 {
		return typ, false
	}

	// now skip to the *third* local file header; again, we need to scan due to a
	// 520-byte extra field following the file header
	// (idx is relative to startOffset; +4 steps over the signature and +26
	// over the remainder of the fixed header, landing on the entry name)
	startOffset += idx + 4 + 26
	idx = search(buf, startOffset, 6000)
	if idx == -1 {
		return typ, false
	}

	// and check the subdirectory name to determine which type of OOXML
	// file we have. Correct the mimetype with the registered ones:
	// http://technet.microsoft.com/en-us/library/cc179224.aspx
	startOffset += idx + 4 + 26
	if v, ok := checkMSOoml(buf, startOffset); ok {
		return v, true
	}

	// OpenOffice/Libreoffice orders ZIP entry differently, so check the 4th file
	startOffset += 26
	idx = search(buf, startOffset, 6000)
	if idx == -1 {
		return TYPE_OOXML, true
	}

	startOffset += idx + 4 + 26
	if v, ok := checkMSOoml(buf, startOffset); ok {
		return v, true
	}
	return TYPE_OOXML, true
}
// compareBytes reports whether subSlice occurs in slice exactly at
// startOffset.
//
// It returns false, rather than panicking, when startOffset is negative or
// when subSlice would extend past the end of slice. An empty subSlice matches
// at any offset from 0 through len(slice).
func compareBytes(slice, subSlice []byte, startOffset int) bool {
	// Written as a subtraction so that a very large startOffset cannot
	// overflow the way startOffset+len(subSlice) could.
	if startOffset < 0 || startOffset > len(slice)-len(subSlice) {
		return false
	}
	return bytes.Equal(slice[startOffset:startOffset+len(subSlice)], subSlice)
}
// checkMSOoml looks at the bytes of buf starting at offset and maps a leading
// "word/", "ppt/" or "xl/" to TYPE_DOCX, TYPE_PPTX or TYPE_XLSX respectively.
// ok is false, and typ is the zero docType, when none of them is present.
func checkMSOoml(buf []byte, offset int) (typ docType, ok bool) {
	candidates := [...]struct {
		dir  string
		kind docType
	}{
		{"word/", TYPE_DOCX},
		{"ppt/", TYPE_PPTX},
		{"xl/", TYPE_XLSX},
	}
	for _, c := range candidates {
		if compareBytes(buf, []byte(c.dir), offset) {
			return c.kind, true
		}
	}
	return typ, false
}
// search looks for the ZIP local file header signature ("PK\x03\x04") in buf,
// scanning a window of at most rangeNum bytes that begins at offset start.
//
// The returned index is relative to start. It is -1 when the signature does
// not occur entirely inside the window, or when the window is empty or does
// not lie inside buf (negative start, start at or beyond len(buf), or a
// non-positive rangeNum).
func search(buf []byte, start, rangeNum int) int {
	if start < 0 || rangeNum <= 0 || start >= len(buf) {
		return -1
	}
	// Clamp the window to the buffer. Comparing against the remaining length
	// avoids computing start+rangeNum when that sum could overflow.
	end := len(buf)
	if rangeNum < end-start {
		end = start + rangeNum
	}
	return bytes.Index(buf[start:end], []byte{'P', 'K', 0x03, 0x04})
}