From ea8ad8b346978b04b067eead8e1f2bbc3c1bfb45 Mon Sep 17 00:00:00 2001 From: Tobi Smethurst <31960611+tsmethurst@users.noreply.github.com> Date: Wed, 28 Jul 2021 11:42:26 +0200 Subject: [PATCH] Link parsing (#120) * add link parsing + formatting functionality * refinement + docs * add missing test * credit url library --- README.md | 1 + docs/user_guide/posts.md | 149 +++++++++++++++++++++++++++ docs/user_guide/writing_posts.md | 48 --------- go.mod | 1 + go.sum | 4 + internal/processing/status/util.go | 9 +- internal/text/link.go | 115 +++++++++++++++++++++ internal/text/link_test.go | 155 +++++++++++++++++++++++++++++ internal/text/plain.go | 3 + 9 files changed, 434 insertions(+), 51 deletions(-) create mode 100644 docs/user_guide/posts.md delete mode 100644 docs/user_guide/writing_posts.md create mode 100644 internal/text/link.go create mode 100644 internal/text/link_test.go diff --git a/README.md b/README.md index 734f81619..7e11a8ac6 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,7 @@ The following libraries and frameworks are used by GoToSocial, with gratitude * [gorilla/websocket](https://github.com/gorilla/websocket); Websocket connectivity. [BSD-2-Clause License](https://spdx.org/licenses/BSD-2-Clause.html). * [h2non/filetype](https://github.com/h2non/filetype); filetype checking. [MIT License](https://spdx.org/licenses/MIT.html). * [microcosm-cc/bluemonday](https://github.com/microcosm-cc/bluemonday); HTML user-input sanitization. [BSD-3-Clause License](https://spdx.org/licenses/BSD-3-Clause.html). +* [mvdan/xurls](https://github.com/mvdan/xurls); URL parsing regular expressions. [BSD-3-Clause License](https://spdx.org/licenses/BSD-3-Clause.html). * [nfnt/resize](https://github.com/nfnt/resize); convenient image resizing. [ISC License](https://spdx.org/licenses/ISC.html). * [oklog/ulid](https://github.com/oklog/ulid); sequential, database-friendly ID generation. [Apache-2.0 License](https://spdx.org/licenses/Apache-2.0.html). 
* [sirupsen/logrus](https://github.com/sirupsen/logrus); logging. [MIT License](https://spdx.org/licenses/MIT.html). diff --git a/docs/user_guide/posts.md b/docs/user_guide/posts.md new file mode 100644 index 000000000..4376d6c76 --- /dev/null +++ b/docs/user_guide/posts.md @@ -0,0 +1,149 @@ +# Posts + +## Input Types + +GoToSocial currently accepts two different types of input for posts. These are: + +* `plain` +* `markdown` + +Plain is the default method of posting: GtS accepts some plain looking text, and converts it into some nice HTML by parsing links and mentions etc. If you're used to Mastodon or Twitter or most other social media platforms, this way of writing posts will be immediately familiar. + +Markdown is a more complex way of organizing text, which gives you more control over how your text is parsed and formatted. + +For more information on markdown, see [The Markdown Guide](https://www.markdownguide.org/). + +## Formatting + +When a post is submitted in `plain` format, GoToSocial automatically does some tidying up and formatting of the post in order to convert it to HTML, as described below. + +### Whitespace + +Any leading or trailing whitespaces and newlines are removed from the post. So for example: + +```text + + +this post starts with some newlines +``` + +will become: + +```text +this post starts with some newlines +``` + +### Wrapping + +The whole post will be wrapped in `

<p></p>`. + +So the following text: + +```text +Hi here's a little post! +``` + +Will become: + +```html +

<p>Hi here's a little post!</p>

+``` + +### Linebreaks + +Any newlines will be replaced with `
<br />` + +So to continue the above example: + +```text +Hi here's a little post! + +And here's another line. +``` + +Will become: + +```html +

<p>Hi here's a little post!<br /><br />And here's another line.</p>

+``` + +### Links + +Any recognizable links in the text will be shortened and turned into proper hyperlinks, and have some additional attributes added to them. + +For example: + +```text +Here's a link to something: https://example.org/some/link/address +``` + +will become: + +```html +Here's a link to something: example.org/some/link/address +``` + +which will be rendered as: + +> Here's a link to something: [example.org/some/link/address](https://example.org/some/link/address) + +Note that this will only work for `http` and `https` links; other schemes are not supported. + +### Mentions + +You can 'mention' another account by referring to the account in the following way: + +> @some_account@example.org + +In this example, `some_account` is the username of the account you want to mention, and `example.org` is the domain that hosts their account. + +The mentioned account will get a notification that you've mentioned them, and be able to see the post in which they were mentioned. + +Mentions are formatted in a similar way to links, so: + +```text +hi @some_account@example.org how's it going? +``` + +will become: + +```html +hi @some_account how's it going? +``` + +which will be rendered as: + +> hi @some_account how's it going? + +When mentioning local accounts (ie., accounts on your instance), the second part of the mention is not necessary. 
If there's an account with username `local_account_person` on your instance, you can mention them just by writing: + +```text +hey @local_account_person you're my neighbour +``` + +This will become: + +```html +hey @local_account_person you're my neighbour +``` + +which will be rendered as: + +> hey @local_account_person you're my neighbour + +## Input Sanitization + +In order not to spread scripts, vulnerabilities, and glitchy HTML all over the place, GoToSocial performs the following types of input sanitization: + +`plain` input type: + +* Before parsing, any existing HTML is completely removed from the post body and content-warning fields. +* After parsing, all generated HTML is run through a sanitizer to remove harmful elements. + +`markdown` input type: + +* Before parsing, any existing HTML is completely removed from the content-warning field. +* Before parsing, any existing HTML in the post body is run through a sanitizer to remove harmful elements. +* After parsing, all generated HTML is run through a sanitizer to remove harmful elements. + +GoToSocial uses [bluemonday](https://github.com/microcosm-cc/bluemonday) for HTML sanitization. diff --git a/docs/user_guide/writing_posts.md b/docs/user_guide/writing_posts.md deleted file mode 100644 index 7318242f5..000000000 --- a/docs/user_guide/writing_posts.md +++ /dev/null @@ -1,48 +0,0 @@ -# Writing Posts - -TODO - -## Formatting - -This section describes the different post input types accepted by GoToSocial, and the method GtS uses to parse text into HTML. - -### Links - -Any recognized links in the text will be shortened and turned into proper hyperlinks. 
For example: - -> Here's a link to something: https://example.org/some/link/address - -will become: - -> Here's a link to something: [example.org/some/link/address](https://example.org/some/link/address) - -### Mentions - -You can 'mention' another account by referring to the account in the following way: - -> @some_account@example.org - -In this example, `some_account` is the username of the account you want to mention, and `example.org` is the domain that hosts their account. - -The mentioned account will get a notification that you've mentioned them, and be able to see the post in which they were mentioned. - -Mentions are formatted in a similar way to links, so: - -> @some_account@example.org - -will become: - -> @some_account - -## Input Types - -GoToSocial currently accepts two different types of input. These are: - -* `plain` -* `markdown` - -Plain is the default method of posting: GtS accepts some plain looking text, and converts it into some nice HTML by parsing links and mentions etc. - -Markdown is a more complex way of organizing text, which gives you more control over how your text is parsed and formatted. - -For more information on markdown, see [The Markdown Guide](https://www.markdownguide.org/). 
diff --git a/go.mod b/go.mod index bd7476d34..48febd4a6 100644 --- a/go.mod +++ b/go.mod @@ -55,4 +55,5 @@ require ( gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect gopkg.in/yaml.v2 v2.4.0 gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect + mvdan.cc/xurls/v2 v2.3.0 ) diff --git a/go.sum b/go.sum index 731a659ef..e3599fa9d 100644 --- a/go.sum +++ b/go.sum @@ -301,6 +301,7 @@ github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1y github.com/onsi/gomega v1.10.3/go.mod h1:V9xEwhxec5O8UDM77eCW8vLymOMltsqPVYWrpDsH8xc= github.com/onsi/gomega v1.14.0 h1:ep6kpPVwmr/nTbklSx2nrLNSIO62DoYAhnPNIMhK8gI= github.com/onsi/gomega v1.14.0/go.mod h1:cIuvLEne0aoVhAgh/O6ac0Op8WWw9H6eYCriF+tEHG0= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= @@ -308,6 +309,7 @@ github.com/quasoft/memstore v0.0.0-20180925164028-84a050167438/go.mod h1:wTPjTep github.com/quasoft/memstore v0.0.0-20191010062613-2bce066d2b0b h1:aUNXCGgukb4gtY99imuIeoh8Vr0GSwAlYxPAhqZrpFc= github.com/quasoft/memstore v0.0.0-20191010062613-2bce066d2b0b/go.mod h1:wTPjTepVu7uJBYgZ0SdWHQlIas582j6cn2jgk4DDdlg= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -729,6 +731,8 @@ honnef.co/go/tools 
v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9 honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= mellium.im/sasl v0.2.1 h1:nspKSRg7/SyO0cRGY71OkfHab8tf9kCts6a6oTDut0w= mellium.im/sasl v0.2.1/go.mod h1:ROaEDLQNuf9vjKqE1SrAfnsobm2YKXT1gnN1uDp1PjQ= +mvdan.cc/xurls/v2 v2.3.0 h1:59Olnbt67UKpxF1EwVBopJvkSUBmgtb468E4GVWIZ1I= +mvdan.cc/xurls/v2 v2.3.0/go.mod h1:AjuTy7gEiUArFMjgBBDU4SMxlfUYsRokpJQgNWOt3e4= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= diff --git a/internal/processing/status/util.go b/internal/processing/status/util.go index b4d115f8d..f85e05478 100644 --- a/internal/processing/status/util.go +++ b/internal/processing/status/util.go @@ -8,6 +8,7 @@ import ( "github.com/superseriousbusiness/gotosocial/internal/db" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" "github.com/superseriousbusiness/gotosocial/internal/id" + "github.com/superseriousbusiness/gotosocial/internal/text" "github.com/superseriousbusiness/gotosocial/internal/util" ) @@ -248,13 +249,15 @@ func (p *processor) processContent(form *apimodel.AdvancedStatusCreateForm, acco form.Format = apimodel.StatusFormatDefault } + // remove any existing html from the status + content := text.RemoveHTML(form.Status) + // parse content out of the status depending on what format has been submitted - var content string switch form.Format { case apimodel.StatusFormatPlain: - content = p.formatter.FromPlain(form.Status, status.GTSMentions, status.GTSTags) + content = p.formatter.FromPlain(content, status.GTSMentions, status.GTSTags) case apimodel.StatusFormatMarkdown: - content = p.formatter.FromMarkdown(form.Status, status.GTSMentions, status.GTSTags) + content = p.formatter.FromMarkdown(content, status.GTSMentions, status.GTSTags) default: return 
fmt.Errorf("format %s not recognised as a valid status format", form.Format) } diff --git a/internal/text/link.go b/internal/text/link.go new file mode 100644 index 000000000..440571a83 --- /dev/null +++ b/internal/text/link.go @@ -0,0 +1,115 @@ +/* + GoToSocial + Copyright (C) 2021 GoToSocial Authors admin@gotosocial.org + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . +*/ + +package text + +import ( + "fmt" + "net/url" + + "mvdan.cc/xurls/v2" +) + +// schemes is the regex for schemes we accept when looking for links. +// Basically, we accept https or http. +var schemes = `(((http|https))://)` + +// FindLinks parses the given string looking for recognizable URLs (including scheme). +// It returns a list of those URLs, without changing the string, or an error if something goes wrong. +// If no URLs are found within the given string, an empty slice and nil will be returned. 
+func FindLinks(in string) ([]*url.URL, error) { + rxStrict, err := xurls.StrictMatchingScheme(schemes) + if err != nil { + return nil, err + } + + urls := []*url.URL{} + + // bail already if we don't find anything + found := rxStrict.FindAllString(in, -1) + if len(found) == 0 { + return urls, nil + } + + // for each string we find, we want to parse it into a URL if we can + // if we fail to parse it, just ignore this match and continue + for _, f := range found { + u, err := url.Parse(f) + if err != nil { + continue + } + urls = append(urls, u) + } + + // deduplicate the URLs + urlsDeduped := []*url.URL{} + + for _, u := range urls { + if !contains(urlsDeduped, u) { + urlsDeduped = append(urlsDeduped, u) + } + } + + return urlsDeduped, nil +} + +// contains checks if the given url is already within a slice of URLs +func contains(urls []*url.URL, url *url.URL) bool { + for _, u := range urls { + if u.String() == url.String() { + return true + } + } + return false +} + +// ReplaceLinks replaces all detected links in a piece of text with their HTML (href) equivalents. +// Note: because Go doesn't allow negative lookbehinds in regex, it's possible that an already-formatted +// href will end up double-formatted, if the text you pass here contains one or more hrefs already. +// To avoid this, you should sanitize any HTML out of text before you pass it into this function. +func ReplaceLinks(in string) string { + rxStrict, err := xurls.StrictMatchingScheme(schemes) + if err != nil { + panic(err) + } + + replaced := rxStrict.ReplaceAllStringFunc(in, func(urlString string) string { + thisURL, err := url.Parse(urlString) + if err != nil { + return urlString // we can't parse it as a URL so don't replace it + } + + shortString := thisURL.Hostname() + + if thisURL.Path != "" { + shortString = shortString + thisURL.Path + } + + if thisURL.Fragment != "" { + shortString = shortString + "#" + thisURL.Fragment + } + + if thisURL.RawQuery != "" { + shortString = shortString + "?" 
+ thisURL.RawQuery + } + + replacement := fmt.Sprintf(`%s`, urlString, shortString) + return replacement + }) + return replaced +} diff --git a/internal/text/link_test.go b/internal/text/link_test.go new file mode 100644 index 000000000..636f26f7f --- /dev/null +++ b/internal/text/link_test.go @@ -0,0 +1,155 @@ +/* + GoToSocial + Copyright (C) 2021 GoToSocial Authors admin@gotosocial.org + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . +*/ + +package text_test + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/suite" + "github.com/superseriousbusiness/gotosocial/internal/text" +) + +const text1 = ` +This is a text with some links in it. Here's link number one: https://example.org/link/to/something#fragment + +Here's link number two: http://test.example.org?q=bahhhhhhhhhhhh + +https://another.link.example.org/with/a/pretty/long/path/at/the/end/of/it + +really.cool.website <-- this one shouldn't be parsed as a link because it doesn't contain the scheme + +https://example.orghttps://google.com <-- this shouldn't work either, but it does?! 
OK +` + +const text2 = ` +this is one link: https://example.org + +this is the same link again: https://example.org + +these should be deduplicated +` + +const text3 = ` +here's a mailto link: mailto:whatever@test.org +` + +const text4 = ` +two similar links: + +https://example.org + +https://example.org/test +` + +const text5 = ` +what happens when we already have a link within an href? + +https://example.org +` + +type TextTestSuite struct { + suite.Suite +} + +func (suite *TextTestSuite) TestParseURLsFromText1() { + urls, err := text.FindLinks(text1) + + assert.NoError(suite.T(), err) + + assert.Equal(suite.T(), "https://example.org/link/to/something#fragment", urls[0].String()) + assert.Equal(suite.T(), "http://test.example.org?q=bahhhhhhhhhhhh", urls[1].String()) + assert.Equal(suite.T(), "https://another.link.example.org/with/a/pretty/long/path/at/the/end/of/it", urls[2].String()) + assert.Equal(suite.T(), "https://example.orghttps://google.com", urls[3].String()) +} + +func (suite *TextTestSuite) TestParseURLsFromText2() { + urls, err := text.FindLinks(text2) + assert.NoError(suite.T(), err) + + // assert length 1 because the found links will be deduplicated + assert.Len(suite.T(), urls, 1) +} + +func (suite *TextTestSuite) TestParseURLsFromText3() { + urls, err := text.FindLinks(text3) + assert.NoError(suite.T(), err) + + // assert length 0 because `mailto:` isn't accepted + assert.Len(suite.T(), urls, 0) +} + +func (suite *TextTestSuite) TestReplaceLinksFromText1() { + replaced := text.ReplaceLinks(text1) + assert.Equal(suite.T(), ` +This is a text with some links in it. Here's link number one: example.org/link/to/something#fragment + +Here's link number two: test.example.org?q=bahhhhhhhhhhhh + +another.link.example.org/with/a/pretty/long/path/at/the/end/of/it + +really.cool.website <-- this one shouldn't be parsed as a link because it doesn't contain the scheme + +example.orghttps//google.com <-- this shouldn't work either, but it does?! 
OK +`, replaced) +} + +func (suite *TextTestSuite) TestReplaceLinksFromText2() { + replaced := text.ReplaceLinks(text2) + assert.Equal(suite.T(), ` +this is one link: example.org + +this is the same link again: example.org + +these should be deduplicated +`, replaced) +} + +func (suite *TextTestSuite) TestReplaceLinksFromText3() { + // we know mailto links won't be replaced with hrefs -- we only accept https and http + replaced := text.ReplaceLinks(text3) + assert.Equal(suite.T(), ` +here's a mailto link: mailto:whatever@test.org +`, replaced) +} + +func (suite *TextTestSuite) TestReplaceLinksFromText4() { + replaced := text.ReplaceLinks(text4) + assert.Equal(suite.T(), ` +two similar links: + +example.org + +example.org/test +`, replaced) +} + +func (suite *TextTestSuite) TestReplaceLinksFromText5() { + // we know this one doesn't work properly, which is why html should always be sanitized before being passed into the ReplaceLinks function + replaced := text.ReplaceLinks(text5) + assert.Equal(suite.T(), ` +what happens when we already have a link within an href? + +example.org">example.org +`, replaced) +} + +func TestTextTestSuite(t *testing.T) { + suite.Run(t, new(TextTestSuite)) +} diff --git a/internal/text/plain.go b/internal/text/plain.go index 24ef16f8e..4f6659484 100644 --- a/internal/text/plain.go +++ b/internal/text/plain.go @@ -28,6 +28,9 @@ import ( func (f *formatter) FromPlain(plain string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag) string { content := preformat(plain) + // format links nicely + content = ReplaceLinks(content) + // format mentions nicely for _, menchie := range mentions { targetAccount := >smodel.Account{}