gotosocial/internal/web/robots.go
Daenney dcab555a6b
[chore] Update robots.txt (#2856)
This updates the robots.txt based on the list maintained in the
ai.robots.txt repository. We can look at automating that at some point.

It's worth pointing out that some robots, namely the ones by Bytedance,
are known to ignore robots.txt entirely.
2024-04-22 11:01:37 +02:00

111 lines
3 KiB
Go

// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package web
import (
"net/http"
"github.com/gin-gonic/gin"
)
// robotsPath is the request path of the robots.txt file
// (presumably the route robotsGETHandler is mounted on;
// the registration itself lives outside this file).
const robotsPath = "/robots.txt"

// robotsMetaAllowSome is a comma-separated list of robots
// meta tag rules. For the meaning of each rule, see:
// https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta
const robotsMetaAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard"

// robotsTxt is the complete robots.txt body written out
// verbatim by robotsGETHandler. It contains three groups:
//
//   - a list of AI scraper user agents, disallowed entirely;
//   - the well-known.dev crawler, disallowed entirely;
//   - a catch-all group with a crawl delay and a set of
//     disallowed path prefixes (api, auth, well-known,
//     fileserver, S2S endpoints, settings panels, blocklist).
//
// This is a raw string literal: every byte between the
// backticks, including the "#" comment lines, is served.
const robotsTxt = `# GoToSocial robots.txt -- to edit, see internal/web/robots.go
# More info @ https://developers.google.com/search/docs/crawling-indexing/robots/intro
# AI scrapers and the like.
# https://github.com/ai-robots-txt/ai.robots.txt/
User-agent: AdsBot-Google
User-agent: Amazonbot
User-agent: anthropic-ai
User-agent: Applebot
User-agent: AwarioRssBot
User-agent: AwarioSmartBot
User-agent: Bytespider
User-agent: CCBot
User-agent: ChatGPT-User
User-agent: ClaudeBot
User-agent: Claude-Web
User-agent: cohere-ai
User-agent: DataForSeoBot
User-agent: FacebookBot
User-agent: FriendlyCrawler
User-agent: Google-Extended
User-agent: GoogleOther
User-agent: GPTBot
User-agent: ImagesiftBot
User-agent: magpie-crawler
User-agent: Meltwater
User-agent: omgili
User-agent: omgilibot
User-agent: peer39_crawler
User-agent: peer39_crawler/1.0
User-agent: PerplexityBot
User-agent: PiplBot
User-agent: Seekr
User-agent: YouBot
Disallow: /
# Well-known.dev crawler. Indexes stuff under /.well-known.
# https://well-known.dev/about/
User-agent: WellKnownBot
Disallow: /
# Rules for everything else.
User-agent: *
Crawl-delay: 500
# API endpoints.
Disallow: /api/
# Auth/Sign in endpoints.
Disallow: /auth/
Disallow: /oauth/
Disallow: /check_your_email
Disallow: /wait_for_approval
Disallow: /account_disabled
Disallow: /signup
# Well-known endpoints.
Disallow: /.well-known/
# Fileserver/media.
Disallow: /fileserver/
# Fedi S2S API endpoints.
Disallow: /users/
Disallow: /emoji/
# Settings panels.
Disallow: /admin
Disallow: /user
Disallow: /settings/
# Domain blocklist.
Disallow: /about/suspended`
// robotsGETHandler serves the instance's robots.txt.
//
// The response body is the robotsTxt constant, sent with a
// 200 OK status; the rules it carries keep crawlers away
// from the api, auth pages, settings pages, etc.
//
// Finer-grained robots meta tags are applied separately to
// web pages, according to user preferences (see internal/web).
func (m *Module) robotsGETHandler(c *gin.Context) {
	// No format arguments are passed, so robotsTxt
	// is handed to gin as the sole string argument.
	const status = http.StatusOK
	c.String(status, robotsTxt)
}