From dcab555a6b46de69a2c2a26f1fe409017ad229f3 Mon Sep 17 00:00:00 2001
From: Daenney
Date: Mon, 22 Apr 2024 11:01:37 +0200
Subject: [PATCH] [chore] Update robots.txt (#2856)

This updates the robots.txt based on the list from the ai.robots.txt
repository. We can look at automating that at some point.

It's worth pointing out that some robots, namely the ones by Bytedance,
are known to ignore robots.txt entirely.
---
 docs/admin/robots.md   | 13 +++++++++
 internal/web/robots.go | 64 ++++++++++++++++++++----------------------
 mkdocs.yml             |  1 +
 3 files changed, 45 insertions(+), 33 deletions(-)
 create mode 100644 docs/admin/robots.md

diff --git a/docs/admin/robots.md b/docs/admin/robots.md
new file mode 100644
index 000000000..b9e0468ce
--- /dev/null
+++ b/docs/admin/robots.md
@@ -0,0 +1,13 @@
+# Robots.txt
+
+GoToSocial serves a `robots.txt` file on the host domain. This file contains rules that attempt to block known AI scrapers, as well as some other indexers. It also includes some rules to ensure things like API endpoints aren't indexed by search engines, since there's no point in indexing them.
+
+## AI scrapers
+
+The AI scrapers come from a [community-maintained repository][airobots]. It's manually kept in sync for the time being. If you know of any missing robots, please send them a PR!
+
+A number of AI scrapers are known to ignore entries in `robots.txt` even when an entry explicitly matches their User-Agent. This means the `robots.txt` file is not a foolproof way of ensuring AI scrapers don't grab your content.
+
+If you want to block these scrapers fully, you'll need to block based on the User-Agent header in a reverse proxy until GoToSocial can filter requests by User-Agent header itself.
+
+[airobots]: https://github.com/ai-robots-txt/ai.robots.txt/
diff --git a/internal/web/robots.go b/internal/web/robots.go
index 2511ee1d3..58b541413 100644
--- a/internal/web/robots.go
+++ b/internal/web/robots.go
@@ -29,45 +29,43 @@ const (
 	robotsTxt = `# GoToSocial robots.txt -- to edit, see internal/web/robots.go
 # More info @ https://developers.google.com/search/docs/crawling-indexing/robots/intro
 
-# Before we commence, a giant fuck you to ChatGPT in particular.
-# https://platform.openai.com/docs/gptbot
-User-agent: GPTBot
-Disallow: /
-
-# As of September 2023, GPTBot and ChatGPT-User are equivalent. But there's no telling
-# when OpenAI might decide to change that, so block this one too.
-User-agent: ChatGPT-User
-Disallow: /
-
-# And a giant fuck you to Google Bard and their other generative AI ventures too.
-# https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers
-User-agent: Google-Extended
-Disallow: /
-
-# Block CommonCrawl. Used in training LLMs and specifically GPT-3.
-# https://commoncrawl.org/faq
+# AI scrapers and the like.
+# https://github.com/ai-robots-txt/ai.robots.txt/
+User-agent: AdsBot-Google
+User-agent: Amazonbot
+User-agent: anthropic-ai
+User-agent: Applebot
+User-agent: AwarioRssBot
+User-agent: AwarioSmartBot
+User-agent: Bytespider
 User-agent: CCBot
-Disallow: /
-
-# Block Omgilike/Webz.io, a "Big Web Data" engine.
-# https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/
-User-agent: Omgilibot
-Disallow: /
-
-# Block Faceboobot, because Meta.
-# https://developers.facebook.com/docs/sharing/bot
+User-agent: ChatGPT-User
+User-agent: ClaudeBot
+User-agent: Claude-Web
+User-agent: cohere-ai
+User-agent: DataForSeoBot
 User-agent: FacebookBot
+User-agent: FriendlyCrawler
+User-agent: Google-Extended
+User-agent: GoogleOther
+User-agent: GPTBot
+User-agent: ImagesiftBot
+User-agent: magpie-crawler
+User-agent: Meltwater
+User-agent: omgili
+User-agent: omgilibot
+User-agent: peer39_crawler
+User-agent: peer39_crawler/1.0
+User-agent: PerplexityBot
+User-agent: PiplBot
+User-agent: Seekr
+User-agent: YouBot
 Disallow: /
 
 # Well-known.dev crawler. Indexes stuff under /.well-known.
 # https://well-known.dev/about/
-User-agent: WellKnownBot
-Disallow: /
-
-# Block Amazonbot, because Amazon.
-# https://developer.amazon.com/amazonbot
-User-agent: Amazonbot
-Disallow: /
+User-agent: WellKnownBot
+Disallow: /
 
 # Rules for everything else.
 User-agent: *
diff --git a/mkdocs.yml b/mkdocs.yml
index 737e23a75..8d7ecc65a 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -118,6 +118,7 @@ nav:
     - "admin/signups.md"
     - "admin/federation_modes.md"
     - "admin/domain_blocks.md"
+    - "admin/robots.md"
     - "admin/cli.md"
     - "admin/backup_and_restore.md"
     - "admin/media_caching.md"
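
A note on the reverse-proxy blocking that docs/admin/robots.md recommends above: until GoToSocial can filter requests by User-Agent itself, the check has to live in front of it. Below is a minimal sketch of what that could look like as a standalone Go reverse proxy. This is not part of the patch: the agent list is a small illustrative subset of the ai.robots.txt list, and the backend address, port, and function names are assumptions made for illustration.

```go
package main

import (
	"log"
	"net/http"
	"net/http/httputil"
	"net/url"
	"strings"
)

// blockedAgents is an illustrative subset; in practice you would keep it
// in sync with the ai.robots.txt repository list used in robots.txt.
var blockedAgents = []string{
	"GPTBot",
	"ChatGPT-User",
	"CCBot",
	"Bytespider", // known to ignore robots.txt entirely
}

// denyAIScrapers returns 403 Forbidden for any request whose User-Agent
// contains one of the blocked substrings, and forwards the rest to next.
func denyAIScrapers(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		ua := r.UserAgent()
		for _, agent := range blockedAgents {
			if strings.Contains(ua, agent) {
				http.Error(w, "Forbidden", http.StatusForbidden)
				return
			}
		}
		next.ServeHTTP(w, r)
	})
}

func main() {
	// Assumption: GoToSocial is listening on 127.0.0.1:8080 behind this proxy.
	backend, err := url.Parse("http://127.0.0.1:8080")
	if err != nil {
		log.Fatal(err)
	}
	proxy := httputil.NewSingleHostReverseProxy(backend)
	log.Fatal(http.ListenAndServe(":8081", denyAIScrapers(proxy)))
}
```

Substring matching is deliberately loose here, since scrapers commonly append version numbers and URLs to their User-Agent strings. In practice, most admins would express the same rule in their existing nginx, Caddy, or Apache configuration rather than run an extra proxy.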