From 9b50151f17b5921b68b3c413a26edf8ec6cdc6f8 Mon Sep 17 00:00:00 2001 From: Daenney Date: Fri, 2 Aug 2024 18:22:39 +0200 Subject: [PATCH] [feature] Beef up our AI opt-outs (#3165) * [chore] Synchronise our robots.txt with upstream * [feature] Add headers to escape AI crawlers This adds 2 headers that a number of AI crawlers respect to signal that content should not be included in their datasets. --- internal/middleware/extraheaders.go | 7 +++++++ internal/web/robots.go | 9 +++++++++ 2 files changed, 16 insertions(+) diff --git a/internal/middleware/extraheaders.go b/internal/middleware/extraheaders.go index c75b65551..fb91bcc93 100644 --- a/internal/middleware/extraheaders.go +++ b/internal/middleware/extraheaders.go @@ -44,5 +44,12 @@ func ExtraHeaders() gin.HandlerFunc { // // See: https://github.com/patcg-individual-drafts/topics c.Header("Permissions-Policy", "browsing-topics=()") + + // Some AI scrapers respect the following tags to opt-out + // of their crawling and datasets. + c.Header("X-Robots-Tag", "noimageai") + // c.Header calls .Set(), but we want to emit the header + // twice, not override it. + c.Writer.Header().Add("X-Robots-Tag", "noai") } } diff --git a/internal/web/robots.go b/internal/web/robots.go index 39708eb55..3309de97c 100644 --- a/internal/web/robots.go +++ b/internal/web/robots.go @@ -43,15 +43,24 @@ User-agent: Claude-Web User-agent: cohere-ai User-agent: Diffbot User-agent: FacebookBot +User-agent: facebookexternalhit User-agent: FriendlyCrawler User-agent: Google-Extended User-agent: GoogleOther +User-agent: GoogleOther-Image +User-agent: GoogleOther-Video User-agent: GPTBot User-agent: ImagesiftBot User-agent: img2dataset +User-agent: Meta-ExternalAgent +User-agent: OAI-SearchBot User-agent: omgili User-agent: omgilibot User-agent: PerplexityBot +User-agent: PetalBot +User-agent: Scrapy +User-agent: Timpibot +User-agent: VelenPublicWebCrawler User-agent: YouBot Disallow: /