# ==============================================================================
# robots.txt — coglyai.com
# Last updated: 2026-06-03
# ==============================================================================
# Strategy:
#   - Search engines (Google, Bing, etc.)  → full access to public content
#   - AI crawlers used for CITATIONS/RAG   → full access (GEO: appear in AI answers)
#   - AI crawlers used for TRAINING ONLY   → blocked (no free data for training)
#   - Sensitive / technical paths          → blocked for all
# ==============================================================================


# ------------------------------------------------------------------------------
# 1. DEFAULT — All crawlers not explicitly listed below
# ------------------------------------------------------------------------------
# A crawler obeys only the single most specific group matching its user-agent
# token, so this group applies solely to crawlers without a named group.
#
# The rules are kept contiguous (no blank line inside the group): parsers that
# follow the original 1994 convention treat a blank line as the end of a record
# and would drop any rules placed after it.
#
# Disallow lines come first so that first-match parsers reach them before the
# catch-all Allow; longest-match parsers (RFC 9309) are order-independent.
# "Allow: /" already covers /blog/, /en/, /contact, /privacy-policy and their
# /en/ variants, so those paths need no lines of their own.
User-agent: *
Disallow: /404
Disallow: /en/404
Disallow: /cdn-cgi/
Allow: /

# Sitemap directives do not belong to any group; they apply to every crawler.
Sitemap: https://coglyai.com/sitemap_fr.xml
Sitemap: https://coglyai.com/sitemap_en-US.xml


# ------------------------------------------------------------------------------
# 2. GOOGLE
# ------------------------------------------------------------------------------
# A named group REPLACES the "*" group for that crawler (rules are not merged),
# so the technical-path exclusions are repeated here. With only "Allow: /",
# Google's crawlers would be free to fetch /404, /en/404 and /cdn-cgi/.
User-agent: Googlebot
User-agent: Googlebot-Image
Disallow: /404
Disallow: /en/404
Disallow: /cdn-cgi/
Allow: /

# Google-Extended is a usage-control token, not a separate crawler. It decides
# whether content Google has crawled may be used to train Gemini models and for
# grounding in Gemini Apps / Vertex AI. It has no effect on Google Search,
# which includes AI Overviews (those are governed by the Googlebot rules above).
# ALLOW: we want CoglyAI content available for grounding in Gemini answers.
# NOTE(review): allowing this token also permits Gemini training use, which is
# in tension with the "training → blocked" strategy — confirm the trade-off.
User-agent: Google-Extended
Disallow: /404
Disallow: /en/404
Disallow: /cdn-cgi/
Allow: /


# ------------------------------------------------------------------------------
# 3. BING / MICROSOFT (plus OpenAI's search crawler)
# ------------------------------------------------------------------------------
# A named group REPLACES the "*" group for that crawler (rules are not merged),
# so the technical-path exclusions are repeated here.
#
#   Bingbot        Microsoft Bing search crawler
#   OAI-SearchBot  OpenAI's crawler for ChatGPT search results. It is operated
#                  by OpenAI, not by Bing/Microsoft. Allowed so that pages can
#                  be surfaced and linked in ChatGPT search answers.
User-agent: Bingbot
User-agent: OAI-SearchBot
Disallow: /404
Disallow: /en/404
Disallow: /cdn-cgi/
Allow: /


# ------------------------------------------------------------------------------
# 4. AI CRAWLERS — CITATION / RAG USE (ALLOW — GEO strategy)
# ------------------------------------------------------------------------------
# One shared group: every agent listed below receives the same rules.
# A named group REPLACES the "*" group for that crawler (rules are not merged),
# so the technical-path exclusions are repeated here.
#
#   ChatGPT-User        OpenAI — ChatGPT browsing and citations
#   PerplexityBot       Perplexity AI — citation crawler
#   Perplexity-User     Perplexity AI — user-initiated fetches
#   Claude-User         Anthropic — user-initiated web retrieval for answers
#   Claude-SearchBot    Anthropic — search indexing used for Claude answers
#   YouBot              You.com — AI search
#   cohere-ai           Cohere — enterprise AI search
#   Meta-ExternalAgent  Meta AI (Llama-based products)
#   Brave               Brave Search AI
#
# NOTE(review): Meta describes Meta-ExternalAgent as also collecting data for
# AI model training — confirm it belongs in the allow list.
# NOTE(review): Brave Search reportedly does not crawl under a distinct
# user-agent token, so the "Brave" entry may have no effect — verify.
User-agent: ChatGPT-User
User-agent: PerplexityBot
User-agent: Perplexity-User
User-agent: Claude-User
User-agent: Claude-SearchBot
User-agent: YouBot
User-agent: cohere-ai
User-agent: Meta-ExternalAgent
User-agent: Brave
Disallow: /404
Disallow: /en/404
Disallow: /cdn-cgi/
Allow: /

# ClaudeBot is Anthropic's crawler for collecting model-training data; answer
# retrieval and search use Claude-User and Claude-SearchBot (allowed above).
# Blocked in line with the "training → blocked" strategy. It is kept next to
# its sibling agents so the Anthropic policy can be read in one place.
User-agent: ClaudeBot
Disallow: /


# ------------------------------------------------------------------------------
# 5. AI CRAWLERS — TRAINING ONLY (BLOCK — protect content from bulk ingestion)
# ------------------------------------------------------------------------------

# Training-data and bulk-harvesting crawlers — denied the whole site.
#
#   GPTBot             OpenAI — GPT training dataset collection
#   CCBot              Common Crawl — open dataset used to train most LLMs
#   Diffbot            Diffbot — training data and knowledge graphs
#   Bytespider         ByteDance / TikTok — AI training
#   Amazonbot          Amazon — Alexa crawler (training)
#   Applebot-Extended  Apple — AI training
#   omgili, omgilibot  Omgili / Webz.io — bulk data harvesting
#   DataForSeoBot      DataForSEO — commercial scraper
User-agent: GPTBot
User-agent: CCBot
User-agent: Diffbot
User-agent: Bytespider
User-agent: Amazonbot
User-agent: Applebot-Extended
User-agent: omgili
User-agent: omgilibot
User-agent: DataForSeoBot
Disallow: /

# SEO tool crawlers — Semrush, Ahrefs, Majestic (MJ12bot), Moz (DotBot).
# Denied to protect crawl budget. Delete this group to re-enable monitoring
# by these SEO tools.
User-agent: SemrushBot
User-agent: AhrefsBot
User-agent: MJ12bot
User-agent: DotBot
Disallow: /


# ------------------------------------------------------------------------------
# 6. ALWAYS BLOCKED — Malicious / spam / scraper agents
# ------------------------------------------------------------------------------
# One shared group: each agent named here is denied the entire site.
User-agent: MegaIndex
User-agent: ZoominfoBot
User-agent: PetalBot
User-agent: serpstatbot
User-agent: ia_archiver
Disallow: /


# ------------------------------------------------------------------------------
# 7. APP SUBDOMAIN — handled by app.coglyai.com's own robots.txt
# ------------------------------------------------------------------------------
# Note: app.coglyai.com requires authentication.
# Its own robots.txt should Disallow: / for all agents.
# This file does NOT govern that subdomain.