← Retour au lexique
⚙️ SEO Technique

XML Sitemap

Fichier XML listant les URLs d'un site web pour faciliter leur découverte et indexation par les moteurs de recherche.

Définition

Un sitemap XML est un fichier structuré qui liste toutes les URLs importantes d’un site web, fournissant aux moteurs de recherche une carte complète du contenu à explorer et indexer. Ce protocole standardisé inclut des métadonnées sur chaque URL comme la date de dernière modification, la fréquence de mise à jour et la priorité relative, optimisant ainsi l’efficacité du crawl et l’indexation.

Structure et syntaxe

Format XML standard

<?xml version="1.0" encoding="UTF-8"?>
<!-- Example URL set: the default namespace is the sitemaps.org 0.9 schema; the
     image: and video: prefixes bind the Google image and video extension schemas. -->
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:image="http://www.google.com/schemas/sitemap-image/1.1"
        xmlns:video="http://www.google.com/schemas/sitemap-video/1.1">
    
    <!-- Entry using every optional tag: last modification date, change
         frequency and relative priority, plus an attached image -->
    <url>
        <loc>https://www.example.com/page-importante</loc>
        <lastmod>2024-12-01T10:30:00+00:00</lastmod>
        <changefreq>weekly</changefreq>
        <priority>0.8</priority>
        
        <!-- Images associated with this page -->
        <image:image>
            <image:loc>https://www.example.com/image.jpg</image:loc>
            <image:title>Titre de l'image</image:title>
            <image:caption>Description de l'image</image:caption>
        </image:image>
    </url>
    
    <!-- Minimal entry (only loc is required) -->
    <url>
        <loc>https://www.example.com/autre-page</loc>
    </url>
    
    <!-- Page embedding a video: thumbnail, title, description, media file
         location and duration -->
    <url>
        <loc>https://www.example.com/page-video</loc>
        <video:video>
            <video:thumbnail_loc>https://www.example.com/thumb.jpg</video:thumbnail_loc>
            <video:title>Titre de la vidéo</video:title>
            <video:description>Description de la vidéo</video:description>
            <video:content_loc>https://www.example.com/video.mp4</video:content_loc>
            <video:duration>600</video:duration>
        </video:video>
    </url>
</urlset>

Sitemap index pour grands sites

<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <!-- Index for sites split across several sitemaps: each <sitemap> entry
         gives the location of one child sitemap file and when it last changed -->
    <sitemap>
        <loc>https://www.example.com/sitemap-posts-2024.xml</loc>
        <lastmod>2024-12-01T12:00:00+00:00</lastmod>
    </sitemap>
    
    <sitemap>
        <loc>https://www.example.com/sitemap-products.xml</loc>
        <lastmod>2024-12-01T11:00:00+00:00</lastmod>
    </sitemap>
    
    <sitemap>
        <loc>https://www.example.com/sitemap-categories.xml</loc>
        <lastmod>2024-11-30T23:00:00+00:00</lastmod>
    </sitemap>
    
    <sitemap>
        <loc>https://www.example.com/sitemap-images.xml</loc>
        <lastmod>2024-12-01T10:00:00+00:00</lastmod>
    </sitemap>
</sitemapindex>

Génération dynamique

Implémentation PHP

<?php
// Générateur sitemap XML dynamique
/**
 * Builds a sitemaps.org <urlset> document in memory and writes it to disk,
 * together with a gzip-compressed copy.
 *
 * NOTE(review): generateFromDatabase() calls getPosts(), getProducts() and
 * getCategories(), none of which is defined in this class as shown. They must
 * be supplied (for example by a subclass or a data-access layer) before that
 * method can run.
 */
class XMLSitemapGenerator {
    /** @var string Site origin, stored without a trailing slash. */
    private $domain;

    /** @var array[] Collected entries with keys loc, lastmod, changefreq, priority. */
    private $urls = [];

    /** @var int Maximum number of entries accepted by addUrl(). */
    private $maxUrls = 50000;

    /** @var int Maximum size in bytes of the rendered document. */
    private $maxSize = 52428800; // 50MB

    /**
     * @param string $domain Site origin, e.g. "https://example.com"; trailing slashes are removed.
     */
    public function __construct($domain) {
        $this->domain = rtrim($domain, '/');
    }

    /**
     * Registers one URL.
     *
     * @param string                        $path       Path relative to the domain; leading slashes are ignored.
     * @param string|DateTimeInterface|null $lastmod    Last modification date in any format strtotime()
     *                                                  understands; an empty value means "now".
     * @param string|null                   $changefreq Change frequency hint, emitted as given (XML-escaped).
     * @param float|null                    $priority   Relative priority; clamped to 0.0-1.0 when rendered.
     *
     * @throws InvalidArgumentException When $lastmod cannot be parsed as a date.
     * @throws Exception                When the URL limit has been reached.
     */
    public function addUrl($path, $lastmod = null, $changefreq = null, $priority = null) {
        if (count($this->urls) >= $this->maxUrls) {
            throw new Exception('Sitemap URL limit reached');
        }

        $this->urls[] = [
            'loc' => $this->domain . '/' . ltrim($path, '/'),
            'lastmod' => $this->normalizeLastmod($lastmod),
            'changefreq' => $changefreq,
            'priority' => $priority
        ];
    }

    /**
     * Converts a date value to the ISO 8601 form written into <lastmod>.
     *
     * Database columns typically hold "Y-m-d H:i:s" strings, which are not
     * valid <lastmod> values, so every input is re-formatted with date('c').
     *
     * @param string|DateTimeInterface|null $lastmod
     * @return string
     *
     * @throws InvalidArgumentException When the value cannot be parsed.
     */
    private function normalizeLastmod($lastmod) {
        if (!$lastmod) {
            // Empty values fall back to the current time.
            return date('c');
        }

        if ($lastmod instanceof DateTimeInterface) {
            return $lastmod->format('c');
        }

        $timestamp = strtotime((string) $lastmod);
        if ($timestamp === false) {
            throw new InvalidArgumentException('Invalid lastmod value: ' . $lastmod);
        }

        return date('c', $timestamp);
    }

    /**
     * Fills the sitemap with the static pages, blog posts, products and categories.
     *
     * Each row returned by the data helpers is read through its 'slug' and
     * 'updated_at' keys; posts are additionally passed to calculatePriority().
     */
    public function generateFromDatabase() {
        // Key static pages
        $this->addUrl('/', date('c'), 'daily', 1.0);
        $this->addUrl('/about', date('c'), 'monthly', 0.8);
        $this->addUrl('/contact', date('c'), 'yearly', 0.5);

        // Blog posts: frequency and priority are derived from the post itself
        $posts = $this->getPosts();
        foreach ($posts as $post) {
            $this->addUrl(
                '/blog/' . $post['slug'],
                $post['updated_at'],
                $this->calculateChangefreq($post['updated_at']),
                $this->calculatePriority($post)
            );
        }

        // Products
        $products = $this->getProducts();
        foreach ($products as $product) {
            $this->addUrl(
                '/products/' . $product['slug'],
                $product['updated_at'],
                'weekly',
                0.9
            );
        }

        // Categories
        $categories = $this->getCategories();
        foreach ($categories as $category) {
            $this->addUrl(
                '/category/' . $category['slug'],
                $category['updated_at'],
                'weekly',
                0.7
            );
        }
    }

    /**
     * Maps the age of the last update to a change frequency keyword.
     *
     * @param string $lastmod Date string understood by strtotime().
     * @return string One of hourly, daily, weekly, monthly, yearly.
     */
    private function calculateChangefreq($lastmod) {
        $timestamp = strtotime((string) $lastmod);
        if ($timestamp === false) {
            // An unparseable date is treated like very old content.
            return 'yearly';
        }

        $daysSinceUpdate = (time() - $timestamp) / 86400;

        if ($daysSinceUpdate < 1) return 'hourly';
        if ($daysSinceUpdate < 7) return 'daily';
        if ($daysSinceUpdate < 30) return 'weekly';
        if ($daysSinceUpdate < 365) return 'monthly';
        return 'yearly';
    }

    /**
     * Scores an item between 0.5 and 1.0 from its popularity and freshness.
     *
     * Missing keys count as "no bonus" instead of raising warnings.
     *
     * @param array $item Row that may contain views, comments, created_at, is_featured.
     * @return float
     */
    private function calculatePriority($item) {
        $priority = 0.5; // Base score

        if (isset($item['views']) && $item['views'] > 10000) $priority += 0.2;
        if (isset($item['comments']) && $item['comments'] > 50) $priority += 0.1;
        if (isset($item['created_at']) && strtotime($item['created_at']) > strtotime('-30 days')) $priority += 0.1;
        if (!empty($item['is_featured'])) $priority += 0.1;

        return min($priority, 1.0);
    }

    /**
     * Escapes a value for use as XML element text.
     *
     * @param string $value
     * @return string
     */
    private function escapeXml($value) {
        return htmlspecialchars((string) $value, ENT_XML1 | ENT_QUOTES, 'UTF-8');
    }

    /**
     * Serialises the collected URLs as a <urlset> document.
     *
     * @return string XML without a trailing newline.
     */
    public function render() {
        $xml = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
        $xml .= '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";

        foreach ($this->urls as $url) {
            $xml .= "\t<url>\n";
            $xml .= "\t\t<loc>" . $this->escapeXml($url['loc']) . "</loc>\n";

            if ($url['lastmod']) {
                $xml .= "\t\t<lastmod>" . $this->escapeXml($url['lastmod']) . "</lastmod>\n";
            }

            if ($url['changefreq']) {
                $xml .= "\t\t<changefreq>" . $this->escapeXml($url['changefreq']) . "</changefreq>\n";
            }

            if ($url['priority'] !== null) {
                // Keep the value inside the 0.0-1.0 range allowed for <priority>.
                $priority = max(0.0, min(1.0, (float) $url['priority']));
                $xml .= "\t\t<priority>" . number_format($priority, 1, '.', '') . "</priority>\n";
            }

            $xml .= "\t</url>\n";
        }

        $xml .= "</urlset>";

        return $xml;
    }

    /**
     * Writes the sitemap to $filename and a gzip copy to "$filename.gz".
     *
     * @param string $filename Target path of the uncompressed file.
     * @return bool Always true; failures are reported through exceptions.
     *
     * @throws Exception        When the rendered document exceeds the size limit.
     * @throws RuntimeException When either file cannot be written.
     */
    public function save($filename = 'sitemap.xml') {
        $xml = $this->render();

        // The size limit is checked on the uncompressed byte length.
        if (strlen($xml) > $this->maxSize) {
            throw new Exception('Sitemap size limit exceeded');
        }

        if (file_put_contents($filename, $xml, LOCK_EX) === false) {
            throw new RuntimeException('Unable to write sitemap file: ' . $filename);
        }

        // Gzip-compressed copy at maximum compression level
        $compressed = gzencode($xml, 9);
        if ($compressed === false
            || file_put_contents($filename . '.gz', $compressed, LOCK_EX) === false) {
            throw new RuntimeException('Unable to write compressed sitemap file: ' . $filename . '.gz');
        }

        return true;
    }
}

// Example: build the sitemap for the site and write sitemap.xml (+ .gz copy)
$generator = new XMLSitemapGenerator('https://example.com');
$generator->generateFromDatabase();
$generator->save('sitemap.xml');
?>

Node.js avec le package sitemap

// Génération sitemap avec Node.js
const { SitemapStream, streamToPromise } = require('sitemap');
const { createWriteStream } = require('fs');
const { pipeline } = require('stream');

/**
 * Builds a sitemap through the `sitemap` package's SitemapStream and writes it
 * to disk together with a gzip copy.
 *
 * The stream is created once in the constructor, so an instance can produce a
 * single sitemap; create a new instance for each generation.
 *
 * NOTE(review): fetchDynamicContent() is not defined in this class as shown;
 * it is expected to resolve to an object with `posts`, `products` and `videos`
 * arrays.
 */
class SitemapGenerator {
    /**
     * @param {string} hostname Site origin used to resolve relative URLs.
     */
    constructor(hostname) {
        this.hostname = hostname;
        this.stream = new SitemapStream({ hostname });
    }

    /**
     * Writes every static and dynamic entry to the stream, then ends it.
     *
     * @returns {Promise<void>}
     */
    async generateDynamic() {
        // Static URLs
        const staticUrls = [
            { url: '/', changefreq: 'daily', priority: 1.0 },
            { url: '/about', changefreq: 'monthly', priority: 0.8 },
            { url: '/services', changefreq: 'monthly', priority: 0.9 },
            { url: '/contact', changefreq: 'yearly', priority: 0.5 }
        ];

        staticUrls.forEach(url => this.stream.write(url));

        // Fetch dynamic content; a missing collection is treated as empty
        const dynamicContent = (await this.fetchDynamicContent()) || {};

        // Blog posts
        (dynamicContent.posts || []).forEach(post => {
            this.stream.write({
                url: `/blog/${post.slug}`,
                lastmod: post.updatedAt,
                changefreq: this.calculateChangefreq(post.updatedAt),
                priority: this.calculatePriority(post),
                img: post.featuredImage ? [{
                    url: post.featuredImage,
                    title: post.title,
                    caption: post.excerpt
                }] : undefined
            });
        });

        // Products with their images
        (dynamicContent.products || []).forEach(product => {
            // A product without an `images` array simply gets no image entries
            const images = (product.images || []).map(img => ({
                url: img.url,
                title: img.alt || product.name,
                caption: img.caption
            }));

            this.stream.write({
                url: `/products/${product.slug}`,
                lastmod: product.updatedAt,
                changefreq: 'weekly',
                priority: 0.9,
                img: images
            });
        });

        // Pages with videos
        (dynamicContent.videos || []).forEach(video => {
            this.stream.write({
                url: `/videos/${video.slug}`,
                video: [{
                    title: video.title,
                    thumbnail_loc: video.thumbnail,
                    description: video.description,
                    content_loc: video.url,
                    duration: video.duration,
                    publication_date: video.publishedAt
                }]
            });
        });

        this.stream.end();
    }

    /**
     * Maps the age of the last modification to a change frequency keyword.
     *
     * @param {string|number|Date} lastModified Anything accepted by `new Date()`.
     * @returns {string} One of hourly, daily, weekly, monthly, yearly.
     */
    calculateChangefreq(lastModified) {
        const daysSince = (Date.now() - new Date(lastModified)) / (1000 * 60 * 60 * 24);

        if (daysSince < 1) return 'hourly';
        if (daysSince < 7) return 'daily';
        if (daysSince < 30) return 'weekly';
        if (daysSince < 365) return 'monthly';
        return 'yearly';
    }

    /**
     * Scores an item between 0.1 and 1.0 from popularity, featuring and age.
     *
     * @param {object} item Reads pageviews, shares, isFeatured and createdAt.
     * @returns {number}
     */
    calculatePriority(item) {
        let priority = 0.5;

        // Boost popular content
        if (item.pageviews > 1000) priority += 0.1;
        if (item.shares > 100) priority += 0.1;
        if (item.isFeatured) priority += 0.2;

        // Lower the score of content older than a year
        const ageInDays = (Date.now() - new Date(item.createdAt)) / (1000 * 60 * 60 * 24);
        if (ageInDays > 365) priority -= 0.1;

        return Math.max(0.1, Math.min(1.0, priority));
    }

    /**
     * Generates the sitemap, writes it to `filepath` and a gzip copy to
     * `${filepath}.gz`.
     *
     * @param {string} filepath Target path of the uncompressed file.
     * @returns {Promise<string>} The sitemap XML.
     */
    async save(filepath) {
        const { promises: fsPromises } = require('fs');
        const { promisify } = require('util');
        const gzip = promisify(require('zlib').gzip);

        // Start collecting before anything is written so the stream is drained
        // while entries are produced, and so a failure on either side rejects.
        const [sitemap] = await Promise.all([
            streamToPromise(this.stream),
            this.generateDynamic()
        ]);

        // `sitemap` is a Buffer: write it in one call and wait for completion
        await fsPromises.writeFile(filepath, sitemap);

        // Compressed copy; a compression failure is logged, not rethrown
        try {
            await fsPromises.writeFile(`${filepath}.gz`, await gzip(sitemap));
        } catch (err) {
            console.error('Compression failed:', err);
        }

        return sitemap.toString();
    }
}

Types de sitemaps spécialisés

News sitemap

<?xml version="1.0" encoding="UTF-8"?>
<!-- News sitemap: the news: prefix binds the Google News extension schema.
     Each <url> carries one <news:news> element describing the article
     (publication name and language, publication date, title, keywords). -->
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
    <url>
        <loc>https://www.example.com/actualites/breaking-news</loc>
        <news:news>
            <news:publication>
                <news:name>Example News</news:name>
                <news:language>fr</news:language>
            </news:publication>
            <news:publication_date>2024-12-01T15:30:00+01:00</news:publication_date>
            <news:title>Titre de l'actualité importante</news:title>
            <news:keywords>mot-clé1, mot-clé2, mot-clé3</news:keywords>
        </news:news>
    </url>
</urlset>

Mobile sitemap

<?xml version="1.0" encoding="UTF-8"?>
<!-- Mobile sitemap: the mobile: prefix binds the Google mobile extension
     schema; the empty <mobile:mobile/> element flags the URL as mobile content.
     NOTE(review): this extension was designed for feature-phone pages; confirm
     it is still relevant before using it for a responsive site. -->
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:mobile="http://www.google.com/schemas/sitemap-mobile/1.0">
    <url>
        <loc>https://m.example.com/page</loc>
        <mobile:mobile/>
    </url>
</urlset>

Optimisation et bonnes pratiques

Validation et tests

# Validateur sitemap XML
import xml.etree.ElementTree as ET
from urllib.parse import urlparse
import requests
from datetime import datetime

class SitemapValidator:
    """Download an XML sitemap and check it against the sitemap protocol.

    Problems that make the sitemap invalid are collected in ``errors``;
    softer issues (bad optional values, broken or redirecting URLs) go to
    ``warnings``. Counters are kept in ``stats``.

    Args:
        sitemap_url: Address of the sitemap or sitemap index to validate.
        check_url_accessibility: When true, every listed URL is requested to
            detect broken links and redirects. Off by default because it
            issues one HTTP request per entry.
        timeout: Timeout in seconds applied to every HTTP request.
    """

    MAX_ENTRIES = 50000
    MAX_BYTES = 52428800  # 50MB
    VALID_CHANGEFREQS = ('always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never')

    def __init__(self, sitemap_url, check_url_accessibility=False, timeout=10):
        self.sitemap_url = sitemap_url
        self.check_url_accessibility = check_url_accessibility
        self.timeout = timeout
        # Size in bytes of the downloaded document; None until validate() ran.
        self.content_size = None
        self.errors = []
        self.warnings = []
        self.stats = {
            'total_urls': 0,
            'valid_urls': 0,
            'broken_urls': 0,
            'redirect_urls': 0
        }

    def validate(self):
        """Fetch, parse and validate the sitemap.

        Returns:
            True when no error was recorded, False otherwise.
        """
        # Download the sitemap
        try:
            response = requests.get(self.sitemap_url, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            self.errors.append(f"Cannot fetch sitemap: {e}")
            return False

        # Remember the size so the structure checks can enforce the byte limit
        self.content_size = len(response.content)

        # Parse the XML
        try:
            root = ET.fromstring(response.content)
        except ET.ParseError as e:
            self.errors.append(f"Invalid XML: {e}")
            return False

        namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

        # Dispatch on the root element: sitemap index or plain URL set
        if root.tag == '{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex':
            return self.validate_sitemap_index(root, namespace)
        else:
            return self.validate_urlset(root, namespace)

    def _check_size_limit(self):
        """Record an error when the downloaded document exceeds the byte limit."""
        if self.content_size is not None and self.content_size > self.MAX_BYTES:
            self.errors.append(f"File too large: {self.content_size} bytes (max 50MB)")

    def validate_urlset(self, root, namespace):
        """Validate a standard ``<urlset>`` document.

        Args:
            root: Parsed root element.
            namespace: Prefix map binding ``ns`` to the sitemap namespace.

        Returns:
            True when no error was recorded, False otherwise.
        """
        urls = root.findall('ns:url', namespace)
        self.stats['total_urls'] = len(urls)

        # Protocol limits
        if len(urls) > self.MAX_ENTRIES:
            self.errors.append(f"Too many URLs: {len(urls)} (max 50,000)")

        self._check_size_limit()

        # Validate every entry
        for url_elem in urls:
            self.validate_url_entry(url_elem, namespace)

        return len(self.errors) == 0

    def validate_sitemap_index(self, root, namespace):
        """Validate the structure of a ``<sitemapindex>`` document.

        Only the index itself is checked; the child sitemaps it lists are
        not downloaded.

        Returns:
            True when no error was recorded, False otherwise.
        """
        entries = root.findall('ns:sitemap', namespace)

        if len(entries) > self.MAX_ENTRIES:
            self.errors.append(f"Too many sitemaps: {len(entries)} (max 50,000)")

        self._check_size_limit()

        for entry in entries:
            loc_elem = entry.find('ns:loc', namespace)
            if loc_elem is None or not loc_elem.text:
                self.errors.append("Missing required <loc> element")
                continue

            url = loc_elem.text.strip()
            if not self.is_valid_url(url):
                self.errors.append(f"Invalid URL format: {url}")
                continue

            lastmod_elem = entry.find('ns:lastmod', namespace)
            if lastmod_elem is not None and lastmod_elem.text:
                if not self.is_valid_datetime(lastmod_elem.text):
                    self.warnings.append(f"Invalid lastmod format for {url}: {lastmod_elem.text}")

        return len(self.errors) == 0

    def validate_url_entry(self, url_elem, namespace):
        """Validate one ``<url>`` entry and record errors and warnings."""
        # <loc> is mandatory
        loc_elem = url_elem.find('ns:loc', namespace)
        if loc_elem is None or not loc_elem.text:
            self.errors.append("Missing required <loc> element")
            return

        url = loc_elem.text.strip()

        # Validate the URL format
        if not self.is_valid_url(url):
            self.errors.append(f"Invalid URL format: {url}")
            return

        # Optional reachability check
        if self.check_url_accessibility:
            status = self.check_url_status(url)
            if status is None:
                self.stats['broken_urls'] += 1
                self.warnings.append(f"Broken URL (unreachable): {url}")
            elif status >= 400:
                self.stats['broken_urls'] += 1
                self.warnings.append(f"Broken URL (HTTP {status}): {url}")
            elif 300 <= status < 400:
                self.stats['redirect_urls'] += 1
                self.warnings.append(f"Redirect (HTTP {status}): {url}")
            else:
                self.stats['valid_urls'] += 1

        # Validate lastmod
        lastmod_elem = url_elem.find('ns:lastmod', namespace)
        if lastmod_elem is not None and lastmod_elem.text:
            if not self.is_valid_datetime(lastmod_elem.text):
                self.warnings.append(f"Invalid lastmod format for {url}: {lastmod_elem.text}")

        # Validate changefreq
        changefreq_elem = url_elem.find('ns:changefreq', namespace)
        if changefreq_elem is not None and changefreq_elem.text:
            if changefreq_elem.text.strip() not in self.VALID_CHANGEFREQS:
                self.warnings.append(f"Invalid changefreq for {url}: {changefreq_elem.text}")

        # Validate priority
        priority_elem = url_elem.find('ns:priority', namespace)
        if priority_elem is not None and priority_elem.text:
            try:
                priority = float(priority_elem.text)
                if not 0.0 <= priority <= 1.0:
                    self.warnings.append(f"Priority out of range for {url}: {priority}")
            except ValueError:
                self.warnings.append(f"Invalid priority format for {url}: {priority_elem.text}")

    def is_valid_url(self, url):
        """Return True for an absolute http(s) URL shorter than 2,048 characters."""
        if not url or len(url) >= 2048:
            return False
        try:
            parsed = urlparse(url)
        except ValueError:
            return False
        return parsed.scheme in ('http', 'https') and bool(parsed.netloc)

    def is_valid_datetime(self, value):
        """Return True when ``value`` is a date or a timezone-qualified timestamp.

        Accepts ``YYYY``, ``YYYY-MM``, ``YYYY-MM-DD`` and ISO 8601 timestamps
        that carry a UTC offset or a trailing ``Z``.
        """
        text = value.strip()

        # Date-only forms
        for fmt in ('%Y', '%Y-%m', '%Y-%m-%d'):
            try:
                datetime.strptime(text, fmt)
                return True
            except ValueError:
                pass

        # fromisoformat() rejects the "Z" suffix on older Python versions
        if text.endswith('Z'):
            text = text[:-1] + '+00:00'

        try:
            parsed = datetime.fromisoformat(text)
        except ValueError:
            return False

        # A timestamp with a time part must state its timezone
        return parsed.tzinfo is not None

    def check_url_status(self, url):
        """Return the HTTP status of ``url`` without following redirects.

        Returns:
            The status code, or None when the request itself failed.
        """
        try:
            response = requests.head(url, allow_redirects=False, timeout=self.timeout)
        except requests.RequestException:
            return None
        return response.status_code

    def generate_recommendations(self):
        """Return a list of short follow-up suggestions derived from the findings.

        This is a minimal heuristic based on the collected counters.
        """
        recommendations = []

        if self.stats['broken_urls']:
            recommendations.append("Remove or fix the broken URLs listed in the warnings")
        if self.stats['redirect_urls']:
            recommendations.append("Replace redirecting URLs with their final destination")
        if self.stats['total_urls'] > self.MAX_ENTRIES:
            recommendations.append("Split the sitemap into several files referenced by a sitemap index")
        if self.errors:
            recommendations.append("Fix the reported errors before submitting the sitemap")

        return recommendations

    def generate_report(self):
        """Return the validation outcome as a dictionary."""
        return {
            'valid': len(self.errors) == 0,
            'errors': self.errors,
            'warnings': self.warnings,
            'statistics': self.stats,
            'recommendations': self.generate_recommendations()
        }

Soumission et monitoring

// Gestion soumission sitemaps
/**
 * Submits a site's sitemap to search engines, reads indexation figures and
 * exposes handlers that keep the sitemap in sync with content changes.
 *
 * NOTE(review): getSearchConsoleData(), addUrlToSitemap(),
 * updateUrlInSitemap() and removeUrlFromSitemap() are not defined in this
 * class as shown and must be provided elsewhere.
 */
class SitemapManager {
    /**
     * @param {string} domain Site origin, with or without a trailing slash.
     */
    constructor(domain) {
        this.domain = domain;
        // Strip trailing slashes so the sitemap URL never contains "//sitemap.xml"
        this.sitemapUrl = `${String(domain).replace(/\/+$/, '')}/sitemap.xml`;
    }

    /**
     * Requests each search engine's sitemap ping URL.
     *
     * NOTE(review): Google retired its sitemap ping endpoint in 2023 and Bing
     * stopped accepting anonymous sitemap pings in 2022, so these requests are
     * expected to fail. Prefer a robots.txt "Sitemap:" line, Search Console /
     * Bing Webmaster Tools, or IndexNow.
     *
     * @returns {Promise<Array<{engine: string, success: boolean, status?: number, error?: string}>>}
     *          One result per engine; network failures are reported, not thrown.
     */
    async submitToSearchEngines() {
        const target = encodeURIComponent(this.sitemapUrl);

        const submissions = [
            {
                engine: 'Google',
                url: `https://www.google.com/ping?sitemap=${target}`,
                method: 'GET'
            },
            {
                engine: 'Bing',
                url: `https://www.bing.com/ping?sitemap=${target}`,
                method: 'GET'
            }
        ];

        // Submit to every engine in parallel
        const results = await Promise.all(
            submissions.map(async (submission) => {
                try {
                    const response = await fetch(submission.url, {
                        method: submission.method
                    });

                    return {
                        engine: submission.engine,
                        success: response.ok,
                        status: response.status
                    };
                } catch (error) {
                    return {
                        engine: submission.engine,
                        success: false,
                        error: error.message
                    };
                }
            })
        );

        return results;
    }

    /**
     * Summarises the indexation state of the first sitemap reported by
     * Search Console.
     *
     * @returns {Promise<object>} submitted_urls, indexed_urls, coverage_ratio,
     *          errors and last_downloaded.
     * @throws {Error} When no sitemap entry is returned.
     */
    async monitorIndexation() {
        const searchConsoleData = await this.getSearchConsoleData();

        const sitemaps = (searchConsoleData && searchConsoleData.sitemaps) || [];
        const primary = sitemaps[0];
        if (!primary) {
            throw new Error('No sitemap data returned by Search Console');
        }

        return {
            submitted_urls: primary.submitted,
            indexed_urls: primary.indexed,
            // Avoid NaN/Infinity when nothing has been submitted yet
            coverage_ratio: primary.submitted > 0 ? primary.indexed / primary.submitted : 0,
            errors: primary.errors,
            last_downloaded: primary.lastDownloaded
        };
    }

    /**
     * Builds the handlers to attach to content webhooks.
     *
     * @returns {Object<string, function(object): Promise<void>>} Handlers keyed
     *          by event name; each receives the content object and reads its `url`.
     */
    setupAutomaticUpdates() {
        const updateTriggers = {
            'new_content': async (content) => {
                await this.addUrlToSitemap(content.url);
                // Notify search engines through the class's own submit method
                await this.submitToSearchEngines();
            },

            'content_updated': async (content) => {
                await this.updateUrlInSitemap(content.url, {
                    lastmod: new Date().toISOString()
                });
            },

            'content_deleted': async (content) => {
                await this.removeUrlFromSitemap(content.url);
            }
        };

        return updateTriggers;
    }
}

Le sitemap XML reste un élément fondamental du SEO technique : il facilite la découverte et l’indexation efficace du contenu par les moteurs de recherche, ce qui est particulièrement crucial pour les grands sites et les contenus récents.