Definition
Log analysis consists of examining the web server's log files to extract valuable information about how search engines crawl a site. This technique reveals which pages are actually crawled by Googlebot, how often the bot comes back, which errors it encounters, and makes it possible to optimize crawl budget by identifying resources wasted on pages with no SEO value.
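As a minimal sketch of the idea, the snippet below counts hits per known crawler in a raw access log; the file path and the list of bot substrings are illustrative assumptions, not a definitive implementation.
# Minimal sketch: count hits per crawler in a raw access log
# (the log path and bot substrings are illustrative assumptions)
from collections import Counter

BOT_SUBSTRINGS = ['Googlebot', 'bingbot', 'YandexBot', 'Baiduspider']

def count_bot_hits(log_path):
    hits = Counter()
    with open(log_path, 'r', errors='replace') as f:
        for line in f:
            for bot in BOT_SUBSTRINGS:
                if bot.lower() in line.lower():
                    hits[bot] += 1
                    break
    return hits

# Example: print(count_bot_hits('/var/log/apache2/access.log'))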
Types of data in logs
Standard log format
# Apache Combined Log format
66.249.64.17 - - [15/Dec/2024:10:23:45 +0100] "GET /products/laptop-dell HTTP/1.1" 200 15234 "https://www.google.com/" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
# Breakdown:
# IP: 66.249.64.17 (Googlebot)
# Date/time: 15/Dec/2024:10:23:45 +0100
# Request: GET /products/laptop-dell HTTP/1.1
# Response code: 200
# Size: 15234 bytes
# Referer: https://www.google.com/
# User-Agent: Googlebot/2.1
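A user-agent claiming to be Googlebot can be spoofed, so the IP should be verified; Google's documented method is a reverse DNS lookup followed by a forward lookup that must return the same IP. A minimal sketch of that check (the sample IP is the one from the log line above):
# Sketch: verify that an IP really belongs to Googlebot via reverse + forward DNS
import socket

def is_verified_googlebot(ip):
    try:
        # Reverse lookup, e.g. crawl-66-249-64-17.googlebot.com
        host = socket.gethostbyaddr(ip)[0]
        if not host.endswith(('.googlebot.com', '.google.com')):
            return False
        # Forward lookup must resolve back to the same IP
        return ip in socket.gethostbyname_ex(host)[2]
    except (socket.herror, socket.gaierror):
        return False

# Example: is_verified_googlebot('66.249.64.17')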
Extracting insights
# SEO log analyzer
import re
from datetime import datetime
from collections import defaultdict

class SEOLogAnalyzer:
    def __init__(self, log_file):
        self.log_file = log_file
        self.bot_patterns = {
            'googlebot': r'Googlebot',
            'bingbot': r'bingbot',
            'yandex': r'YandexBot',
            'baidu': r'Baiduspider'
        }

    def parse_log_line(self, line):
        """Parse one Apache combined log line."""
        pattern = r'(\S+) - - \[(.*?)\] "(.*?)" (\d+) (\d+) "(.*?)" "(.*?)"'
        match = re.match(pattern, line)
        if match:
            return {
                'ip': match.group(1),
                'timestamp': datetime.strptime(match.group(2), '%d/%b/%Y:%H:%M:%S %z'),
                'request': match.group(3),
                'status_code': int(match.group(4)),
                'bytes': int(match.group(5)),
                'referer': match.group(6),
                'user_agent': match.group(7)
            }
        return None

    def analyze_bot_behavior(self):
        """Aggregate crawl statistics per bot."""
        bot_stats = defaultdict(lambda: {
            'total_hits': 0,
            'unique_urls': set(),
            'status_codes': defaultdict(int),
            'crawl_frequency': defaultdict(int),
            'bandwidth_used': 0
        })
        with open(self.log_file, 'r') as f:
            for line in f:
                entry = self.parse_log_line(line)
                if not entry:
                    continue
                # Identify the bot from its user-agent
                bot_name = self.identify_bot(entry['user_agent'])
                if bot_name:
                    stats = bot_stats[bot_name]
                    stats['total_hits'] += 1
                    stats['unique_urls'].add(entry['request'].split()[1])
                    stats['status_codes'][entry['status_code']] += 1
                    stats['bandwidth_used'] += entry['bytes']
                    # Hourly crawl frequency
                    hour = entry['timestamp'].hour
                    stats['crawl_frequency'][hour] += 1
        # generate_insights is assumed to be implemented elsewhere in the class
        return self.generate_insights(bot_stats)

    def identify_bot(self, user_agent):
        """Identify the bot from its user-agent string."""
        for bot_name, pattern in self.bot_patterns.items():
            if re.search(pattern, user_agent, re.IGNORECASE):
                return bot_name
        return None
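A minimal usage sketch, assuming the log lives at the illustrative path below and that a generate_insights method has been implemented:
# Usage sketch (the path is an illustrative assumption)
analyzer = SEOLogAnalyzer('/var/log/apache2/access.log')
insights = analyzer.analyze_bot_behavior()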
Key metrics to analyze
Crawl budget
// Crawl budget usage analysis
class CrawlBudgetAnalyzer {
  constructor(logData) {
    this.logData = logData;
    this.googlebotIPs = [
      '66.249.64.0/19',
      '66.249.80.0/20',
      '64.233.160.0/19'
    ];
  }

  analyzeCrawlBudget() {
    const analysis = {
      totalCrawls: 0,
      uniquePagesCrawled: new Set(),
      wastedCrawls: {
        redirects: 0,
        errors: 0,
        duplicates: 0,
        lowValuePages: 0
      },
      crawlDistribution: {},
      recommendations: []
    };

    // Analyze each Googlebot entry
    this.logData.forEach(entry => {
      if (this.isGooglebot(entry)) {
        analysis.totalCrawls++;
        analysis.uniquePagesCrawled.add(entry.url);

        // Flag wasted crawls
        if ([301, 302, 307].includes(entry.statusCode)) {
          analysis.wastedCrawls.redirects++;
        } else if (entry.statusCode >= 400) {
          analysis.wastedCrawls.errors++;
        } else if (this.isLowValuePage(entry.url)) {
          analysis.wastedCrawls.lowValuePages++;
        }

        // Distribution by page type
        const pageType = this.categorizePageType(entry.url);
        analysis.crawlDistribution[pageType] =
          (analysis.crawlDistribution[pageType] || 0) + 1;
      }
    });

    // Compute crawl efficiency
    const wastedTotal = Object.values(analysis.wastedCrawls)
      .reduce((sum, val) => sum + val, 0);
    analysis.efficiency = analysis.totalCrawls > 0
      ? ((analysis.totalCrawls - wastedTotal) / analysis.totalCrawls * 100).toFixed(2)
      : '0.00';

    // Generate recommendations
    if (analysis.wastedCrawls.redirects > analysis.totalCrawls * 0.1) {
      analysis.recommendations.push({
        issue: 'Too many redirects',
        impact: 'High',
        solution: 'Update internal links to point to final URLs'
      });
    }
    return analysis;
  }

  isGooglebot(entry) {
    // Simple user-agent check; in production also verify the IP (reverse DNS)
    return /Googlebot/i.test(entry.userAgent || '');
  }

  categorizePageType(url) {
    // Rough categorization by URL pattern; adapt to the site's own structure
    if (url.startsWith('/products/')) return 'product';
    if (url.startsWith('/blog/')) return 'blog';
    if (url.includes('?')) return 'parameterized';
    return 'other';
  }

  isLowValuePage(url) {
    const lowValuePatterns = [
      /\?sort=/,
      /\?filter=/,
      /\?sessionid=/,
      /\/print\//,
      /\/amp\//,
      /\?replytocom=/
    ];
    return lowValuePatterns.some(pattern => pattern.test(url));
  }
}
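The same crawl-waste breakdown can be reproduced server-side; a hedged Python sketch over entries produced by the parser shown earlier (the low-value URL patterns are the same illustrative ones as above):
# Sketch: crawl-waste breakdown over entries from SEOLogAnalyzer.parse_log_line
import re

LOW_VALUE_PATTERNS = [r'\?sort=', r'\?filter=', r'\?sessionid=', r'/print/', r'/amp/', r'\?replytocom=']

def crawl_waste(entries):
    """Break wasted crawls down by cause."""
    waste = {'total': 0, 'redirects': 0, 'errors': 0, 'low_value': 0}
    for e in entries:
        waste['total'] += 1
        parts = e['request'].split()
        url = parts[1] if len(parts) > 1 else e['request']
        if e['status_code'] in (301, 302, 307):
            waste['redirects'] += 1
        elif e['status_code'] >= 400:
            waste['errors'] += 1
        elif any(re.search(p, url) for p in LOW_VALUE_PATTERNS):
            waste['low_value'] += 1
    return waste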
Advanced insights
Crawl patterns
# Detecting advanced crawl patterns
def analyze_crawl_patterns(log_entries):
    """
    Identify crawl patterns that can guide optimization.
    """
    patterns = {
        'crawl_depth': defaultdict(int),
        'time_patterns': defaultdict(list),
        'section_priority': defaultdict(int),
        'crawl_velocity': {},
        'orphan_pages': set()
    }

    # Crawl depth analysis
    for entry in log_entries:
        if entry['bot'] == 'googlebot':
            depth = entry['url'].count('/')
            patterns['crawl_depth'][depth] += 1

            # Hourly pattern
            hour = entry['timestamp'].hour
            patterns['time_patterns'][hour].append(entry['url'])

            # Priority by site section (extract_section is assumed to exist)
            section = extract_section(entry['url'])
            patterns['section_priority'][section] += 1

    # Crawl velocity: time between first and last crawl of each URL
    first_crawl = {}
    last_crawl = {}
    for entry in sorted(log_entries, key=lambda x: x['timestamp']):
        url = entry['url']
        if url not in first_crawl:
            first_crawl[url] = entry['timestamp']
        last_crawl[url] = entry['timestamp']

    for url in first_crawl:
        if first_crawl[url] != last_crawl[url]:
            days_between = (last_crawl[url] - first_crawl[url]).days
            # calculate_crawl_frequency is assumed to exist
            frequency = calculate_crawl_frequency(url, log_entries)
            patterns['crawl_velocity'][url] = {
                'frequency': frequency,
                'avg_days_between': days_between / frequency if frequency else None
            }
    return patterns


def generate_crawl_insights(patterns):
    """
    Turn crawl patterns into actionable insights.
    """
    insights = []

    # Depth insight
    deep_pages = sum(count for depth, count in patterns['crawl_depth'].items() if depth > 4)
    total_pages = sum(patterns['crawl_depth'].values())
    if total_pages and deep_pages / total_pages > 0.3:
        insights.append({
            'type': 'architecture',
            'issue': 'Too many deep pages',
            'impact': f'{deep_pages} pages more than 4 clicks from the homepage',
            'recommendation': 'Improve site architecture and internal linking'
        })

    # Neglected sections insight
    if patterns['section_priority']:
        avg_crawls = sum(patterns['section_priority'].values()) / len(patterns['section_priority'])
        neglected_sections = [
            section for section, count in patterns['section_priority'].items()
            if count < avg_crawls * 0.5
        ]
        if neglected_sections:
            insights.append({
                'type': 'crawl_distribution',
                'issue': 'Under-crawled sections',
                'sections': neglected_sections,
                'recommendation': 'Improve internal links pointing to these sections'
            })
    return insights
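The two helpers referenced above are not shown in the original code; these are minimal hypothetical sketches, assuming the first path segment of a URL identifies its section:
# Hypothetical helpers assumed by analyze_crawl_patterns
def extract_section(url):
    # First path segment, e.g. '/products/laptop-dell' -> 'products'
    parts = [p for p in url.split('?')[0].split('/') if p]
    return parts[0] if parts else 'root'

def calculate_crawl_frequency(url, log_entries):
    # Number of Googlebot requests for this URL in the analyzed period
    return sum(1 for e in log_entries if e['url'] == url and e['bot'] == 'googlebot')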
Correlation with performance
// Correlating logs with SEO metrics
class LogSEOCorrelation {
  constructor(logData, seoMetrics) {
    this.logData = logData;
    this.seoMetrics = seoMetrics;
  }

  correlateCrawlWithRankings() {
    const correlation = {
      crawlFrequency: {},
      rankingChanges: {},
      insights: []
    };

    // For each tracked URL
    Object.keys(this.seoMetrics.rankings).forEach(url => {
      // Crawl frequency
      const crawlCount = this.logData.filter(
        entry => entry.url === url && entry.bot === 'googlebot'
      ).length;

      // Ranking change
      const rankingChange = this.seoMetrics.rankings[url].change;

      correlation.crawlFrequency[url] = crawlCount;
      correlation.rankingChanges[url] = rankingChange;
    });

    // Look for correlation
    const highCrawlGoodRanking = Object.keys(correlation.crawlFrequency)
      .filter(url =>
        correlation.crawlFrequency[url] > 10 &&
        correlation.rankingChanges[url] > 0
      );
    const lowCrawlBadRanking = Object.keys(correlation.crawlFrequency)
      .filter(url =>
        correlation.crawlFrequency[url] < 2 &&
        correlation.rankingChanges[url] < 0
      );

    // Generate insights
    if (highCrawlGoodRanking.length > 0) {
      correlation.insights.push({
        pattern: 'Positive correlation',
        message: 'Frequently crawled pages tend to rank better',
        examples: highCrawlGoodRanking.slice(0, 5)
      });
    }
    if (lowCrawlBadRanking.length > 0) {
      correlation.insights.push({
        pattern: 'Insufficient crawl',
        message: 'Rarely crawled pages are losing positions',
        action: 'Strengthen signals to these pages to increase crawl',
        examples: lowCrawlBadRanking.slice(0, 5)
      });
    }
    return correlation;
  }
}
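The class above only buckets URLs into extremes; to quantify the relationship, a Pearson correlation coefficient can be computed over the two series. A minimal sketch, assuming two dicts keyed by URL (crawl counts and ranking changes) like those built above, and bearing in mind that correlation does not imply causation:
# Sketch: Pearson correlation between crawl frequency and ranking change
from statistics import pstdev, mean

def crawl_ranking_correlation(crawl_frequency, ranking_changes):
    urls = [u for u in crawl_frequency if u in ranking_changes]
    xs = [crawl_frequency[u] for u in urls]
    ys = [ranking_changes[u] for u in urls]
    if len(urls) < 2 or pstdev(xs) == 0 or pstdev(ys) == 0:
        return None  # not enough variation to correlate
    mx, my = mean(xs), mean(ys)
    cov = sum((x - mx) * (y - my) for x, y in zip(xs, ys)) / len(xs)
    return cov / (pstdev(xs) * pstdev(ys))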
Tools and automation
Analysis pipeline
# Automated log analysis pipeline
import json
import subprocess
from datetime import datetime, timedelta

class LogAnalysisPipeline:
    def __init__(self, config):
        self.config = config
        self.output_dir = config['output_dir']

    def run_daily_analysis(self):
        """
        Automated daily analysis.
        """
        yesterday = datetime.now() - timedelta(days=1)
        log_file = f"/var/log/apache2/access.log.{yesterday.strftime('%Y%m%d')}"
        self.current_log = log_file

        # 1. Extract the day's logs
        self.extract_daily_logs(log_file)

        # 2. Parse and analyze
        analysis_results = {
            'date': yesterday.strftime('%Y-%m-%d'),
            'bot_activity': self.analyze_bot_activity(),
            'crawl_budget': self.analyze_crawl_budget(),
            'errors': self.analyze_errors(),
            'performance': self.analyze_performance()
        }

        # 3. Generate the report
        report = self.generate_report(analysis_results)

        # 4. Raise alerts if needed
        self.check_alerts(analysis_results)

        # 5. Persist the data
        self.save_results(analysis_results)
        return report

    def analyze_bot_activity(self):
        """
        Per-bot activity analysis.
        """
        bot_stats = {}

        # GoAccess for a quick first pass (the exact JSON field names depend
        # on the GoAccess version and configuration)
        cmd = f"goaccess {self.current_log} --log-format=COMBINED -o json"
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        data = json.loads(result.stdout)

        # Keep only bot user-agents
        for ua in data['user_agents']:
            if any(bot in ua['data'] for bot in ['bot', 'spider', 'crawler']):
                bot_stats[ua['data']] = {
                    'hits': ua['hits'],
                    'bandwidth': ua['bytes'],
                    'percentage': ua['percent']
                }
        return bot_stats

    def check_alerts(self, analysis):
        """
        Check alerting conditions.
        """
        alerts = []

        # Excessive error alert ('total_requests' is assumed to be added to the
        # analysis dict by one of the analyze_* helpers)
        total_requests = analysis.get('total_requests', 0)
        if total_requests:
            error_rate = analysis['errors']['4xx_count'] / total_requests
            if error_rate > 0.05:  # more than 5% of requests in error
                alerts.append({
                    'severity': 'HIGH',
                    'type': 'error_rate',
                    'message': f'High error rate: {error_rate*100:.1f}%',
                    'action': 'Check 404 pages and redirects'
                })

        # Crawl drop alert
        googlebot_hits = sum(
            stats['hits'] for ua, stats in analysis['bot_activity'].items()
            if 'googlebot' in ua.lower()
        )
        if googlebot_hits < self.config['min_daily_crawls']:
            alerts.append({
                'severity': 'MEDIUM',
                'type': 'low_crawl',
                'message': 'Unusually low Googlebot crawl volume',
                'action': 'Check robots.txt and server errors'
            })

        if alerts:
            self.send_alerts(alerts)
        return alerts
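A sketch of how the pipeline might be wired up for a daily run; the config values are illustrative assumptions, and the extract/report/save helpers must be implemented for it to run end to end:
# Usage sketch (values are illustrative assumptions)
if __name__ == '__main__':
    pipeline = LogAnalysisPipeline({
        'output_dir': '/var/reports/seo-logs',
        'min_daily_crawls': 500
    })
    daily_report = pipeline.run_daily_analysis()
    # Schedule e.g. via cron: 0 6 * * * /usr/bin/python3 /opt/seo/log_pipeline.py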
Data visualization
// Log visualization dashboard (returns Chart.js-style configurations)
class LogVisualizationDashboard {
  constructor(logData) {
    this.logData = logData;
    this.charts = {};
  }

  createCrawlTimeline() {
    // Hourly crawl timeline
    const hourlyData = Array(24).fill(0);
    this.logData.forEach(entry => {
      if (entry.bot === 'googlebot') {
        const hour = new Date(entry.timestamp).getHours();
        hourlyData[hour]++;
      }
    });
    return {
      type: 'line',
      data: {
        labels: Array.from({length: 24}, (_, i) => `${i}:00`),
        datasets: [{
          label: 'Googlebot Crawls',
          data: hourlyData,
          borderColor: 'rgb(75, 192, 192)',
          tension: 0.1
        }]
      }
    };
  }

  createStatusCodeDistribution() {
    const statusCodes = {};
    this.logData.forEach(entry => {
      const statusGroup = Math.floor(entry.statusCode / 100) + 'xx';
      statusCodes[statusGroup] = (statusCodes[statusGroup] || 0) + 1;
    });
    return {
      type: 'doughnut',
      data: {
        labels: Object.keys(statusCodes),
        datasets: [{
          data: Object.values(statusCodes),
          backgroundColor: [
            '#4CAF50', // 2xx
            '#2196F3', // 3xx
            '#FF9800', // 4xx
            '#F44336'  // 5xx
          ]
        }]
      }
    };
  }
}
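The dashboard classes only build chart configurations, so the underlying series have to be produced server-side; a hedged sketch of that aggregation, reusing the parser defined earlier and shaped like the hourlyData array consumed by createCrawlTimeline:
# Sketch: build the hourly Googlebot series consumed by the dashboard
def hourly_googlebot_hits(log_file):
    analyzer = SEOLogAnalyzer(log_file)
    hourly = [0] * 24
    with open(log_file, 'r') as f:
        for line in f:
            entry = analyzer.parse_log_line(line)
            if entry and analyzer.identify_bot(entry['user_agent']) == 'googlebot':
                hourly[entry['timestamp'].hour] += 1
    return hourly  # e.g. serve as JSON to the dashboard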
Log analysis remains a fundamental technique for understanding how search engines actually interact with a site, enabling optimizations based on factual data rather than guesswork.