# File: /opt/cloudlinux/venv/lib/python3.11/site-packages/cl_website_collector/docroot_processor.py
# -*- coding: utf-8 -*-
#
# Copyright © Cloud Linux GmbH & Cloud Linux Software, Inc 2010-2024 All Rights Reserved
#
# Licensed under CLOUD LINUX LICENSE AGREEMENT
# http://cloudlinux.com/docs/LICENSE.TXT
import logging
import os
import time
from pathlib import Path
from typing import Dict, List, Optional, Any
# Directories that never hold user-managed .htaccess files; pruned while
# walking a docroot so scanning stays fast.
SYSTEM_EXCLUDE_DIRS = [
    # Dependency trees
    'node_modules',
    'vendor',
    # IDE and service directories
    '.idea',
    '.vscode',
    '.well-known',
    # VCS metadata
    '.git',
    '.svn',
    '.hg',
]
class DocrootProcessor:
    """
    Processes an individual document root: locates .htaccess files and
    collects their paths and metadata (symlinks, counts, timing) without
    reading file contents.
    """

    def __init__(self, logger: logging.Logger):
        # Logger is injected so the caller controls handlers/formatting.
        self.logger = logger

    def collect_htaccess_paths(self, docroot: str, domains: list, username: str, timeout: int = 30) -> Optional[
        Dict[str, Any]]:
        """
        Collect .htaccess file paths from a docroot without reading file contents.

        Args:
            docroot: Document root path
            domains: Domain names served from this docroot
            username: Owner username
            timeout: Overall processing timeout in seconds

        Returns:
            Dictionary with collected file paths and metadata. On internal
            errors a partially filled dictionary is returned; the Optional
            return annotation is kept only for API compatibility.
        """
        start_time = time.time()
        result = {
            'docroot': docroot,
            'domains': domains,
            'username': username,
            'htaccess_file_paths': [],
            'symlinks': [],
            'timeout_reached': False,
            'processing_time_seconds': 0,
            'htaccess_files_found': 0,
        }
        try:
            self.logger.debug("Finding .htaccess files in %s", docroot)
            # Reserve ~5s of the budget for the per-file loop below, but never
            # pass a non-positive walk timeout: the original `timeout - 5`
            # aborted the walk immediately whenever timeout <= 5.
            walk_timeout = max(timeout - 5, 1)
            htaccess_files = self._find_htaccess_files(docroot, max_depth=4, timeout=walk_timeout)
            self.logger.debug("Found %d .htaccess files in %s", len(htaccess_files), docroot)
            for file_path in htaccess_files:
                self.logger.debug(" - %s", file_path)
            if not htaccess_files:
                self.logger.debug("No .htaccess files found in %s", docroot)
            else:
                # Process each found file path (no content reading)
                for file_path in htaccess_files:
                    if time.time() - start_time > timeout:
                        result['timeout_reached'] = True
                        self.logger.error("[WEBSITE-COLLECTOR] Timeout reached while collecting paths in %s", docroot)
                        break
                    try:
                        self.logger.debug("Collecting .htaccess path: %s", file_path)
                        # Record symlinks separately; resolve them so the
                        # readability check below applies to the real target.
                        p = Path(file_path)
                        is_symlink = p.is_symlink()
                        real_path = str(p.resolve(strict=False)) if is_symlink else file_path
                        if is_symlink:
                            result['symlinks'].append({
                                'link': self._normalize_path(file_path, docroot),
                                'target': real_path
                            })
                        # Keep only files that exist and are readable
                        if Path(real_path).exists() and os.access(real_path, os.R_OK):
                            # Store file path info for on-demand reading
                            location = self._normalize_path(file_path, docroot)
                            result['htaccess_file_paths'].append({
                                'location': location,
                                'file_path': file_path,
                                'real_path': real_path,
                                'is_symlink': is_symlink
                            })
                        else:
                            self.logger.debug("Cannot read file: %s", file_path)
                    except Exception as e:
                        self.logger.error("[WEBSITE-COLLECTOR] Error collecting path %s: %s", file_path, e)
            result['htaccess_files_found'] = len(result['htaccess_file_paths'])
            self.logger.debug("Collected %d .htaccess file paths from %s in %.2fs",
                              result['htaccess_files_found'], docroot, time.time() - start_time)
        except Exception as e:
            self.logger.error("[WEBSITE-COLLECTOR] Error processing docroot %s: %s", docroot, e)
        finally:
            # Always record elapsed time, even if an unexpected error
            # interrupted processing (the original left it at 0 in that case).
            result['processing_time_seconds'] = time.time() - start_time
        return result

    def _find_htaccess_files(self, docroot: str, max_depth: int = 4, timeout: int = 25) -> List[str]:
        """
        Walk *docroot* and return the paths of readable .htaccess files.

        Args:
            docroot: Directory to scan.
            max_depth: Depth at which the walk stops descending; files at that
                depth are still collected.
            timeout: Walk timeout in seconds; on expiry the partial list
                gathered so far is returned.

        Returns:
            List of .htaccess file paths (as strings).
        """
        start_time = time.time()
        htaccess_files: List[str] = []
        try:
            for root, dirs, files in os.walk(docroot):
                # Check timeout
                if time.time() - start_time > timeout:
                    self.logger.error("[WEBSITE-COLLECTOR] os.walk timeout for %s", docroot)
                    break
                # Calculate current depth robustly regardless of trailing separators.
                # NOTE(review): relpath("sub").count(sep) is 0, so direct children
                # share depth 0 with the root — the walk effectively descends
                # max_depth + 1 levels. Preserved as-is; confirm intended depth.
                if root == docroot:
                    depth = 0
                else:
                    depth = os.path.relpath(root, docroot).count(os.sep)
                if depth >= max_depth:
                    dirs[:] = []  # Don't go deeper, but still process files at this level
                # Apply exclusion filters for directories (prunes the walk in place)
                dirs[:] = [d for d in dirs if not self._should_exclude_directory(root, d)]
                # Look for .htaccess files
                if '.htaccess' in files:
                    file_path = Path(root) / '.htaccess'
                    # Consider empty .htaccess files as valid as well
                    if (file_path.is_file() and
                            os.access(str(file_path), os.R_OK)):
                        htaccess_files.append(str(file_path))
        except Exception as e:
            self.logger.error("[WEBSITE-COLLECTOR] Error walking %s: %s", docroot, e)
        return htaccess_files

    def _should_exclude_directory(self, parent_path: str, dirname: str) -> bool:
        """
        Check if directory should be excluded based on SYSTEM_EXCLUDE_DIRS.

        Supports both plain directory names (e.g. "node_modules") and nested
        paths (e.g. "wp-content/cache"). The check is performed against the
        full candidate path composed from parent_path and dirname.
        """
        try:
            candidate = Path(parent_path) / dirname
            candidate_normalized = candidate.resolve(strict=False)
            for exclude_dir in SYSTEM_EXCLUDE_DIRS:
                pattern = Path(exclude_dir)
                # Plain single-component pattern: match the directory name
                # itself. (The original compared pattern.name unconditionally,
                # so a nested pattern like "wp-content/cache" would have
                # excluded every directory named "cache" anywhere.)
                if len(pattern.parts) == 1 and candidate.name == pattern.name:
                    return True
                # Nested pattern: match as a path suffix of the resolved path
                if str(candidate_normalized).endswith(os.sep + str(pattern)):
                    return True
        except Exception:
            # Be conservative on errors and do not exclude
            return False
        return False

    def _normalize_path(self, file_path: str, docroot: str) -> str:
        """
        Return *file_path* relative to *docroot*; fall back to the bare
        filename when the path is not under the docroot.
        """
        try:
            return str(Path(file_path).relative_to(Path(docroot)))
        except ValueError:
            # If relative path calculation fails, return filename only
            return Path(file_path).name