From ad84b496f5090d4c1b02cc34a6a1ee1c2ce3ab7a Mon Sep 17 00:00:00 2001
From: Ireneusz Bachanowicz
Date: Sun, 6 Apr 2025 02:09:52 +0200
Subject: [PATCH] Added hot/cold storage management script

---
 my-app/docs/file_manager_enhancement_plan.md | 112 +++
 my-app/utils/config.yaml                     |   5 +
 my-app/utils/file_manager.py                 | 819 +++++++++++++++++++
 3 files changed, 936 insertions(+)
 create mode 100644 my-app/docs/file_manager_enhancement_plan.md
 create mode 100644 my-app/utils/config.yaml
 create mode 100644 my-app/utils/file_manager.py

diff --git a/my-app/docs/file_manager_enhancement_plan.md b/my-app/docs/file_manager_enhancement_plan.md
new file mode 100644
index 0000000..643187e
--- /dev/null
+++ b/my-app/docs/file_manager_enhancement_plan.md
@@ -0,0 +1,112 @@
+# File Manager Enhancement Plan
+
+This document outlines the plan to enhance the `my-app/utils/file_manager.py` script based on user feedback.
+
+**Goals:**
+
+1. Add support for loading configuration from a `config.yaml` file.
+2. Implement a new action (`--move-cold`) to move inactive ("cold") files from fast storage back to slow storage based on modification time.
+3. Add an `--interactive` flag to prompt for confirmation before moving files.
+4. Implement a new action (`--generate-stats`) to create a JSON file containing storage statistics (file counts, sizes by age) for both source and target directories.
+5. Calculate and log the total size of files being moved by the `--move-cold` action.
+
+**Detailed Plan:**
+
+1. **Configuration File (`config.yaml`):**
+    * **Goal:** Allow users to define common settings in a YAML file.
+    * **Implementation:**
+        * Define the structure for `config.yaml` (e.g., `~/.config/file_manager/config.yaml`, or a path specified via `--config`).
+        * Use the `PyYAML` library (requires `pip install PyYAML`).
+        * Modify `parse_arguments` to load settings, allowing command-line overrides.
+        * Add a `--config` argument.
+
+2. **Move Cold Files Back (`--move-cold` action):**
+    * **Goal:** Move files from fast (target) to slow (source) storage if inactive.
+    * **Implementation:**
+        * Add action: `--move-cold`.
+        * Add argument: `--stale-days` (default 30, uses modification time `st_mtime`).
+        * New function `find_stale_files(directory, days)`: scans `target_dir` based on `st_mtime`.
+        * New function `move_files_cold(relative_file_list, source_dir, target_dir, dry_run, interactive)`:
+            * Similar to `move_files`.
+            * Moves files from `target_dir` to `source_dir` using `rsync`.
+            * Handles paths relative to `target_dir`.
+            * Calculates and logs the total size of files to be moved before invoking `rsync`.
+            * Incorporates interactive confirmation.
+
+3. **Interactive Confirmation (`--interactive` flag):**
+    * **Goal:** Add a safety check before moving files.
+    * **Implementation:**
+        * Add global flag: `--interactive`.
+        * Modify `move_files` and `move_files_cold`:
+            * If `--interactive` and not `--dry-run`:
+                * Log files/count.
+                * Use `input()` for user confirmation (`yes/no`).
+                * Proceed only on "yes".
+
+4. **Enhanced Reporting/Stats File (`--generate-stats` action):**
+    * **Goal:** Create a persistent JSON file with storage statistics.
+    * **Implementation:**
+        * Add action: `--generate-stats`.
+        * Add argument: `--stats-file` (overrides config).
+        * New function `analyze_directory(directory)`:
+            * Walks the directory; calculates total count/size and count/size by modification-time brackets.
+            * Returns the data as a dictionary (see the illustrative sketch below).
+        * Modify `main` or create an orchestrator for `--generate-stats`:
+            * Call `analyze_directory` for source and target.
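+              For illustration only, a single `analyze_directory` result might look like
+              the following (hypothetical numbers; the bucket keys mirror the
+              implementation further down in this patch):
+
+              ```json
+              {
+                  "total_files": 1482,
+                  "total_size": 73014444032,
+                  "size_by_mod_time_days": {
+                      "1":       {"count": 12,  "size": 104857600},
+                      "3":       {"count": 40,  "size": 524288000},
+                      "7":       {"count": 95,  "size": 3221225472},
+                      "14":      {"count": 210, "size": 9663676416},
+                      "30":      {"count": 380, "size": 21474836480},
+                      "over_30": {"count": 745, "size": 38025093120}
+                  },
+                  "error_count": 0
+              }
+              ```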
+            * Combine results with a timestamp.
+            * Write the dictionary to `stats_file` using `json`.
+        * **(Optional):** Modify `--summarize-unused` to potentially use the stats file.
+
+**Workflow Visualization (Mermaid):**
+
+```mermaid
+graph TD
+    Start --> ReadConfig{"Read config.yaml (Optional)"}
+    ReadConfig --> ParseArgs[Parse Command Line Args]
+    ParseArgs --> ValidateArgs{"Validate Args & Config"}
+    ValidateArgs --> ActionRouter{Route based on Action}
+
+    ActionRouter -->|"--generate-stats"| AnalyzeSrc[Analyze Source Dir]
+    AnalyzeSrc --> AnalyzeTgt[Analyze Target Dir]
+    AnalyzeTgt --> WriteStatsFile[Write stats.json]
+    WriteStatsFile --> End
+
+    ActionRouter -->|"--move"| FindRecent["Find Recent Files (Source)"]
+    FindRecent --> CheckInteractiveHot{Interactive?}
+    CheckInteractiveHot -- Yes --> ConfirmHot("Confirm Move Hot?")
+    CheckInteractiveHot -- No --> ExecuteMoveHot["Execute rsync Hot (Source->Target)"]
+    ConfirmHot -- Yes --> ExecuteMoveHot
+    ConfirmHot -- No --> AbortHot(Abort Hot Move)
+    AbortHot --> End
+    ExecuteMoveHot --> End
+
+    ActionRouter -->|"--move-cold"| FindStale["Find Stale Files (Target)"]
+    FindStale --> CalculateColdSize[Calculate Total Size of Cold Files]
+    CalculateColdSize --> CheckInteractiveCold{Interactive?}
+    CheckInteractiveCold -- Yes --> ConfirmCold("Confirm Move Cold?")
+    CheckInteractiveCold -- No --> ExecuteMoveCold["Execute rsync Cold (Target->Source)"]
+    ConfirmCold -- Yes --> ExecuteMoveCold
+    ConfirmCold -- No --> AbortCold(Abort Cold Move)
+    AbortCold --> End
+    ExecuteMoveCold --> End
+
+    ActionRouter -->|"--count"| FindRecentForCount["Find Recent Files (Source)"]
+    FindRecentForCount --> CountFiles[Log Count]
+    CountFiles --> End
+
+    ActionRouter -->|"--summarize-unused"| SummarizeUnused["Summarize Unused (Target)"]
+    SummarizeUnused --> LogSummary[Log Summary]
+    LogSummary --> End
+
+    ActionRouter -->|"No Action/Error"| ShowHelp["Show Help / Error"]
+    ShowHelp --> End
+```
+
+**Summary of Changes:**
+
+* New dependency: `PyYAML`.
+* New command-line arguments: `--move-cold`, `--stale-days`, `--interactive`, `--generate-stats`, `--stats-file`, `--config`.
+* New functions: `find_stale_files`, `move_files_cold`, `analyze_directory`.
+* Modifications to existing functions: `parse_arguments`, `move_files`, `main`.
+* Introduction of `config.yaml` for settings.
+* Introduction of a JSON stats file for persistent reporting.
\ No newline at end of file
diff --git a/my-app/utils/config.yaml b/my-app/utils/config.yaml
new file mode 100644
index 0000000..1fa19cb
--- /dev/null
+++ b/my-app/utils/config.yaml
@@ -0,0 +1,5 @@
+source_dir: /mnt/archive_nfs
+target_dir: /mnt/local_ssd
+recent_days: 2
+stale_days: 45
+stats_file: /home/user/logs/file_manager_stats.json
\ No newline at end of file
diff --git a/my-app/utils/file_manager.py b/my-app/utils/file_manager.py
new file mode 100644
index 0000000..30c6e3f
--- /dev/null
+++ b/my-app/utils/file_manager.py
@@ -0,0 +1,819 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import subprocess
+import sys
+import time
+import logging
+import json  # Added for stats file
+from datetime import datetime, timedelta
+from pathlib import Path  # Added for easier path handling
+
+# --- Dependencies ---
+# Requires PyYAML: pip install PyYAML
+try:
+    import yaml
+except ImportError:
+    print("Error: PyYAML library not found. Please install it using: pip install PyYAML", file=sys.stderr)
+    sys.exit(1)
+
+
+# --- Configuration ---
+# These act as fallback defaults if not specified in the config file or on the command line
+DEFAULT_SOURCE_DIR = "/mnt/slow_storage"
+DEFAULT_TARGET_DIR = "/mnt/fast_storage"
+DEFAULT_RECENT_DAYS = 1
+DEFAULT_STALE_DAYS = 30  # Default for moving cold files back
+DEFAULT_STATS_FILE = None  # Default: Don't generate stats unless requested
+DEFAULT_MIN_SIZE = "0"  # Default: No minimum size filter
+DEFAULT_CONFIG_PATH = Path.home() / ".config" / "file_manager" / "config.yaml"
+
+# --- Logging Setup ---
+def setup_logging():
+    """Configures basic logging."""
+    logging.basicConfig(
+        level=logging.INFO,
+        format="[%(asctime)s] [%(levelname)s] %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+
+# --- Helper Function ---
+def format_bytes(size):
+    """Converts bytes to a human-readable string (KB, MB, GB)."""
+    if size is None:
+        return "N/A"
+    if size < 1024:
+        return f"{size} B"
+    elif size < 1024**2:
+        return f"{size / 1024:.2f} KB"
+    elif size < 1024**3:
+        return f"{size / 1024**2:.2f} MB"
+    else:
+        return f"{size / 1024**3:.2f} GB"
+
+# --- Helper Function: Parse Size String ---
+def parse_size_string(size_str):
+    """Converts a size string (e.g., '10G', '500M', '10k') to bytes."""
+    size_str = str(size_str).strip().upper()
+    if not size_str:
+        return 0
+    if size_str == '0':
+        return 0
+
+    units = {"B": 1, "K": 1024, "M": 1024**2, "G": 1024**3, "T": 1024**4}
+    unit = "B"  # Default unit
+
+    # Check last character for unit
+    if size_str[-1] in units:
+        unit = size_str[-1]
+        numeric_part = size_str[:-1]
+    else:
+        numeric_part = size_str
+
+    if not numeric_part.replace('.', '', 1).isdigit():  # Allow float, e.g. 1.5G
+        raise ValueError(f"Invalid numeric part in size string: '{numeric_part}'")
+
+    try:
+        value = float(numeric_part)
+    except ValueError:
+        raise ValueError(f"Cannot convert numeric part to float: '{numeric_part}'")
+
+    return int(value * units[unit])
+
+
+# --- Configuration Loading ---
+def load_config(config_path):
+    """Loads configuration from a YAML file."""
+    config = {}
+    resolved_path = Path(config_path).resolve()  # Resolve potential symlinks/relative paths
+    if resolved_path.is_file():
+        try:
+            with open(resolved_path, 'r') as f:
+                config = yaml.safe_load(f)
+            if config is None:  # Handle empty file case
+                config = {}
+            logging.info(f"Loaded configuration from: {resolved_path}")
+        except yaml.YAMLError as e:
+            logging.warning(f"Error parsing config file {resolved_path}: {e}. Using defaults.")
+        except OSError as e:
+            logging.warning(f"Error reading config file {resolved_path}: {e}. Using defaults.")
+    else:
+        # It's okay if the default config doesn't exist; only warn if the user specified one
+        if str(resolved_path) != str(DEFAULT_CONFIG_PATH.resolve()):
+            logging.warning(f"Specified config file not found at {resolved_path}. Using defaults/CLI args.")
+        else:
+            logging.info(f"Default config file not found at {resolved_path}. Using defaults/CLI args.")
+    return config
+
+# --- Argument Parsing ---
+def parse_arguments():
+    """Parses command line arguments, considering config file defaults."""
+
+    # Initial minimal parse to find the config path *before* defining all args
+    pre_parser = argparse.ArgumentParser(add_help=False)
+    pre_parser.add_argument('--config', default=str(DEFAULT_CONFIG_PATH), help=f'Path to YAML configuration file (Default: {DEFAULT_CONFIG_PATH}).')
+    pre_args, _ = pre_parser.parse_known_args()
+
+    # Load config based on the pre-parsed path
+    config = load_config(pre_args.config)
+
+    # Get defaults from config or fallback constants
+    cfg_source_dir = config.get('source_dir', DEFAULT_SOURCE_DIR)
+    cfg_target_dir = config.get('target_dir', DEFAULT_TARGET_DIR)
+    cfg_recent_days = config.get('recent_days', DEFAULT_RECENT_DAYS)
+    cfg_stale_days = config.get('stale_days', DEFAULT_STALE_DAYS)
+    cfg_stats_file = config.get('stats_file', DEFAULT_STATS_FILE)
+    cfg_min_size = config.get('min_size', DEFAULT_MIN_SIZE)
+
+    # Main parser using loaded config defaults
+    parser = argparse.ArgumentParser(
+        description="Manages files between storage tiers based on access/modification time, generates stats, and summarizes.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=f"""Examples:
+  # Move hot files (accessed < {cfg_recent_days}d ago) from {cfg_source_dir} to {cfg_target_dir}
+  {sys.argv[0]} --move
+
+  # Move cold files (modified > {cfg_stale_days}d ago) from {cfg_target_dir} to {cfg_source_dir} (interactive)
+  {sys.argv[0]} --move-cold --interactive
+
+  # Simulate moving hot files with custom settings
+  {sys.argv[0]} --move --recent-days 3 --source-dir /data/archive --target-dir /data/hot --dry-run
+
+  # Count potential hot files to move, optionally only those larger than 100MB
+  {sys.argv[0]} --count --min-size 100M
+  {sys.argv[0]} --count
+
+  # Summarize unused files in target directory
+  {sys.argv[0]} --summarize-unused
+
+  # Generate storage statistics report
+  {sys.argv[0]} --generate-stats --stats-file /var/log/file_manager_stats.json
+
+  # Use a specific configuration file
+  {sys.argv[0]} --config /path/to/my_config.yaml --move
+"""
+    )
+
+    action_group = parser.add_argument_group('Actions (at least one required)')
+    action_group.add_argument('--move', action='store_true', help='Move recently accessed ("hot") files from source to target.')
+    action_group.add_argument('--move-cold', action='store_true', help='Move old unmodified ("cold") files from target back to source.')
+    action_group.add_argument('--count', action='store_true', help='Count hot files in source that would be moved (based on access time).')
+    action_group.add_argument('--summarize-unused', action='store_true', help='Analyze target directory for unused files based on modification time.')
+    action_group.add_argument('--generate-stats', action='store_true', help='Generate a JSON stats report for source and target directories.')
+
+    config_group = parser.add_argument_group('Configuration Options (Overrides config file)')
+    config_group.add_argument('--config', default=str(DEFAULT_CONFIG_PATH), help=f'Path to YAML configuration file (Default: {DEFAULT_CONFIG_PATH}).')  # Re-added so it shows in help text
+    config_group.add_argument('--source-dir', default=cfg_source_dir, help=f'Source directory (Default: "{cfg_source_dir}").')
+    config_group.add_argument('--target-dir', default=cfg_target_dir, help=f'Target directory (Default: "{cfg_target_dir}").')
+    config_group.add_argument('--recent-days', type=int, default=cfg_recent_days, help=f'Define "recent" access in days for --move/--count (Default: {cfg_recent_days}).')
+    config_group.add_argument('--stale-days', type=int, default=cfg_stale_days, help=f'Define "stale" modification in days for --move-cold (Default: {cfg_stale_days}).')
+    config_group.add_argument('--stats-file', default=cfg_stats_file, help=f'Output file for --generate-stats (Default: {"None" if cfg_stats_file is None else cfg_stats_file}).')
+    config_group.add_argument('--min-size', default=cfg_min_size, help=f'Minimum file size to consider for move actions (e.g., 100M, 1G, 0 to disable). (Default: {cfg_min_size})')
+
+    behavior_group = parser.add_argument_group('Behavior Modifiers')
+    behavior_group.add_argument('--dry-run', action='store_true', help='Simulate move actions without actual changes.')
+    behavior_group.add_argument('--interactive', action='store_true', help='Prompt for confirmation before executing move actions (ignored if --dry-run).')
+
+    # If no arguments were given (just the script name), print help
+    if len(sys.argv) == 1:
+        parser.print_help(sys.stderr)
+        sys.exit(1)
+
+    args = parser.parse_args()
+
+    # Validate that at least one action is selected
+    action_selected = args.move or args.move_cold or args.count or args.summarize_unused or args.generate_stats
+    if not action_selected:
+        parser.error("At least one action flag (--move, --move-cold, --count, --summarize-unused, --generate-stats) is required.")
+
+    # Validate days arguments
+    if args.recent_days <= 0:
+        parser.error("--recent-days must be a positive integer.")
+    if args.stale_days <= 0:
+        parser.error("--stale-days must be a positive integer.")
+
+    # Validate stats file if the action is selected
+    if args.generate_stats and not args.stats_file:
+        parser.error("--stats-file must be specified when using --generate-stats (or set in config file).")
+
+    # Validate and parse min_size
+    try:
+        args.min_size_bytes = parse_size_string(args.min_size)
+        if args.min_size_bytes < 0:
+            parser.error("--min-size cannot be negative.")
+    except ValueError as e:
+        parser.error(f"Invalid --min-size value: {e}")
+
+    return args
+
+# --- Core Logic Functions ---
+
+def find_recent_files(source_dir, days, min_size_bytes):
+    """Finds files accessed within the last 'days' in the source directory."""
+    size_filter_msg = f" and size >= {format_bytes(min_size_bytes)}" if min_size_bytes > 0 else ""
+    logging.info(f"Scanning '{source_dir}' for files accessed within the last {days} day(s){size_filter_msg}...")
+    recent_files = []
+    cutoff_time = time.time() - (days * 86400)  # 86400 seconds in a day
+    try:
+        for root, _, files in os.walk(source_dir):
+            for filename in files:
+                filepath = os.path.join(root, filename)
+                try:
+                    # Check if it's a file and not a broken symlink etc.
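+                    # (Note: os.path.isfile() follows symlinks, so a symlink pointing at
+                    # a regular file passes that check; the explicit islink() test below
+                    # is what actually excludes symlinks from being moved.)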
+                    if not os.path.isfile(filepath) or os.path.islink(filepath):
+                        continue
+                    stat_result = os.stat(filepath)
+                    # Check access time AND minimum size
+                    if stat_result.st_atime > cutoff_time and stat_result.st_size >= min_size_bytes:
+                        # Get path relative to source_dir for rsync --files-from
+                        relative_path = os.path.relpath(filepath, source_dir)
+                        recent_files.append(relative_path)
+                except FileNotFoundError:
+                    logging.warning(f"File not found during scan, skipping: {filepath}")
+                    continue  # File might have been deleted during scan
+                except OSError as e:
+                    logging.warning(f"Cannot access file stats, skipping: {filepath} ({e})")
+                    continue
+    except FileNotFoundError:
+        logging.error(f"Source directory '{source_dir}' not found during scan.")
+        return None  # Indicate error
+    except Exception as e:
+        logging.error(f"An unexpected error occurred during 'recent' file scan: {e}")
+        return None
+
+    logging.info(f"Found {len(recent_files)} files matching the 'recent' criteria.")
+    return recent_files
+
+# --- New Function: Find Stale Files ---
+def find_stale_files(target_dir, days, min_size_bytes):
+    """Finds files modified more than 'days' ago in the target directory."""
+    size_filter_msg = f" and size >= {format_bytes(min_size_bytes)}" if min_size_bytes > 0 else ""
+    logging.info(f"Scanning '{target_dir}' for files modified more than {days} day(s) ago{size_filter_msg}...")
+    stale_files = []
+    # Cutoff time is *before* this time
+    cutoff_time = time.time() - (days * 86400)  # 86400 seconds in a day
+    try:
+        for root, _, files in os.walk(target_dir):
+            for filename in files:
+                filepath = os.path.join(root, filename)
+                try:
+                    # Check if it's a file and not a broken symlink etc.
+                    if not os.path.isfile(filepath) or os.path.islink(filepath):
+                        continue
+                    stat_result = os.stat(filepath)
+                    # Check modification time AND minimum size
+                    if stat_result.st_mtime < cutoff_time and stat_result.st_size >= min_size_bytes:
+                        # Get path relative to target_dir for rsync --files-from
+                        relative_path = os.path.relpath(filepath, target_dir)
+                        stale_files.append(relative_path)
+                except FileNotFoundError:
+                    logging.warning(f"File not found during stale scan, skipping: {filepath}")
+                    continue  # File might have been deleted during scan
+                except OSError as e:
+                    logging.warning(f"Cannot access file stats during stale scan, skipping: {filepath} ({e})")
+                    continue
+    except FileNotFoundError:
+        logging.error(f"Target directory '{target_dir}' not found during stale scan.")
+        return None  # Indicate error
+    except Exception as e:
+        logging.error(f"An unexpected error occurred during 'stale' file scan: {e}")
+        return None
+
+    logging.info(f"Found {len(stale_files)} files matching the 'stale' criteria (modified > {days} days ago).")
+    return stale_files
+
+
+def move_files(relative_file_list, source_dir, target_dir, dry_run, interactive):  # Added interactive
+    """Moves files using rsync (hot files: source -> target)."""
+    if not relative_file_list:
+        logging.info("No 'hot' files found to move.")
+        return True  # Nothing to do, considered success
+
+    action_desc = "move hot files"
+    simulating = dry_run
+    num_files = len(relative_file_list)
+
+    logging.info(f"--- {'Simulating ' if simulating else ''}{action_desc.capitalize()} ---")
+    logging.info(f"Source Base: {source_dir}")
+    logging.info(f"Target Base: {target_dir}")
+    logging.info(f"Files to process: {num_files}")
+    logging.info("--------------------")
+
+    # Interactive prompt
+    if interactive and not simulating:
+        try:
+            confirm = input(f"Proceed with moving {num_files} hot files from '{source_dir}' to '{target_dir}'? (yes/no): ").lower().strip()
+            if confirm != 'yes':
+                logging.warning("Move operation cancelled by user.")
+                return False  # Indicate cancellation
+        except EOFError:  # Handle non-interactive environments gracefully
+            logging.warning("Cannot prompt in non-interactive mode. Aborting move.")
+            return False
+
+    rsync_cmd = ['rsync', '-avP', '--relative', '--info=progress2']  # archive, verbose, progress/partial, relative paths
+
+    if simulating:
+        rsync_cmd.append('--dry-run')
+    else:
+        rsync_cmd.append('--remove-source-files')
+
+    # Use --files-from=- with source as '.' because paths are relative to source_dir
+    # Target directory is the destination for the relative structure
+    rsync_cmd.extend(['--files-from=-', '.', target_dir])
+
+    # Prepare file list for stdin (newline separated)
+    files_input = "\n".join(relative_file_list).encode('utf-8')
+
+    try:
+        logging.info(f"Executing rsync command: {' '.join(rsync_cmd)}")
+        # Run rsync in the source directory context
+        process = subprocess.run(
+            rsync_cmd,
+            input=files_input,
+            capture_output=True,
+            # text=True,  # Removed: Input is bytes, output will be bytes
+            check=False,  # Don't raise exception on non-zero exit
+            cwd=source_dir  # Execute rsync from the source directory
+        )
+
+        # Decode output/error streams
+        stdout_str = process.stdout.decode('utf-8', errors='replace') if process.stdout else ""
+        stderr_str = process.stderr.decode('utf-8', errors='replace') if process.stderr else ""
+
+        if stdout_str:
+            logging.info("rsync output:\n" + stdout_str)
+        if stderr_str:
+            # rsync often prints stats to stderr; log as info unless the exit code is bad
+            log_level = logging.WARNING if process.returncode != 0 else logging.INFO
+            logging.log(log_level, "rsync stderr:\n" + stderr_str)
+
+        if process.returncode == 0:
+            logging.info(f"rsync {'simulation' if simulating else action_desc} completed successfully.")
+            logging.info("--------------------")
+            return True
+        else:
+            logging.error(f"rsync {'simulation' if simulating else action_desc} failed with exit code {process.returncode}.")
+            logging.info("--------------------")
+            return False
+
+    except FileNotFoundError:
+        logging.error("Error: 'rsync' command not found. Please ensure rsync is installed and in your PATH.")
+        return False
+    except Exception as e:
+        logging.error(f"An unexpected error occurred during rsync execution for hot files: {e}")
+        return False
+
+# --- New Function: Move Cold Files ---
+def move_files_cold(relative_file_list, source_dir, target_dir, dry_run, interactive):
+    """Moves files using rsync (cold files: target -> source)."""
+    if not relative_file_list:
+        logging.info("No 'cold' files found to move back.")
+        return True  # Nothing to do, considered success
+
+    action_desc = "move cold files back"
+    simulating = dry_run
+    num_files = len(relative_file_list)
+    total_size = 0
+
+    # Calculate total size before prompt/move
+    logging.info("Calculating total size of cold files...")
+    for rel_path in relative_file_list:
+        try:
+            full_path = os.path.join(target_dir, rel_path)
+            if os.path.isfile(full_path):  # Check again in case it vanished
+                total_size += os.path.getsize(full_path)
+        except OSError as e:
+            logging.warning(f"Could not get size for {rel_path}: {e}")
+
+    logging.info(f"--- {'Simulating ' if simulating else ''}{action_desc.capitalize()} ---")
+    logging.info(f"Source (of cold files): {target_dir}")
+    logging.info(f"Destination (archive): {source_dir}")
+    logging.info(f"Files to process: {num_files}")
+    logging.info(f"Total size: {format_bytes(total_size)}")
+    logging.info("--------------------")
+
+    # Interactive prompt
+    if interactive and not simulating:
+        try:
+            confirm = input(f"Proceed with moving {num_files} cold files ({format_bytes(total_size)}) from '{target_dir}' to '{source_dir}'? (yes/no): ").lower().strip()
+            if confirm != 'yes':
+                logging.warning("Move operation cancelled by user.")
+                return False  # Indicate cancellation
+        except EOFError:  # Handle non-interactive environments gracefully
+            logging.warning("Cannot prompt in non-interactive mode. Aborting move.")
+            return False
+
+    # Note: We run rsync from the TARGET directory now
+    rsync_cmd = ['rsync', '-avP', '--relative']  # archive, verbose, progress/partial, relative paths
+
+    if simulating:
+        rsync_cmd.append('--dry-run')
+    else:
+        rsync_cmd.append('--remove-source-files')  # Remove from TARGET after successful transfer
+
+    # Use --files-from=- with source as '.' (relative to target_dir)
+    # Target directory is the destination (source_dir in this context)
+    rsync_cmd.extend(['--files-from=-', '.', source_dir])
+
+    # Prepare file list for stdin (newline separated)
+    files_input = "\n".join(relative_file_list).encode('utf-8')
+
+    try:
+        logging.info(f"Executing rsync command: {' '.join(rsync_cmd)}")
+        # Run rsync in the TARGET directory context
+        process = subprocess.run(
+            rsync_cmd,
+            input=files_input,
+            capture_output=True,
+            # text=True,  # Removed: Input is bytes, output will be bytes
+            check=False,  # Don't raise exception on non-zero exit
+            cwd=target_dir  # <<< Execute rsync from the TARGET directory
+        )
+
+        # Decode output/error streams
+        stdout_str = process.stdout.decode('utf-8', errors='replace') if process.stdout else ""
+        stderr_str = process.stderr.decode('utf-8', errors='replace') if process.stderr else ""
+
+        if stdout_str:
+            logging.info("rsync output:\n" + stdout_str)
+        if stderr_str:
+            log_level = logging.WARNING if process.returncode != 0 else logging.INFO
+            logging.log(log_level, "rsync stderr:\n" + stderr_str)
+
+        if process.returncode == 0:
+            logging.info(f"rsync {'simulation' if simulating else action_desc} completed successfully.")
+            logging.info("--------------------")
+            return True
+        else:
+            logging.error(f"rsync {'simulation' if simulating else action_desc} failed with exit code {process.returncode}.")
+            logging.info("--------------------")
+            return False
+
+    except FileNotFoundError:
+        logging.error("Error: 'rsync' command not found. Please ensure rsync is installed and in your PATH.")
+        return False
+    except Exception as e:
+        logging.error(f"An unexpected error occurred during rsync execution for cold files: {e}")
+        return False
+
+
+def count_files(file_list):
+    """Logs the count of files found."""
+    logging.info("--- Counting Hot Move Candidates ---")
+    if file_list is None:
+        logging.warning("File list is not available (likely due to earlier error).")
+    else:
+        logging.info(f"Found {len(file_list)} potential hot files to move based on access time.")
+    logging.info("----------------------------")
+
+def summarize_unused(target_dir):
+    """Summarizes unused files in the target directory based on modification time."""
+    logging.info("--- Summarizing Unused Files in Target ---")
+    logging.info(f"Target Directory: {target_dir}")
+    logging.info("Criteria: Based on modification time (-mtime)")
+    logging.info("------------------------------------------")
+
+    periods_days = [1, 3, 7, 14, 30]
+    now = time.time()
+    period_cutoffs = {days: now - (days * 86400) for days in periods_days}
+    # Add a bucket for > 30 days
+    size_by_period = {days: 0 for days in periods_days + ['30+']}
+    count_by_period = {days: 0 for days in periods_days + ['30+']}  # Also count files
+
+    file_count = 0
+    total_processed_size = 0
+
+    try:
+        for root, _, files in os.walk(target_dir):
+            for filename in files:
+                filepath = os.path.join(root, filename)
+                try:
+                    # Check if it's a file and not a broken symlink etc.
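+                    # (Note: os.walk() does not descend into symlinked directories by
+                    # default (followlinks=False), so linked trees are not double-counted
+                    # in the summary.)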
+                    if not os.path.isfile(filepath) or os.path.islink(filepath):
+                        continue
+                    stat_result = os.stat(filepath)
+                    mtime = stat_result.st_mtime
+                    fsize = stat_result.st_size
+                    file_count += 1
+                    total_processed_size += fsize
+
+                    # Check against periods in descending order of age (oldest bucket first)
+                    if mtime < period_cutoffs[30]:
+                        size_by_period['30+'] += fsize
+                        count_by_period['30+'] += 1
+                    elif mtime < period_cutoffs[14]:
+                        size_by_period[30] += fsize
+                        count_by_period[30] += 1
+                    elif mtime < period_cutoffs[7]:
+                        size_by_period[14] += fsize
+                        count_by_period[14] += 1
+                    elif mtime < period_cutoffs[3]:
+                        size_by_period[7] += fsize
+                        count_by_period[7] += 1
+                    elif mtime < period_cutoffs[1]:
+                        size_by_period[3] += fsize
+                        count_by_period[3] += 1
+                    # else: modified within the last day - doesn't count for these summaries
+
+                except FileNotFoundError:
+                    logging.warning(f"File not found during summary, skipping: {filepath}")
+                    continue
+                except OSError as e:
+                    logging.warning(f"Cannot access file stats during summary, skipping: {filepath} ({e})")
+                    continue
+
+        logging.info(f"Scanned {file_count} files, total size: {format_bytes(total_processed_size)}")
+
+        # Calculate cumulative sizes and counts
+        cumulative_size = {days: 0 for days in periods_days + ['30+']}
+        cumulative_count = {days: 0 for days in periods_days + ['30+']}
+
+        # Iterate through the periods from oldest to newest for the cumulative calculation.
+        # These keys represent the *upper bound* of the age bucket (e.g., key '30' means 14 < age <= 30 days).
+        # The temporary cumulative value for key 'X' is the total size/count of files in bucket X and all older buckets.
+        sorted_periods_desc = ['30+'] + sorted(periods_days, reverse=True)  # e.g., ['30+', 30, 14, 7, 3, 1]
+        last_period_size = 0
+        last_period_count = 0
+        temp_cumulative_size = {}
+        temp_cumulative_count = {}
+
+        for period_key in sorted_periods_desc:
+            current_size = size_by_period[period_key]
+            current_count = count_by_period[period_key]
+            temp_cumulative_size[period_key] = current_size + last_period_size
+            temp_cumulative_count[period_key] = current_count + last_period_count
+            last_period_size = temp_cumulative_size[period_key]
+            last_period_count = temp_cumulative_count[period_key]
+
+        # Map temporary cumulative values to the correct "older than X days" meaning:
+        # cumulative_size[1] should be the size of files older than 1 day (i.e. temp_cumulative_size[3])
+        cumulative_size[1] = temp_cumulative_size.get(3, 0)
+        cumulative_count[1] = temp_cumulative_count.get(3, 0)
+        cumulative_size[3] = temp_cumulative_size.get(7, 0)
+        cumulative_count[3] = temp_cumulative_count.get(7, 0)
+        cumulative_size[7] = temp_cumulative_size.get(14, 0)
+        cumulative_count[7] = temp_cumulative_count.get(14, 0)
+        cumulative_size[14] = temp_cumulative_size.get(30, 0)
+        cumulative_count[14] = temp_cumulative_count.get(30, 0)
+        cumulative_size[30] = temp_cumulative_size.get('30+', 0)
+        cumulative_count[30] = temp_cumulative_count.get('30+', 0)
+        cumulative_size['30+'] = temp_cumulative_size.get('30+', 0)  # Redundant but harmless
+        cumulative_count['30+'] = temp_cumulative_count.get('30+', 0)
+
+        logging.info("Cumulative stats for files NOT modified for more than:")
+        # Display in ascending order of days for clarity
+        logging.info(f"  > 1 day:   {format_bytes(cumulative_size[1])} ({cumulative_count[1]} files)")
+        logging.info(f"  > 3 days:  {format_bytes(cumulative_size[3])} ({cumulative_count[3]} files)")
+        logging.info(f"  > 7 days:  {format_bytes(cumulative_size[7])} ({cumulative_count[7]} files)")
+        logging.info(f"  > 14 days: {format_bytes(cumulative_size[14])} ({cumulative_count[14]} files)")
+        logging.info(f"  > 30 days: {format_bytes(cumulative_size[30])} ({cumulative_count[30]} files)")
+
+    except FileNotFoundError:
+        logging.error(f"Target directory '{target_dir}' not found for summary.")
+    except Exception as e:
+        logging.error(f"An unexpected error occurred during unused file summary: {e}")
+
+    logging.info("------------------------------------------")
+
+# --- New Function: Analyze Directory for Stats ---
+def analyze_directory(directory):
+    """Analyzes a directory and returns statistics."""
+    logging.info(f"Analyzing directory for statistics: {directory}")
+    stats = {
+        'total_files': 0,
+        'total_size': 0,
+        'size_by_mod_time_days': {  # Buckets represent age ranges by modification time
+            '1': {'count': 0, 'size': 0},        # <= 1 day old
+            '3': {'count': 0, 'size': 0},        # > 1 day, <= 3 days old
+            '7': {'count': 0, 'size': 0},        # > 3 days, <= 7 days old
+            '14': {'count': 0, 'size': 0},       # > 7 days, <= 14 days old
+            '30': {'count': 0, 'size': 0},       # > 14 days, <= 30 days old
+            'over_30': {'count': 0, 'size': 0}   # > 30 days old
+        },
+        'error_count': 0,
+    }
+    periods_days = [1, 3, 7, 14, 30]
+    now = time.time()
+    # Cutoffs: if mtime < cutoff[X], the file is older than X days
+    period_cutoffs = {days: now - (days * 86400) for days in periods_days}
+
+    try:
+        for root, _, files in os.walk(directory):
+            for filename in files:
+                filepath = os.path.join(root, filename)
+                try:
+                    if not os.path.isfile(filepath) or os.path.islink(filepath):
+                        continue
+                    stat_result = os.stat(filepath)
+                    mtime = stat_result.st_mtime
+                    fsize = stat_result.st_size
+
+                    stats['total_files'] += 1
+                    stats['total_size'] += fsize
+
+                    # Assign to age buckets based on modification time (oldest first)
+                    if mtime < period_cutoffs[30]:
+                        stats['size_by_mod_time_days']['over_30']['count'] += 1
+                        stats['size_by_mod_time_days']['over_30']['size'] += fsize
+                    elif mtime < period_cutoffs[14]:
+                        stats['size_by_mod_time_days']['30']['count'] += 1
+                        stats['size_by_mod_time_days']['30']['size'] += fsize
+                    elif mtime < period_cutoffs[7]:
+                        stats['size_by_mod_time_days']['14']['count'] += 1
+                        stats['size_by_mod_time_days']['14']['size'] += fsize
+                    elif mtime < period_cutoffs[3]:
+                        stats['size_by_mod_time_days']['7']['count'] += 1
+                        stats['size_by_mod_time_days']['7']['size'] += fsize
+                    elif mtime < period_cutoffs[1]:
+                        stats['size_by_mod_time_days']['3']['count'] += 1
+                        stats['size_by_mod_time_days']['3']['size'] += fsize
+                    else:  # Modified within the last day
+                        stats['size_by_mod_time_days']['1']['count'] += 1
+                        stats['size_by_mod_time_days']['1']['size'] += fsize
+
+                except FileNotFoundError:
+                    logging.warning(f"File not found during stats analysis, skipping: {filepath}")
+                    stats['error_count'] += 1
+                    continue
+                except OSError as e:
+                    logging.warning(f"Cannot access file stats during stats analysis, skipping: {filepath} ({e})")
+                    stats['error_count'] += 1
+                    continue
+
+        logging.info(f"Analysis complete for {directory}: Found {stats['total_files']} files, total size {format_bytes(stats['total_size'])}.")
+        if stats['error_count'] > 0:
+            logging.warning(f"Encountered {stats['error_count']} errors during analysis of {directory}.")
+        return stats
+
+    except FileNotFoundError:
+        logging.error(f"Directory '{directory}' not found for statistics analysis.")
+        return None  # Indicate error
+    except Exception as e:
+        logging.error(f"An unexpected error occurred during statistics analysis of {directory}: {e}")
+        return None
+
+# --- New Function: Generate Stats Report ---
+def generate_stats(args):
+    """Generates a JSON statistics report for the source and target directories."""
+    logging.info("--- Generating Statistics Report ---")
+    report = {
+        'report_generated_utc': datetime.utcnow().isoformat() + 'Z',
+        'source_directory': args.source_dir,
+        'target_directory': args.target_dir,
+        'source_stats': None,
+        'target_stats': None,
+    }
+    success = True
+
+    # Analyze source directory if it exists
+    if os.path.isdir(args.source_dir):
+        logging.info(f"Analyzing source directory: {args.source_dir}")
+        source_stats = analyze_directory(args.source_dir)
+        if source_stats is None:
+            logging.error(f"Failed to analyze source directory: {args.source_dir}")
+            success = False  # Mark as partial failure, but continue
+        report['source_stats'] = source_stats
+    else:
+        logging.warning(f"Source directory '{args.source_dir}' not found, skipping analysis.")
+        report['source_stats'] = {'error': 'Directory not found'}
+
+    # Analyze target directory if it exists
+    if os.path.isdir(args.target_dir):
+        logging.info(f"Analyzing target directory: {args.target_dir}")
+        target_stats = analyze_directory(args.target_dir)
+        if target_stats is None:
+            logging.error(f"Failed to analyze target directory: {args.target_dir}")
+            success = False  # Mark as partial failure
+        report['target_stats'] = target_stats
+    else:
+        logging.warning(f"Target directory '{args.target_dir}' not found, skipping analysis.")
+        report['target_stats'] = {'error': 'Directory not found'}
+
+    if not success:
+        logging.warning("Stats generation encountered errors analyzing one or both directories.")
+        # Continue to write the partial report
+
+    # Write the report to the specified file
+    stats_file_path = Path(args.stats_file)
+    try:
+        # Create parent directories if they don't exist
+        stats_file_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(stats_file_path, 'w') as f:
+            json.dump(report, f, indent=4)
+        logging.info(f"Successfully wrote statistics report to: {stats_file_path}")
+        return success  # True only if both analyses succeeded
+    except OSError as e:
+        logging.error(f"Error writing statistics report to {stats_file_path}: {e}")
+        return False
+    except Exception as e:
+        logging.error(f"An unexpected error occurred while writing the stats report: {e}")
+        return False
+
+
+# --- Main Execution ---
+def main():
+    """Main function to orchestrate the script."""
+    setup_logging()
+    args = parse_arguments()  # Now handles config loading
+
+    # --- Directory Validation ---
+    # Check source if needed
+    source_ok = True
+    if args.move or args.count or args.generate_stats or args.move_cold:  # move_cold needs source as destination
+        if not os.path.isdir(args.source_dir):
+            logging.error(f"Source directory '{args.source_dir}' not found or is not a directory.")
+            source_ok = False
+        else:
+            logging.debug(f"Source directory validated: {args.source_dir}")
+
+    # Check target if needed
+    target_ok = True
+    if args.move or args.summarize_unused or args.generate_stats or args.move_cold:  # move_cold needs target as source
+        if not os.path.isdir(args.target_dir):
+            logging.error(f"Target directory '{args.target_dir}' not found or is not a directory.")
+            target_ok = False
+        else:
+            logging.debug(f"Target directory validated: {args.target_dir}")
+
+    # Exit if essential directories are missing for the requested actions that *require* them
+    if not source_ok and (args.move or args.count):
+        logging.error("Aborting: Source directory required for --move or --count is invalid.")
+        sys.exit(1)
+    if not target_ok and args.summarize_unused:
+        logging.error("Aborting: Target directory required for --summarize-unused is invalid.")
+        sys.exit(1)
+    if (not source_ok or not target_ok) and args.move_cold:
+        logging.error("Aborting: --move-cold requires both the source and target directories to be valid.")
+        sys.exit(1)
+    # Note: generate_stats handles missing dirs internally
+
+    # --- Action Execution ---
+    exit_code = 0  # Track if any operation fails
+
+    # --- Find files first if needed by multiple actions ---
+    hot_files_to_process = None
+    if args.move or args.count:
+        # We already checked source_ok above for these actions
+        hot_files_to_process = find_recent_files(args.source_dir, args.recent_days, args.min_size_bytes)
+        if hot_files_to_process is None:
+            logging.error("Aborting due to error finding recent 'hot' files.")
+            sys.exit(1)  # Abort if find failed
+
+    cold_files_to_process = None
+    if args.move_cold:
+        # We already checked target_ok above for this action
+        cold_files_to_process = find_stale_files(args.target_dir, args.stale_days, args.min_size_bytes)
+        if cold_files_to_process is None:
+            logging.error("Aborting due to error finding 'cold' files.")
+            sys.exit(1)  # Abort if find failed
+
+    # --- Execute Actions ---
+    if args.count:
+        count_files(hot_files_to_process)  # Counts hot files
+
+    if args.move:
+        # We already checked source_ok and target_ok for this action
+        move_success = move_files(hot_files_to_process, args.source_dir, args.target_dir, args.dry_run, args.interactive)
+        if not move_success and not args.dry_run:
+            logging.error("Move 'hot' files operation failed or was cancelled.")
+            exit_code = 1  # Mark failure
+
+    if args.move_cold:
+        # We already checked source_ok and target_ok for this action
+        move_cold_success = move_files_cold(cold_files_to_process, args.source_dir, args.target_dir, args.dry_run, args.interactive)
+        if not move_cold_success and not args.dry_run:
+            logging.error("Move 'cold' files operation failed or was cancelled.")
+            exit_code = 1  # Mark failure
+
+    if args.summarize_unused:
+        # We already checked target_ok for this action
+        summarize_unused(args.target_dir)
+
+    if args.generate_stats:
+        # generate_stats handles its own directory checks internally
+        stats_success = generate_stats(args)
+        if not stats_success:
+            # generate_stats already logged errors
+            exit_code = 1
+
+    logging.info("Script finished.")
finished.") + sys.exit(exit_code) # Exit with 0 on success, 1 on failure + + +if __name__ == "__main__": + main() \ No newline at end of file