Introduction: The Need for File Automation

When using computers, we spend a significant amount of time dealing with files and folders. Tasks like renaming hundreds of files, finding and organizing files that meet certain criteria, or copying files for backup take a long time when done manually and are prone to errors.

Python provides powerful tools for working with the file system. In this part, we will learn how to efficiently automate files and folders using Python's os, pathlib, shutil, and glob modules.

1. os Module Basics

The os module is Python's basic module for interacting with the operating system. It provides essential functions for file system operations.

1.1 Basic Path Operations

import os

# Get current working directory
current_dir = os.getcwd()
print(f"Current directory: {current_dir}")

# Change working directory
os.chdir("/path/to/directory")

# Access environment variables
home_dir = os.environ.get('HOME')  # Linux/Mac
user_profile = os.environ.get('USERPROFILE')  # Windows

# User home directory (cross-platform)
home = os.path.expanduser("~")
print(f"Home directory: {home}")

1.2 Path Manipulation

import os

# Join paths (uses OS-appropriate separator)
full_path = os.path.join("folder", "subfolder", "file.txt")
print(full_path)  # Windows: folder\subfolder\file.txt

# Split path
directory = os.path.dirname("/path/to/file.txt")  # /path/to
filename = os.path.basename("/path/to/file.txt")  # file.txt

# Split filename and extension
name, extension = os.path.splitext("document.pdf")
print(f"Filename: {name}, Extension: {extension}")  # document, .pdf

# Get absolute path
abs_path = os.path.abspath("relative/path/file.txt")

# Check if path exists
if os.path.exists("/path/to/check"):
    print("Path exists.")

# Check if it's a file or folder
if os.path.isfile("/path/to/file.txt"):
    print("It's a file.")

if os.path.isdir("/path/to/folder"):
    print("It's a folder.")

1.3 Directory Listing

import os

# List items in directory
items = os.listdir("/path/to/directory")
print(items)  # ['file1.txt', 'folder1', 'file2.py']

# Filter files only
files = [f for f in os.listdir(".") if os.path.isfile(f)]

# Filter folders only
folders = [f for f in os.listdir(".") if os.path.isdir(f)]

# Filter specific extension files only
python_files = [f for f in os.listdir(".") if f.endswith(".py")]

1.4 Directory Traversal (os.walk)

import os

# Traverse all subdirectories
for root, dirs, files in os.walk("/path/to/start"):
    print(f"Current directory: {root}")
    print(f"Subdirectories: {dirs}")
    print(f"Files: {files}")
    print("-" * 50)

# Find all files with specific extension
def find_files_by_extension(start_path, extension):
    """Walks start_path recursively and returns full paths of all files
    whose name ends with the given extension (e.g. ".txt")."""
    matches = []
    for dirpath, _subdirs, filenames in os.walk(start_path):
        matches.extend(
            os.path.join(dirpath, name)
            for name in filenames
            if name.endswith(extension)
        )
    return matches

# Find all .txt files
txt_files = find_files_by_extension(".", ".txt")
for f in txt_files:
    print(f)

2. Using the pathlib Module

pathlib is an object-oriented file system path library introduced in Python 3.4. It allows you to work with paths in a more intuitive and modern way than os.path.

2.1 Path Object Basics

from pathlib import Path

# Create Path object
p = Path("/path/to/file.txt")
current = Path(".")
home = Path.home()

# Join paths (using / operator)
full_path = Path.home() / "Documents" / "project" / "file.txt"
print(full_path)

# Path attributes
print(p.name)       # file.txt (filename)
print(p.stem)       # file (filename without extension)
print(p.suffix)     # .txt (extension)
print(p.parent)     # /path/to (parent directory)
print(p.parts)      # ('/', 'path', 'to', 'file.txt')

# Absolute path
abs_path = Path("relative/path").resolve()

# Check existence
if p.exists():
    print("Exists.")

if p.is_file():
    print("It's a file.")

if p.is_dir():
    print("It's a directory.")

2.2 Directory Exploration

from pathlib import Path

# All items in current directory
for item in Path(".").iterdir():
    print(item)

# Pattern matching (current directory only)
for py_file in Path(".").glob("*.py"):
    print(py_file)

# Recursive pattern matching (all subdirectories)
for py_file in Path(".").rglob("*.py"):
    print(py_file)

# Search multiple extensions
extensions = ["*.jpg", "*.png", "*.gif"]
images = []
for ext in extensions:
    images.extend(Path(".").rglob(ext))

# Filter files only
files_only = [p for p in Path(".").iterdir() if p.is_file()]

# Filter folders only
dirs_only = [p for p in Path(".").iterdir() if p.is_dir()]

2.3 Creating and Deleting Files/Folders

from pathlib import Path

# Create directory
new_dir = Path("new_folder")
new_dir.mkdir(exist_ok=True)  # No error if already exists

# Create nested directories
nested_dir = Path("parent/child/grandchild")
nested_dir.mkdir(parents=True, exist_ok=True)

# Create file (empty file)
new_file = Path("new_file.txt")
new_file.touch()

# Delete file
if new_file.exists():
    new_file.unlink()

# Delete empty directory
if new_dir.exists() and new_dir.is_dir():
    new_dir.rmdir()  # Directory must be empty

# Rename file
old_path = Path("old_name.txt")
new_path = Path("new_name.txt")
if old_path.exists():
    old_path.rename(new_path)

3. File Reading/Writing

Let's learn how to read and write files in Python. Using the with statement is recommended because it automatically closes the file as soon as the block ends, even if an exception occurs.

3.1 Text File Processing

# Write file
with open("example.txt", "w", encoding="utf-8") as f:
    f.write("First line\n")
    f.write("Second line\n")

# Append to file
with open("example.txt", "a", encoding="utf-8") as f:
    f.write("Appended line\n")

# Read file (entire content)
with open("example.txt", "r", encoding="utf-8") as f:
    content = f.read()
    print(content)

# Read file (line by line)
with open("example.txt", "r", encoding="utf-8") as f:
    for line in f:
        print(line.strip())  # Remove newline character

# Read file (as list)
with open("example.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
    print(lines)  # ['First line\n', 'Second line\n', ...]

# Write multiple lines at once
lines_to_write = ["Line 1\n", "Line 2\n", "Line 3\n"]
with open("output.txt", "w", encoding="utf-8") as f:
    f.writelines(lines_to_write)

3.2 Reading/Writing with pathlib

from pathlib import Path

# Simple file writing
Path("simple.txt").write_text("Hello, World!", encoding="utf-8")

# Simple file reading
content = Path("simple.txt").read_text(encoding="utf-8")
print(content)

# Read binary file
binary_content = Path("image.png").read_bytes()

# Write binary file
Path("copy.png").write_bytes(binary_content)

3.3 CSV File Processing

import csv

# Write CSV
data = [
    ["Name", "Age", "City"],
    ["John", 30, "New York"],
    ["Jane", 25, "Los Angeles"],
    ["Bob", 28, "Chicago"]
]

with open("data.csv", "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerows(data)

# Read CSV
with open("data.csv", "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)

# Work with CSV as dictionaries
with open("data.csv", "r", encoding="utf-8-sig") as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(f"{row['Name']} is {row['Age']} years old.")

3.4 JSON File Processing

import json

# Write JSON
data = {
    "name": "John Doe",
    "age": 30,
    "skills": ["Python", "JavaScript", "SQL"],
    "address": {
        "city": "New York",
        "district": "Manhattan"
    }
}

with open("data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# Read JSON
with open("data.json", "r", encoding="utf-8") as f:
    loaded_data = json.load(f)
    print(loaded_data["name"])
    print(loaded_data["skills"])

4. Creating/Deleting/Moving Folders

4.1 Creating Folders

import os
from pathlib import Path

# Create folder with os
os.makedirs("parent/child/grandchild", exist_ok=True)

# Create folder with pathlib
Path("another/nested/folder").mkdir(parents=True, exist_ok=True)

# Create date-based folders
from datetime import datetime

today = datetime.now().strftime("%Y-%m-%d")
daily_folder = Path("backups") / today
daily_folder.mkdir(parents=True, exist_ok=True)

4.2 Deleting Folders

import os
import shutil
from pathlib import Path

# Delete empty folder
os.rmdir("empty_folder")
# or
Path("empty_folder").rmdir()

# Delete folder with contents (Caution: Cannot be recovered!)
shutil.rmtree("folder_with_contents")

# Safe delete (confirm before deleting)
def safe_delete_folder(folder_path):
    """Safely deletes a folder.

    Shows what would be removed and asks for confirmation before
    calling shutil.rmtree. Returns True only if the folder was deleted.
    """
    target = Path(folder_path)
    if not target.exists():
        print(f"'{folder_path}' does not exist.")
        return False

    # Summarize the contents so the user knows what is at stake.
    contents = list(target.rglob("*"))
    n_files = sum(entry.is_file() for entry in contents)
    n_dirs = sum(entry.is_dir() for entry in contents)

    print(f"Folder to delete: {folder_path}")
    print(f"Files included: {n_files}")
    print(f"Folders included: {n_dirs}")

    # Only an explicit "yes" triggers the irreversible delete.
    if input("Are you sure you want to delete? (yes/no): ").lower() == "yes":
        shutil.rmtree(target)
        print("Delete complete!")
        return True

    print("Cancelled.")
    return False

5. File Search (glob)

The glob module searches for files using Unix shell-style pattern matching.

5.1 Basic glob Usage

import glob

# All .txt files in current directory
txt_files = glob.glob("*.txt")

# All .py files in specific folder
py_files = glob.glob("src/*.py")

# Recursive search (all subdirectories)
all_py_files = glob.glob("**/*.py", recursive=True)

# Search multiple extensions
import itertools
extensions = ["*.jpg", "*.png", "*.gif"]
# Prepend "**/" directly to each pattern so the search recurses, then
# flatten the per-extension result lists into a single list.
images = list(itertools.chain.from_iterable(
    glob.glob(f"**/{ext}", recursive=True) for ext in extensions
))

# Pattern matching examples
# ? : Single character
# * : Any characters (0 or more)
# [abc] : One of a, b, c
# [0-9] : Digit

files = glob.glob("file?.txt")      # file1.txt, fileA.txt
files = glob.glob("data[0-9].csv")  # data0.csv ~ data9.csv
files = glob.glob("[!_]*.py")       # .py files not starting with underscore

5.2 pathlib's glob

from pathlib import Path

# Search in current directory
for txt_file in Path(".").glob("*.txt"):
    print(txt_file)

# Recursive search
for py_file in Path(".").rglob("*.py"):
    print(py_file)

# Complex patterns
for file in Path("data").glob("**/report_*.xlsx"):
    print(file)

6. File Copy/Move (shutil)

The shutil module provides high-level operations (copy, move, delete) for files and folders.

6.1 Copying Files

import shutil

# Copy file (without metadata)
shutil.copy("source.txt", "destination.txt")

# Copy file (with metadata)
shutil.copy2("source.txt", "destination.txt")

# Copy to folder (keeps original filename)
shutil.copy("source.txt", "backup_folder/")

# Copy entire folder
shutil.copytree("source_folder", "destination_folder")

# Copy excluding certain files
def ignore_patterns(directory, files):
    """Ignores files matching certain patterns.

    Called by shutil.copytree with each directory and its entries;
    returns the entries that should NOT be copied (.pyc and dotfiles).
    """
    skipped = []
    for name in files:
        if name.endswith('.pyc') or name.startswith('.'):
            skipped.append(name)
    return skipped

shutil.copytree("source", "dest", ignore=ignore_patterns)

# Or use built-in function
shutil.copytree("source", "dest",
                ignore=shutil.ignore_patterns('*.pyc', '*.tmp', '__pycache__'))

6.2 Moving Files

import shutil

# Move file
shutil.move("source.txt", "new_location/source.txt")

# Rename file (move within same folder)
shutil.move("old_name.txt", "new_name.txt")

# Move folder
shutil.move("source_folder", "new_location/")

# Safe move function
from pathlib import Path

def safe_move(source, destination):
    """Safely moves a file.

    Resolves a directory destination to destination/filename and asks
    before overwriting an existing target. Returns True on success.
    """
    src = Path(source)
    dst = Path(destination)

    if not src.exists():
        print(f"Source '{source}' does not exist.")
        return False

    # If destination is a folder, keep original filename
    if dst.is_dir():
        dst = dst / src.name

    # Confirm if destination already exists
    if dst.exists():
        answer = input(f"'{dst}' already exists. Overwrite? (y/n): ")
        if answer.lower() != 'y':
            print("Cancelled.")
            return False

    shutil.move(str(src), str(dst))
    print(f"'{source}' -> '{dst}' move complete")
    return True

7. Batch File Renaming

Batch renaming files is a typical use case for automation.

7.1 Basic File Renaming

import os
from pathlib import Path

# Add prefix
def add_prefix(folder, prefix):
    """Adds a prefix to all files in the folder.

    Args:
        folder: Path of the folder whose files are renamed.
        prefix: String prepended to each filename.
    """
    folder_path = Path(folder)
    # Snapshot the listing first: renaming entries while iterating
    # iterdir() can skip files or visit a renamed file a second time.
    for file in list(folder_path.iterdir()):
        if file.is_file():
            new_name = folder_path / f"{prefix}{file.name}"
            file.rename(new_name)
            print(f"'{file.name}' -> '{new_name.name}'")

# Add suffix
def add_suffix(folder, suffix):
    """Adds a suffix to filenames (before extension).

    Args:
        folder: Path of the folder whose files are renamed.
        suffix: String inserted between the stem and the extension.
    """
    folder_path = Path(folder)
    # Snapshot the listing first: renaming entries while iterating
    # iterdir() can skip files or visit a renamed file a second time.
    for file in list(folder_path.iterdir()):
        if file.is_file():
            new_name = folder_path / f"{file.stem}{suffix}{file.suffix}"
            file.rename(new_name)
            print(f"'{file.name}' -> '{new_name.name}'")

7.2 Rename with Sequence Numbers

from pathlib import Path

def rename_with_sequence(folder, base_name, start=1, padding=3):
    """Renames files with sequence numbers.

    Example: image_001.jpg, image_002.jpg, ...

    Args:
        folder: Folder whose files are renamed (sorted by name).
        base_name: Prefix used for every new filename.
        start: First sequence number (default 1).
        padding: Zero-padding width of the number (default 3).
    """
    folder_path = Path(folder)
    files = sorted(f for f in folder_path.iterdir() if f.is_file())

    for i, file in enumerate(files, start=start):
        # Format sequence number (001, 002, ...)
        sequence = str(i).zfill(padding)
        new_name = folder_path / f"{base_name}_{sequence}{file.suffix}"

        if new_name == file:
            continue  # already has the target name
        if new_name.exists():
            # Renaming onto an existing file would silently overwrite it
            # on POSIX (or raise on Windows) — skip and report instead.
            print(f"Skipped '{file.name}': '{new_name.name}' already exists")
            continue

        file.rename(new_name)
        print(f"'{file.name}' -> '{new_name.name}'")

7.3 Date-based File Renaming

from pathlib import Path
from datetime import datetime
import os

def rename_with_date(folder, include_time=False):
    """Renames files based on their modification date.

    Args:
        folder: Folder whose files are renamed.
        include_time: If True, include HHMMSS in the date prefix.
    """
    folder_path = Path(folder)

    # Snapshot first: renaming while iterating iterdir() can revisit a
    # renamed file and prepend the date prefix a second time.
    for file in [p for p in folder_path.iterdir() if p.is_file()]:
        # Get file modification time
        date_obj = datetime.fromtimestamp(file.stat().st_mtime)

        if include_time:
            date_str = date_obj.strftime("%Y%m%d_%H%M%S")
        else:
            date_str = date_obj.strftime("%Y%m%d")

        new_name = folder_path / f"{date_str}_{file.name}"

        # Prevent duplicates
        counter = 1
        while new_name.exists():
            new_name = folder_path / f"{date_str}_{counter}_{file.name}"
            counter += 1

        file.rename(new_name)
        print(f"'{file.name}' -> '{new_name.name}'")

7.4 Rename with Regular Expressions

import re
from pathlib import Path

def rename_with_regex(folder, pattern, replacement):
    """Renames files using regular expressions.

    Args:
        folder: Folder whose files are renamed.
        pattern: Regex applied to the filename stem (extension untouched).
        replacement: Replacement string for each match.
    """
    folder_path = Path(folder)
    regex = re.compile(pattern)  # compile once, reuse for every file

    # Snapshot first: renaming entries while iterating iterdir() can
    # skip files or visit a renamed file a second time.
    for file in list(folder_path.iterdir()):
        if not file.is_file():
            continue
        new_stem = regex.sub(replacement, file.stem)
        if new_stem == file.stem:
            continue
        new_path = folder_path / f"{new_stem}{file.suffix}"
        if new_path.exists():
            # Don't clobber an existing file with the rewritten name.
            print(f"Skipped '{file.name}': '{new_path.name}' already exists")
            continue
        file.rename(new_path)
        print(f"'{file.name}' -> '{new_path.name}'")

# Usage examples
# Replace spaces with underscores
rename_with_regex(".", r"\s+", "_")

# Remove special characters
rename_with_regex(".", r"[^\w\-_.]", "")

8. Finding Duplicate Files

Let's write a script to find and organize duplicate files to save hard disk space.

import hashlib
from pathlib import Path
from collections import defaultdict

def get_file_hash(filepath, chunk_size=8192):
    """Calculates the MD5 hash of a file.

    Reads the file in chunk_size pieces so large files don't need to
    fit in memory. Returns the hex digest string.
    """
    digest = hashlib.md5()
    with open(filepath, 'rb') as stream:
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

def find_duplicates(folder):
    """Finds duplicate files in the folder.

    Returns a dict mapping a content hash to the list of files sharing
    that content (only groups with 2+ members).
    """
    folder_path = Path(folder)

    # Step 1: Group by file size (cheap) to avoid hashing unique files.
    by_size = defaultdict(list)
    for entry in folder_path.rglob("*"):
        if entry.is_file():
            by_size[entry.stat().st_size].append(entry)

    # Step 2: Compare hashes only for files with same size.
    by_hash = defaultdict(list)
    for candidates in by_size.values():
        if len(candidates) < 2:
            continue
        for entry in candidates:
            by_hash[get_file_hash(entry)].append(entry)

    # Step 3: Keep only hashes shared by more than one file.
    return {h: group for h, group in by_hash.items() if len(group) > 1}

def report_duplicates(folder):
    """Prints a duplicate file report.

    Lists each duplicate group found by find_duplicates() and totals
    the wasted space (size of every redundant copy).
    """
    duplicates = find_duplicates(folder)

    if not duplicates:
        print("No duplicate files found.")
        return

    divider = "=" * 60
    total_wasted = 0
    print(divider)
    print("Duplicate File Report")
    print(divider)

    for hash_value, files in duplicates.items():
        size = files[0].stat().st_size
        # Every copy beyond the first is wasted space.
        total_wasted += size * (len(files) - 1)

        print(f"\nHash: {hash_value[:16]}...")
        print(f"File size: {size:,} bytes")
        print(f"Duplicate count: {len(files)}")
        print("File list:")
        for path in files:
            print(f"  - {path}")

    print("\n" + divider)
    print(f"Total wasted space: {total_wasted:,} bytes ({total_wasted / 1024 / 1024:.2f} MB)")
    print(divider)

def delete_duplicates(folder, keep='first'):
    """Deletes duplicate files.

    Args:
        folder: Target folder
        keep: 'first' (keep first) or 'newest' (keep newest file)
    """
    duplicates = find_duplicates(folder)

    deleted_count = 0
    freed_space = 0

    for group in duplicates.values():
        if keep == 'newest':
            # Sort by modification time (newest first)
            group = sorted(group, key=lambda f: f.stat().st_mtime, reverse=True)

        # Keep first file, delete the remaining copies.
        for victim in group[1:]:
            size = victim.stat().st_size
            print(f"Deleted: {victim}")
            victim.unlink()
            deleted_count += 1
            freed_space += size

    print(f"\nDeleted files: {deleted_count}")
    print(f"Freed space: {freed_space:,} bytes ({freed_space / 1024 / 1024:.2f} MB)")

9. File Organization Automation Project

Let's create a practical file organization automation project combining everything we've learned.

"""
Advanced File Organization Automation Script
- Organize by extension
- Organize by date
- Organize by size
- Handle duplicate files
- Logging support
"""

import os
import shutil
import hashlib
import logging
from pathlib import Path
from datetime import datetime
from collections import defaultdict

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('file_organizer.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Extension to category mapping
CATEGORIES = {
    'Images': ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', '.ico'],
    'Documents': ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.txt', '.rtf', '.odt'],
    'Videos': ['.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm'],
    'Music': ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.wma', '.m4a'],
    'Archives': ['.zip', '.rar', '.7z', '.tar', '.gz', '.bz2'],
    'Programs': ['.exe', '.msi', '.dmg', '.deb', '.rpm', '.apk'],
    'Code': ['.py', '.js', '.html', '.css', '.java', '.cpp', '.c', '.h', '.json', '.xml'],
    'Data': ['.csv', '.sql', '.db', '.sqlite']
}

def get_category(extension):
    """Returns the category for the given extension.

    Matching is case-insensitive; unknown extensions map to 'Others'.
    """
    ext = extension.lower()
    return next(
        (name for name, extensions in CATEGORIES.items() if ext in extensions),
        'Others',
    )

def get_file_hash(filepath):
    """Calculates the MD5 hash of a file.

    Streams the file in 8 KiB chunks and returns the hex digest.
    """
    digest = hashlib.md5()
    with open(filepath, 'rb') as stream:
        while block := stream.read(8192):
            digest.update(block)
    return digest.hexdigest()

class FileOrganizer:
    """Moves a folder's top-level files into categorized subfolders.

    Supports organizing by extension category or by modification date.
    Collects counters in ``self.stats`` and logs every action via the
    module-level ``logger``.
    """

    def __init__(self, source_folder, dest_folder=None):
        """
        Args:
            source_folder: Folder whose top-level files are organized.
            dest_folder: Where the category/date folders are created;
                defaults to the source folder itself.
        """
        self.source = Path(source_folder)
        self.dest = Path(dest_folder) if dest_folder else self.source
        # Counters reported by _print_stats() after each run.
        self.stats = {
            'moved': 0,
            'skipped': 0,
            'duplicates': 0,
            'errors': 0
        }
        self.hash_dict = defaultdict(list)

    def organize_by_extension(self):
        """Organizes files by extension."""
        logger.info(f"Starting organization by extension: {self.source}")

        # Snapshot the listing: moving files (and creating category
        # folders inside the source when dest == source) while iterating
        # iterdir() can skip entries or revisit them.
        for file in list(self.source.iterdir()):
            if not file.is_file():
                continue
            category = get_category(file.suffix)
            target_folder = self.dest / category
            # parents=True so a dest folder whose parents don't exist
            # yet is created instead of raising FileNotFoundError.
            target_folder.mkdir(parents=True, exist_ok=True)

            target_path = target_folder / file.name

            # Handle duplicate filenames
            if target_path.exists():
                target_path = self._get_unique_path(target_path)

            try:
                shutil.move(str(file), str(target_path))
                logger.info(f"Moved: {file.name} -> {category}/")
                self.stats['moved'] += 1
            except Exception as e:
                logger.error(f"Error: {file.name} - {e}")
                self.stats['errors'] += 1

        self._print_stats()

    def organize_by_date(self, date_format="%Y-%m"):
        """Organizes files by modification date.

        Args:
            date_format: strftime pattern for the folder names
                (default "%Y-%m", i.e. one folder per month).
        """
        logger.info(f"Starting organization by date: {self.source}")

        for file in list(self.source.iterdir()):  # snapshot, see above
            if not file.is_file():
                continue
            mtime = datetime.fromtimestamp(file.stat().st_mtime)
            date_folder = mtime.strftime(date_format)

            target_folder = self.dest / date_folder
            target_folder.mkdir(parents=True, exist_ok=True)

            target_path = target_folder / file.name

            if target_path.exists():
                target_path = self._get_unique_path(target_path)

            try:
                shutil.move(str(file), str(target_path))
                logger.info(f"Moved: {file.name} -> {date_folder}/")
                self.stats['moved'] += 1
            except Exception as e:
                logger.error(f"Error: {file.name} - {e}")
                self.stats['errors'] += 1

        self._print_stats()

    def _get_unique_path(self, path):
        """Returns a non-duplicate file path by appending _1, _2, ..."""
        counter = 1
        new_path = path
        while new_path.exists():
            new_path = path.parent / f"{path.stem}_{counter}{path.suffix}"
            counter += 1
        return new_path

    def _print_stats(self):
        """Logs the counters accumulated during the run."""
        logger.info("=" * 50)
        logger.info("Processing Results:")
        logger.info(f"  Files moved: {self.stats['moved']}")
        logger.info(f"  Files skipped: {self.stats['skipped']}")
        logger.info(f"  Duplicates: {self.stats['duplicates']}")
        logger.info(f"  Errors: {self.stats['errors']}")
        logger.info("=" * 50)


def main():
    """Main execution function.

    Prompts for a folder (defaulting to ~/Downloads) and an
    organization mode, then runs the matching FileOrganizer method.
    """
    print("=" * 60)
    print("Advanced File Organization Automation")
    print("=" * 60)

    source = input("Enter folder path to organize: ").strip()
    if not source:
        source = str(Path.home() / "Downloads")
        print(f"Using default path: {source}")

    # Only list choices that are actually implemented. The previous menu
    # advertised size/duplicate/all modes (3-5) that fell through to
    # "Invalid selection" because FileOrganizer has no such methods.
    print("\nSelect organization method:")
    print("1. Organize by extension")
    print("2. Organize by date")

    choice = input("\nSelect (1-2): ").strip()

    organizer = FileOrganizer(source)

    if choice == '1':
        organizer.organize_by_extension()
    elif choice == '2':
        organizer.organize_by_date()
    else:
        print("Invalid selection.")


if __name__ == "__main__":
    main()

10. Conclusion and Preview of Next Part

In this part, we learned the core techniques for file and folder automation using Python:

  • Basic file system operations using the os module
  • Object-oriented path handling with pathlib
  • File reading/writing (text, CSV, JSON)
  • Creating, deleting, and moving folders
  • File searching with glob
  • File copying/moving with shutil
  • Batch file renaming techniques
  • Finding and handling duplicate files
  • Comprehensive file organization automation project

This knowledge will serve as the foundation for various automation projects in the future. When you can freely work with the file system, endless possibilities open up including backup automation, log analysis, and data preprocessing.

Next Part Preview: In Python Automation Master Part 3, we will cover web scraping automation. We'll learn how to automatically collect data from the web using requests, BeautifulSoup, and Selenium!