Dyota's blog

PowerShell: text from epubs

I recently discovered that .epub files are actually .zip archives with HTML files inside! This means that they can be opened and manipulated with PowerShell.

This script rips all of the chapters in a .epub file into a single HTML file.

using namespace System.Text

# folder location where the .epub lives
$root = ''

# the title of the .epub (no extension)
$bookTitle = ''

# stop right away with a clear message if the two settings above were left blank;
# otherwise every later step fails with a less obvious path error
if (-not $root -or -not $bookTitle) {
    throw 'Set $root and $bookTitle at the top of the script before running it.'
}

# unzip the .epub into a folder of the same name next to it
#
# The .NET ZipFile class is used instead of Expand-Archive because the
# Expand-Archive shipped with Windows PowerShell 5.1 refuses any source file
# whose extension is not ".zip". Add-Type makes the class available on 5.1 and
# is harmless on newer versions.
Add-Type -AssemblyName System.IO.Compression.FileSystem

Get-ChildItem -LiteralPath $root -File |
    Where-Object {
        $_.Extension -eq '.epub' -and $_.BaseName -eq $bookTitle
    } |
    ForEach-Object {
        # build the destination from the file's own absolute directory, because
        # .NET methods resolve relative paths against the process directory,
        # not the current PowerShell location
        $dest = Join-Path $_.DirectoryName $_.BaseName

        # -LiteralPath: book titles often contain [ ], which Test-Path would
        # otherwise read as wildcards; skip extraction if already done
        if (-not (Test-Path -LiteralPath $dest)) {
            [System.IO.Compression.ZipFile]::ExtractToDirectory($_.FullName, $dest)
        }
    }

# META-INF\container.xml sits at a fixed place inside every extracted .epub and
# tells you where the package (.opf) file is.
# -Raw reads the file as a single string; -Encoding UTF8 stops Windows
# PowerShell 5.1 from decoding BOM-less UTF-8 as ANSI; -LiteralPath keeps
# [ ] in book titles from being read as wildcards.
$container = [xml](Get-Content -LiteralPath "$root\$bookTitle\META-INF\container.xml" -Raw -Encoding UTF8)

# a container may list several <rootfile> entries; take the first one so the
# result is always a single path rather than an array
$contentFilePath = @($container.container.rootfiles.rootfile)[0].'full-path'.Replace('/', '\')

# $contentDir is the full path of the package (.opf) FILE itself;
# $contentSubdir is the absolute folder that contains it, which is the folder
# the chapter paths in the manifest are relative to
$contentDir = "$root\$bookTitle\$contentFilePath"
$contentSubdir = (Get-Item -LiteralPath $contentDir).DirectoryName

$content = [xml](Get-Content -LiteralPath $contentDir -Raw -Encoding UTF8)

# this has the "spine" of the .epub and dictates what order the contents go in
$spine = $content.package.spine.itemref.idref

# this manifest contains all of the parts and which files they refer to
$manifest = $content.package.manifest

# index the manifest once (id -> href) instead of rescanning it for every spine entry
$hrefById = @{}
foreach ($item in $manifest.item) {
    if ($item.id) {
        $hrefById[$item.id] = $item.href
    }
}

# StringBuilder to collect all of the text into one
$sb = [StringBuilder]::new()

# for every part in the spine, pull out the text and append it to the StringBuilder object
foreach ($id in $spine) {
    # a spine entry with no matching manifest item contributes nothing
    if (-not $hrefById.ContainsKey($id)) {
        continue
    }

    # manifest hrefs are URL-style references: undo percent-encoding
    # (e.g. "%20" -> " ") and switch to Windows path separators
    $relativePath = [uri]::UnescapeDataString($hrefById[$id]).Replace('/', '\')
    $partFilePath = "$contentSubdir\$relativePath"

    if (-not (Test-Path -LiteralPath $partFilePath)) {
        Write-Warning "Skipping '$id': file not found at $partFilePath"
        continue
    }

    # -Raw keeps the file as one string with its line breaks intact; without it
    # the array of lines is joined with spaces when passed to AppendLine.
    # -Encoding UTF8 avoids mojibake on Windows PowerShell 5.1, whose default
    # for BOM-less files is the ANSI code page.
    [void] $sb.AppendLine((Get-Content -LiteralPath $partFilePath -Raw -Encoding UTF8))
}

# write the combined text into one html file
$outFile = "$contentSubdir\combined.html"

# Write UTF-8 with a BOM explicitly rather than using ">":
#  - ">" produces UTF-16LE on Windows PowerShell 5.1 and BOM-less UTF-8 on
#    PowerShell 7, so the result depended on which version ran the script
#  - ">" treats the target as a wildcard pattern and fails on paths with [ ]
# The BOM lets browsers detect the encoding of the combined file reliably.
[System.IO.File]::WriteAllText($outFile, $sb.ToString(), [System.Text.UTF8Encoding]::new($true))

# create "<bookTitle>.lnk" in $root that points at the combined html file,
# using the Windows Script Host shell COM object
$shell = New-Object -ComObject WScript.Shell
$link = $shell.CreateShortcut("$root\$bookTitle.lnk")
$link.TargetPath = $outFile
$link.Save()

#epub #powershell