<#
Copy 7z.exe and 7z.dll into the same folder as this script.
Requires iTextSharp.dll in the same folder as this script - can be extracted from
itextsharp.5.5.13.1.nupkg by changing extension to .zip - package can be downloaded from
https://github.com/itext/itextsharp/releases
#>
# ---- User configuration ----
# Source folder: holds the archives (.zip, .rar, .7z) and PDFs to be processed.
# Archives are extracted in place and PDFs are searched for recursively below it.
$folder1 = 'G:\test\folder1'
# Destination folder for text based PDFs (sub-folder layout of $folder1 is kept)
$folder2 = 'G:\test\Text'
# Destination folder for non-text (image based) PDFs (sub-folder layout of $folder1 is kept)
$folder3 = 'G:\test\Image'
# Threshold used to decide whether a PDF is text based or image based: the minimum
# number of lines of text that must be detected before the PDF is considered text based.
# The count is taken across the first 5 pages (or across all pages if the PDF has
# fewer than 5).
$TextPDFThreshold = 20
# Delete archives once they have been extracted: $true or $false
# Archives that fail extraction won't be deleted regardless of this setting.
$delArc = $false
<# ------------------- #>
# Load the iTextSharp assembly used for PDF text extraction.
# The DLL is expected next to this script, so look there first; a bare '.\' path is
# resolved against the current location, which is not necessarily the script folder.
# Fall back to the current location when the script folder is unknown (e.g. the code
# was pasted into a console) or does not contain the DLL.
$iTextSharpPath = '.\itextsharp.dll'
if ($PSScriptRoot -and (Test-Path -LiteralPath (Join-Path $PSScriptRoot 'itextsharp.dll'))) {
    $iTextSharpPath = Join-Path $PSScriptRoot 'itextsharp.dll'
}
Add-Type -Path $iTextSharpPath
Function Delete-Dupes {
    <#
    .SYNOPSIS
    Removes duplicate archives and PDFs from the top level of $folder1.
    .DESCRIPTION
    Hashes every .zip, .rar, .7z and .pdf file located directly in $folder1 (not in
    sub-folders), groups the files by hash and, for every group with more than one
    member, keeps the first file and force-deletes the rest.
    Reads the script-level variable $folder1. Takes no parameters.
    #>
    Get-ChildItem "$($folder1)\*" -File -Include *.zip,*.rar,*.7z,*.pdf |
        Get-FileHash |
        Group-Object -Property Hash |
        Where-Object { $_.Count -gt 1 } |
        # Skip the first member of each group so one copy always survives.
        ForEach-Object { $_.Group | Select-Object -Skip 1 } |
        # Delete by literal path: file names containing [ or ] must not be
        # interpreted as wildcard patterns.
        ForEach-Object { Remove-Item -LiteralPath $_.Path -Force }
}
Function Extract-Items {
    <#
    .SYNOPSIS
    Extracts the .zip, .rar and .7z archives found under $folder1 with 7-Zip.
    .DESCRIPTION
    Runs two passes. The first pass handles archives located directly in $folder1; the
    second pass searches $folder1 recursively, which picks up archives that were
    unpacked by the first pass. Every archive is extracted into a folder that sits next
    to it and carries the archive's name without extension. Existing files are
    overwritten (7-Zip switch -y).
    When $delArc is $true an archive is deleted after it was extracted successfully;
    archives that fail extraction are always kept.
    Reads the script-level variables $folder1 and $delArc. Takes no parameters.
    #>

    # Prefer the 7z.exe that sits next to this script; fall back to the current
    # location when the script folder is unknown or does not contain it.
    $sevenZip = '.\7z.exe'
    if ($PSScriptRoot -and (Test-Path -LiteralPath (Join-Path $PSScriptRoot '7z.exe'))) {
        $sevenZip = Join-Path $PSScriptRoot '7z.exe'
    }

    # Full paths of archives already handled, so the recursive pass does not extract
    # the top-level archives a second time when they are not deleted.
    $processed = @{}

    for ($pass = 0; $pass -lt 2; $pass++) {
        if ($pass -eq 0) {
            $files = @(Get-ChildItem "$($folder1)\*" -File -Include *.zip,*.rar,*.7z)
        } else {
            $files = @(Get-ChildItem "$($folder1)" -File -Include *.zip,*.rar,*.7z -Recurse)
        }

        foreach ($archive in $files) {
            if ($processed.ContainsKey($archive.FullName)) {
                continue
            }
            $processed[$archive.FullName] = $true

            # Use the FileInfo properties rather than the object's string form, which is
            # not guaranteed to be a full path.
            $tempdest = Join-Path $archive.DirectoryName $archive.BaseName
            & $sevenZip 'x' '-y' $archive.FullName "-o$tempdest" | Out-Null

            # $? must be read immediately after the command. It also guards against a
            # stale $LASTEXITCODE when 7z.exe could not be started at all.
            $extracted = $? -and ($LASTEXITCODE -eq 0)
            if ($extracted -and $delArc) {
                Remove-Item -LiteralPath $archive.FullName -Force
            }
        }
    }
}
Function Delete-SmallPDF {
    <#
    .SYNOPSIS
    Optionally deletes very small PDFs, then removes folders that contain no files.
    .DESCRIPTION
    When -delSmall is $true, every PDF file below $folder1 that is smaller than 2048
    bytes is force-deleted. Afterwards, regardless of -delSmall, every folder below
    $folder1, $folder2 and $folder3 that holds no files at any depth is removed
    together with its (empty) sub-folders.
    Reads the script-level variables $folder1, $folder2 and $folder3.
    .PARAMETER delSmall
    $true to delete PDFs smaller than 2048 bytes before the empty-folder clean-up.
    #>
    param (
        [bool]$delSmall
    )

    if ($delSmall) {
        # -File keeps folders out of the size test: a folder has no Length, and
        # $null -lt 2048 evaluates to $true.
        Get-ChildItem "$($folder1)\*" -File -Include *.pdf -Recurse |
            Where-Object { $_.Length -lt 2048 } |
            ForEach-Object { Remove-Item -LiteralPath $_.FullName -Force }
    }

    foreach ($root in @($folder1, $folder2, $folder3)) {
        # A destination folder may not exist yet (nothing has been moved into it).
        if (-not (Test-Path -LiteralPath $root)) {
            continue
        }

        # Snapshot the folder list first so deleting does not disturb the enumeration.
        $directories = @(Get-ChildItem -LiteralPath $root -Recurse | Where-Object { $_.PSIsContainer })

        foreach ($directory in $directories) {
            # Already gone because an empty parent was removed recursively.
            if (-not (Test-Path -LiteralPath $directory.FullName)) {
                continue
            }

            $filesBelow = @(Get-ChildItem -LiteralPath $directory.FullName -Recurse |
                Where-Object { -not $_.PSIsContainer })
            if ($filesBelow.Length -eq 0) {
                Remove-Item -LiteralPath $directory.FullName -Recurse
            }
        }
    }
}
Function Fix-FileNames {
    <#
    .SYNOPSIS
    Renames the PDFs below $folder1 to sanitised, length-limited file names.
    .DESCRIPTION
    For every PDF file found recursively below $folder1 the base name is passed through
    Convert-ToLatinCharacters and any remaining [ or ] is replaced by an underscore.
    If folder + '\' + name + '.pdf' would be longer than 251 characters the name is cut
    to at most (251 - folder path length) characters. A file is renamed only when the
    resulting name differs from its current name; if the target name is already taken a
    numeric suffix _0001, _0002, ... is appended until a free name is found.
    Files whose folder path is 248 characters or longer are left untouched.
    Reads the script-level variable $folder1. Takes no parameters.
    #>
    $files = @(Get-ChildItem "$($folder1)\*" -File -Include *.pdf -Recurse)

    foreach ($file in $files) {
        $directory = $file.DirectoryName
        $originalName = $file.BaseName
        $pathLength = $directory.Length

        # The folder path alone is too long to fix by renaming: skip this file only and
        # keep processing the remaining ones.
        if ($pathLength -ge 248) {
            continue
        }

        $noSpecialChars = (Convert-ToLatinCharacters "$($originalName)") -replace '[\[\]]', '_'

        # Folder + separator + name + ".pdf"
        $totalLength = $pathLength + 1 + $noSpecialChars.Length + 4
        if ($totalLength -gt 251) {
            # Never ask Substring for more characters than the name has; a name just
            # below the budget would otherwise raise ArgumentOutOfRangeException.
            $keep = [Math]::Min($noSpecialChars.Length, 251 - $pathLength)
            $fn1 = $noSpecialChars.Substring(0, $keep)
        } else {
            $fn1 = $noSpecialChars
        }

        # Nothing to do when the sanitised name equals the current one.
        if ($fn1 -eq $originalName) {
            continue
        }

        $newLeaf = "$($fn1).pdf"
        $counter = 0
        # -LiteralPath: the folder part may still contain wildcard characters.
        while (Test-Path -LiteralPath (Join-Path $directory $newLeaf)) {
            $counter++
            $newLeaf = "$($fn1)_$('{0:0000}' -f $counter).pdf"
        }

        Rename-Item -LiteralPath $file.FullName -NewName $newLeaf
    }
}
Function Convert-ToLatinCharacters {
    <#
    .SYNOPSIS
    Returns a copy of the input string reduced to a restricted character set.
    .DESCRIPTION
    The string is encoded with the "Cyrillic" encoding and the resulting bytes are
    decoded as ASCII (the diacritics-removal technique described in the first link
    below). From that text every '/' and ';' is removed, together with every character
    that is not a letter, a decimal digit, '(', ')', '_', a space, '[' or ']'.
    .PARAMETER inputString
    The text to convert.
    .LINK
    https://lazywinadmin.com/2015/05/powershell-remove-diacritics-accents.html
    .LINK
    https://lazywinadmin.com/2015/08/powershell-remove-special-characters.html
    #>
    param (
        [string]$inputString
    )

    $encodedBytes = [Text.Encoding]::GetEncoding("Cyrillic").GetBytes($inputString)
    $asciiText = [Text.Encoding]::ASCII.GetString($encodedBytes)

    # Strip everything outside the allowed character set.
    $asciiText -replace '[/;]|[^\p{L}\p{Nd}/(/)/_/ \[\]]', ''
}
Function Check-PDF {
    <#
    .SYNOPSIS
    Sorts the PDFs below $folder1 into $folder2 (text based) or $folder3 (image based).
    .DESCRIPTION
    For every PDF file found recursively below $folder1 whose full path is shorter than
    260 characters, the text of the first five pages (or of all pages if there are
    fewer) is extracted with iTextSharp and split on line feeds. If the number of
    resulting lines is at least $TextPDFThreshold the file is moved to $folder2,
    otherwise to $folder3. The path relative to $folder1 is preserved and missing
    destination folders are created.
    A PDF that cannot be opened is reported with a warning and left where it is. A page
    whose text cannot be extracted is reported and contributes no lines.
    Reads the script-level variables $folder1, $folder2, $folder3 and $TextPDFThreshold.
    Takes no parameters.
    .LINK
    https://superuser.com/questions/1278479/search-pdf-contents-with-powershell-and-output-a-file-list/1278521#1278521
    #>
    $files = @(Get-ChildItem "$($folder1)\*" -File -Include *.pdf -Recurse)

    foreach ($file in $files) {
        if ($file.FullName.Length -ge 260) {
            continue
        }

        Write-Host "Processing - $($file.FullName) ..."

        $linesOfText = 0
        $reader = $null
        try {
            $reader = New-Object iTextSharp.text.pdf.pdfreader -ArgumentList $file.FullName

            # Only the first five pages are sampled.
            $lastPage = [Math]::Min(5, $reader.NumberOfPages)
            for ($page = 1; $page -le $lastPage; $page++) {
                try {
                    $pageText = [iTextSharp.text.pdf.parser.PdfTextExtractor]::GetTextFromPage($reader, $page).Split([char]0x000A)
                    $linesOfText += $pageText.Count
                } catch {
                    # Do not let a bad page re-use the previous page's line count.
                    Write-Warning "Could not read page $page of $($file.FullName): $($_.Exception.Message)"
                }
            }
        } catch {
            Write-Warning "Could not open $($file.FullName): $($_.Exception.Message)"
            continue
        } finally {
            # Always release the file handle, otherwise the move below fails.
            if ($null -ne $reader) {
                $reader.Close()
            }
        }

        if ($linesOfText -ge $TextPDFThreshold) {
            $targetRoot = $folder2
        } else {
            $targetRoot = $folder3
        }

        # Path of the file relative to $folder1, without a leading separator.
        $relativePath = $file.FullName.Substring($folder1.Length).TrimStart('\')
        $outfile = Join-Path $targetRoot $relativePath
        $outDir = Split-Path -Path $outfile

        if (-not (Test-Path -LiteralPath $outDir)) {
            New-Item -Path $outDir -ItemType Directory | Out-Null
        }
        # -LiteralPath: folder names may contain wildcard characters such as [ and ].
        Move-Item -LiteralPath $file.FullName -Destination $outfile
    }
}
# Run the processing stages in order, announcing each stage before it starts.
$stages = @(
    @{ Banner = 'Removing duplicate archives ...';                 Run = { Delete-Dupes } },
    @{ Banner = 'Extracting archives ...';                         Run = { Extract-Items } },
    @{ Banner = 'Deleting small PDFs and empty folders ...';       Run = { Delete-SmallPDF $true } },
    @{ Banner = 'Removing diacritics, etc, and fix long paths ...'; Run = { Fix-FileNames } },
    @{ Banner = 'Testing for text PDFs ...';                       Run = { Check-PDF } },
    @{ Banner = 'Deleting empty folders ...';                      Run = { Delete-SmallPDF $false } }
)
foreach ($stage in $stages) {
    Write-Host $stage.Banner
    & $stage.Run
}
Write-Host 'Finished ...'