Applescript to find a text string in a PDF
Hello,
I'm trying to find an Applescript to find a text string in a PDF - if it finds the string it succeeds, if not it fails.
Needs to work on single and multi-page PDFs.
Thanks to Hiroto from these forums I have a script that already finds a string, splits the PDF into single pages if required and saves each page named as the string.
I have tried to adapt the script so that it doesn't split the pages, and searches on all pages. But whatever I try it fails.
Is this script tweakable, or would a simpler script do it?
--APPLESCRIPT
on main(inputs, outputFolder, params)
	(*
		Search every page of the input PDFs for a string and save each matching
		page as its own PDF, named after the matched text, in outputFolder.

		list inputs : list of POSIX path of input files
		string outputFolder : POSIX path of output folder
		list params : optional parameters as list of strings
			item 1 => search string (required by the embedded script)
		return string : "OK" | "Warning" | "Error"
			"OK"      : the embedded script exited with status 0
			"Warning" : status 1 (at least one page had no matching string)
			"Error"   : any other status (a PDF could not be opened or saved,
			            the search string is missing, or the script itself failed)

		* to be invoked by Esko Automation Engine
		cf.
		https://docs.esko.com/docs/en-us/automationengine/16/userguide/pdf/ae_ScriptRunner.pdf

		NOTE(review): input paths are joined with ":" and split again in Python,
		so a path that itself contains ":" will be split incorrectly.
		NOTE(review): relies on /usr/bin/python (Python 2 with PyObjC) being
		present on the host - confirm for the target macOS version.
	*)
	
	-- Build argv[1]: every input path shell-quoted, joined by ":".
	-- Adjacent quoted words are concatenated by the shell into one argument.
	set a1 to ""
	repeat with a in inputs
		set a1 to a1 & a's quoted form & ":"
	end repeat
	if a1 is "" then
		-- No inputs: pass an explicit empty argument so that the output folder
		-- is not shifted into argv[1] (and "text 1 thru -2" is not applied to "").
		set a1 to "''"
	else
		set a1 to a1's text 1 thru -2 -- remove last excessive :
	end if
	
	-- argv[2]: output folder
	set a2 to outputFolder's quoted form
	
	-- argv[3..]: optional parameters, each shell-quoted
	set ar to ""
	repeat with a in params
		set ar to ar & a's quoted form & space
	end repeat
	
	set args to a1 & space & a2 & space & ar
	
	try
		-- The Python source is fed on stdin through a quoted heredoc, so the shell
		-- performs no expansion inside it. The Python lines and the closing EOF
		-- must stay at column 0.
		do shell script "/usr/bin/python <<'EOF' - " & args & "
# coding: utf-8
#   sys.argv[1]   : input files separated by :
#   sys.argv[2]   : output directory
#   sys.argv[3..] : additional parameters
#       sys.argv[3] => search string in page
#
#   exit status   : 0 = no problem
#                   1 = at least one page had no matching string
#                   2 = a file could not be opened or saved, the search string
#                       is missing, or the script raised an unexpected exception
import sys, re, traceback

def run(uargv):
    # PyObjC imports live inside the function so that an ImportError is caught
    # below and reported as status 2 instead of the default status 1.
    from Foundation import NSURL
    from Quartz.PDFKit import PDFDocument

    if len(uargv) < 4 or not uargv[3]:
        print 'missing search string parameter'
        return 2

    outdir = uargv[2].rstrip('/')
    # the search string followed by the rest of the word it occurs in
    re_pattern = re.compile(re.escape(uargv[3]) + '\\S*')
    ret = 0

    for f in [ a for a in uargv[1].split(':') if re.search(r'\\.pdf$', a, re.I) ]:
        url = NSURL.fileURLWithPath_(f)
        doc = PDFDocument.alloc().initWithURL_(url)
        if doc is None:
            # initWithURL_ gives None when the file cannot be read as a PDF
            ret = max(2, ret)
            print 'failed to open %s' % f.encode('utf-8')
            continue
        path = doc.documentURL().path()
        pcnt = doc.pageCount()
        for i in range(0, pcnt):
            page = doc.pageAtIndex_(i)
            # page.string() can be None for a page without any text
            m = re_pattern.search(page.string() or u'')
            if not m:
                ret = max(1, ret)
                print 'no matching string in page %d of %s' % (i + 1, path.encode('utf-8'))
                continue  # ignore this page
            name = m.group()
            doc1 = PDFDocument.alloc().initWithData_(page.dataRepresentation())  # doc for this page
            if not doc1.writeToFile_('%s/%s.pdf' % (outdir, name)):
                ret = max(2, ret)
                print 'failed to save page %d of %s' % (i + 1, path.encode('utf-8'))
    return ret

try:
    status = run([ a.decode('utf-8') for a in sys.argv ])
except Exception:
    # An uncaught exception would exit with status 1, which the caller
    # interprets as a mere warning; report it as an error instead.
    traceback.print_exc()
    status = 2
sys.exit(status)
EOF"
		set {r, err} to {result, 0}
	on error errs number errn
		-- do shell script raises an error for a non-zero exit status;
		-- errn is that status and errs is the text the command produced
		set {r, err} to {errs, errn}
	end try
	
	if err = 0 then
		return "OK"
	else if err = 1 then
		log r
		return "Warning"
	else
		log r
		return "Error"
	end if
end main
--END OF APPLESCRIPT