Applescript to find a text string in a PDF - continued
I have this script from Hiroto which finds a string in a PDF and not only splits the file into each separate page that contains the string, it also names each page using the found string.
It's been tweaked for various scenarios; for example, VikingOSX changed it for me so that it returns OK if it finds the string and fails if it does not. Ideal for checking if a PDF contains a code.
Now I have another scenario, which I have tried to adjust both scripts to perform but I have failed as usual!
What I want to do is search a multi-page PDF for part of a string, e.g. "ABC_", have the script find the full string ("ABC_123456"), and save the whole PDF with that result as the new filename.
The original script does this but it splits the output into separate pages.
In this scenario I just want to search only the first page for the string (ABC_), grab the whole line (ABC_123456), and then save the whole file as ABC_123456, without it being split into pages, because some pages don't contain the code.
Any help would be greatly appreciated!
--APPLESCRIPT
on main(inputs, outputFolder, params)
    (*
        Split every input PDF into single-page PDFs and name each page file
        after the text matched on that page.

        list inputs : list of POSIX path of input files
        string outputFolder : POSIX path of output folder
        list params : optional parameters as list of strings
            item 1 => search string; the match is the search string followed by
                      any run of non-whitespace characters on the page
        return string : "OK" | "Warning" | "Error"
            "OK"      : no page was skipped and nothing failed
            "Warning" : at least one page had no match (that page is skipped)
            "Error"   : anything else (bad arguments, unreadable PDF, failed save, ...)

        * to be invoked by Esko Automation Engine
        cf.
        https://docs.esko.com/docs/en-us/automationengine/16/userguide/pdf/ae_ScriptRunner.pdf

        NOTE(review): relies on a Python 2 interpreter with PyObjC at
        /usr/bin/python; confirm it exists on the target macOS version.
        Known limitation: input paths are joined with ":" so a path that itself
        contains ":" is split in two by the Python side.
    *)

    -- Return the documented "Error" result instead of raising when there is
    -- nothing to process or no search string for the Python side to read.
    if (count of inputs) = 0 or (count of params) = 0 then
        log "main: inputs and params must each contain at least one item"
        return "Error"
    end if

    -- All inputs become ONE shell word: 'path1':'path2':...
    set quotedInputs to {}
    repeat with a in inputs
        set end of quotedInputs to quoted form of (contents of a)
    end repeat
    set savedDelimiters to AppleScript's text item delimiters
    set AppleScript's text item delimiters to ":"
    set a1 to quotedInputs as text
    set AppleScript's text item delimiters to savedDelimiters

    set a2 to quoted form of outputFolder

    -- Every optional parameter is passed as its own shell word.
    set ar to ""
    repeat with a in params
        set ar to ar & (quoted form of (contents of a)) & space
    end repeat

    set args to a1 & space & a2 & space & ar

    -- The Python source below is part of the string literal, so it must start
    -- at column 0 and backslashes are doubled for AppleScript.
    try
        set r to do shell script "/usr/bin/python <<'EOF' - " & args & "
# coding: utf-8
#   sys.argv[1]   : input files separated by :
#   sys.argv[2]   : output directory
#   sys.argv[3..] : additional parameters
#       sys.argv[3] => search string in page
#
#   exit status   : 0 = ok, 1 = some page had no match, 2 = failure
import sys, re, traceback

def split_pdfs():
    # PyObjC imports live here so that a missing module is reported as a
    # failure (status 2) by the handler at the bottom, not as status 1.
    from Foundation import NSURL
    from Quartz.PDFKit import PDFDocument

    uargv = [ a.decode('utf-8') for a in sys.argv ]
    outdir = uargv[2].rstrip('/')
    re_pattern = re.compile(re.escape(uargv[3]) + '\\S*')
    ret = 0
    for f in [ a for a in uargv[1].split(':') if re.search(r'\\.pdf$', a, re.I) ]:
        doc = PDFDocument.alloc().initWithURL_(NSURL.fileURLWithPath_(f))
        if doc is None:
            # unreadable or non-PDF content: report it and go on with the next file
            ret = max(2, ret)
            print 'failed to open %s' % f.encode('utf-8')
            continue
        path = doc.documentURL().path()
        for i in range(doc.pageCount()):
            page = doc.pageAtIndex_(i)
            # a page without any text yields no string; treat it as no match
            m = re_pattern.search(page.string() or u'')
            if not m:
                ret = max(1, ret)
                print 'no matching string in page %d of %s' % (i + 1, path.encode('utf-8'))
                continue  # ignore this page
            # the match becomes a file name, so it must not contain a path separator
            name = m.group().replace('/', '_')
            doc1 = PDFDocument.alloc().initWithData_(page.dataRepresentation())  # doc for this page
            if doc1 is None or not doc1.writeToFile_('%s/%s.pdf' % (outdir, name)):
                ret = max(2, ret)
                print 'failed to save page %d of %s' % (i + 1, path.encode('utf-8'))
    return ret

try:
    status = split_pdfs()
except Exception:
    # An uncaught exception would exit with status 1, which the caller reads
    # as a mere warning; report it on stdout like the other messages and fail.
    traceback.print_exc(file=sys.stdout)
    status = 2
sys.exit(status)
EOF"
        set err to 0
    on error errs number errn
        -- a non-zero exit status arrives here as the error number
        set {r, err} to {errs, errn}
    end try

    if err = 0 then
        return "OK"
    else if err = 1 then
        log r
        return "Warning"
    else
        log r
        return "Error"
    end if
end main
--END OF APPLESCRIPT