Applescript to find a text string in a PDF - continued

I have this script from Hiroto which finds a string in a PDF and not only splits the file into each separate page that contains the string, it also names each page using the found string.


It has been tweaked for various scenarios; for example, VikingOSX changed it for me so that it returns OK if it finds the string and fails if it does not. Ideal for checking whether a PDF contains a code.


Now I have another scenario, which I have tried to adjust both scripts to perform but I have failed as usual!


What I want to do is search a multi-page PDF for part of a string, e.g. "ABC_", have the script find the full string ("ABC_123456"), and save the whole PDF with that result as the new filename.


The original script does this but it splits the output into separate pages.


In this scenario I just want to search the first page only for the string, (ABC_), grab the whole line (ABC_123456) and then save the whole file back as ABC_123456. - without being split into pages, because some pages don't contain the code.


Any help would be greatly appreciated!




--APPLESCRIPT


on main(inputs, outputFolder, params)
	(*
        list inputs : list of POSIX path of input files
        string outputFolder : POSIX path of output folder
        list params : optional parameters as list of strings
                      (item 1 is the literal text to search for on each page)
        return string : "OK" | "Warning" | "Error"

        Every page whose text contains the search string is saved to
        outputFolder as a one-page PDF named after the matched text
        (the search string plus the non-space characters that follow it).
        Pages without a match are skipped and reported as a warning.

        * to be invoked by Esko Automation Engine
        cf.
        https://docs.esko.com/docs/en-us/automationengine/16/userguide/pdf/ae_ScriptRunner.pdf
    *)
	-- "text 1 thru -2" below raises on an empty string, so stop early when there is nothing to do
	if (count of inputs) = 0 then
		log "no input files"
		return "Error"
	end if
	
	-- join the quoted input paths with ":" so that they reach Python as a single argument
	set a1 to ""
	repeat with a in inputs
		set a1 to a1 & a's quoted form & ":"
	end repeat
	set a1 to a1's text 1 thru -2 -- remove last excessive :
	
	set a2 to outputFolder's quoted form
	
	set ar to ""
	repeat with a in params
		set ar to ar & a's quoted form & space
	end repeat
	
	set args to a1 & space & a2 & space & ar
	
	try
		do shell script "/usr/bin/python <<'EOF' - " & args & "
# coding: utf-8
#   sys.argv[1]   : input files separated by :
#   sys.argv[2]   : output directory
#   sys.argv[3..] : additional parameters
#   sys.argv[3] => search string in page
#
#   exit status   : 0 = all pages saved, 1 = some page had no match, 2 = a failure occurred

import sys, re
from Foundation import NSURL
from Quartz.PDFKit import PDFDocument

uargv = [ a.decode('utf-8') for a in sys.argv ]
if len(uargv) < 4:
    print 'missing search string parameter'
    sys.exit(2)

outdir = uargv[2].rstrip('/')
re_pattern = re.compile(re.escape(uargv[3]) + '\\S*')
ret = 0

for f in [ a for a in uargv[1].split(':') if re.search(r'\\.pdf$', a, re.I) ]:
    url = NSURL.fileURLWithPath_(f)
    doc = PDFDocument.alloc().initWithURL_(url)
    if doc is None:
        # unreadable or damaged file
        ret = max(2, ret)
        print 'failed to open %s' % f.encode('utf-8')
        continue
    path = doc.documentURL().path()
    pcnt = doc.pageCount()

    for i in range(0, pcnt):
        page = doc.pageAtIndex_(i)
        # a page without any text may yield None instead of a string
        m = re_pattern.search(page.string() or u'')
        if not m:
            ret = max(1, ret)
            print 'no matching string in page %d of %s' % (i + 1, path.encode('utf-8'))
            continue    # ignore this page
        name = m.group()
        doc1 = PDFDocument.alloc().initWithData_(page.dataRepresentation()) # doc for this page
        if not doc1.writeToFile_('%s/%s.pdf' % (outdir, name)):
            ret = max(2, ret)
            print 'failed to save page %d of %s' % (i + 1, path.encode('utf-8'))

sys.exit(ret)
EOF"
		set {r, err} to {result, 0}
	on error errs number errn
		-- a non-zero exit status of the shell command arrives here as the error number
		set {r, err} to {errs, errn}
	end try
	
	if err = 0 then
		return "OK"
	else if err = 1 then
		log r
		return "Warning"
	else
		log r
		return "Error"
	end if
end main


--END OF APPLESCRIPT


Posted on Aug 19, 2022 7:33 AM

Reply
Question marked as Top-ranking reply

Posted on Aug 29, 2022 6:37 AM

Phil,


Here is a Python handler for AppleScript that attempts to match any pattern XXX_nnnnnn on the first page of the PDF passed to it. If there is a match, it returns that matched string back to AppleScript where the Finder then renames the original PDF to the matched string.


use scripting additions

-- Rename briggs.pdf on the Desktop after the code found on its first page.
set pdfFile to ((path to desktop as text) & "briggs.pdf") as text as alias
set arg to (POSIX path of pdfFile as text)'s quoted form
-- extract_text hands back the output of do shell script: the matched text,
-- or an empty string when the Python script printed nothing (no match)
set ret to my extract_text(arg)
-- the result is always text, so compare against "" (a comparison with 0 never succeeds)
if ret is not "" then
	tell application "Finder"
		# rename current PDF with string extracted from PDF's first page
		set the name of pdfFile to ret & ".pdf"
	end tell
end if
return

-- extract_text(apdf)
--   apdf : path of the PDF, already quoted for the shell (it is appended to the
--          command line as is)
--   Returns whatever the embedded Python script prints, as delivered by
--   do shell script.
--   The Python script opens the PDF given in sys.argv[1], takes the text of the
--   page at index 0 and looks for three upper-case letters, an underscore and
--   six digits. Each match overwrites `result`, so the text of the last match
--   is what gets printed. With no match the script exits without printing.
on extract_text(apdf)
    # no python 2.7.*, or python/objective-c support in macOS 12.3.1 and later
	return (do shell script "/usr/bin/python <<'EOF' - " & apdf & "
#!/usr/bin/env python

from Foundation import NSString, NSRegularExpression, NSNumber
from AppKit import NSURL, NSMakeRange
from Quartz.PDFKit import PDFDocument
import sys

ret = 0
# any pattern matching ABC_123456
pattern = \"[[:upper:]]{3}_[[:digit:]]{6}\"
opt = NSNumber.numberWithUnsignedLongLong_(0)
regex, err = (NSRegularExpression
			 .regularExpressionWithPattern_options_error_(pattern,
														  opt,
														  None
														 ))
pdf = PDFDocument.alloc().initWithURL_(NSURL.fileURLWithPath_(sys.argv[1]))
first_page = pdf.pageAtIndex_(0).string()
p1_text = NSString.stringWithString_(first_page)
p1_text_length = p1_text.length()
p1_range = NSMakeRange(opt, p1_text_length)

matches = regex.matchesInString_options_range_(p1_text, 0, p1_range)

if not matches:
	# ret = max(1, ret)
	sys.exit()

for match in matches:
	result = p1_text.substringWithRange_(match.range())

if result:
	print(result)
EOF")
end extract_text


Similar questions

28 replies
Question marked as Top-ranking reply

Aug 29, 2022 6:37 AM in response to Phillip Briggs

Phil,


Here is a Python handler for AppleScript that attempts to match any pattern XXX_nnnnnn on the first page of the PDF passed to it. If there is a match, it returns that matched string back to AppleScript where the Finder then renames the original PDF to the matched string.


use scripting additions

-- Rename briggs.pdf on the Desktop after the code found on its first page.
set pdfFile to ((path to desktop as text) & "briggs.pdf") as text as alias
set arg to (POSIX path of pdfFile as text)'s quoted form
-- extract_text hands back the output of do shell script: the matched text,
-- or an empty string when the Python script printed nothing (no match)
set ret to my extract_text(arg)
-- the result is always text, so compare against "" (a comparison with 0 never succeeds)
if ret is not "" then
	tell application "Finder"
		# rename current PDF with string extracted from PDF's first page
		set the name of pdfFile to ret & ".pdf"
	end tell
end if
return

-- extract_text(apdf)
--   apdf : path of the PDF, already quoted for the shell (it is appended to the
--          command line as is)
--   Returns whatever the embedded Python script prints, as delivered by
--   do shell script.
--   The Python script opens the PDF given in sys.argv[1], takes the text of the
--   page at index 0 and looks for three upper-case letters, an underscore and
--   six digits. Each match overwrites `result`, so the text of the last match
--   is what gets printed. With no match the script exits without printing.
on extract_text(apdf)
    # no python 2.7.*, or python/objective-c support in macOS 12.3.1 and later
	return (do shell script "/usr/bin/python <<'EOF' - " & apdf & "
#!/usr/bin/env python

from Foundation import NSString, NSRegularExpression, NSNumber
from AppKit import NSURL, NSMakeRange
from Quartz.PDFKit import PDFDocument
import sys

ret = 0
# any pattern matching ABC_123456
pattern = \"[[:upper:]]{3}_[[:digit:]]{6}\"
opt = NSNumber.numberWithUnsignedLongLong_(0)
regex, err = (NSRegularExpression
			 .regularExpressionWithPattern_options_error_(pattern,
														  opt,
														  None
														 ))
pdf = PDFDocument.alloc().initWithURL_(NSURL.fileURLWithPath_(sys.argv[1]))
first_page = pdf.pageAtIndex_(0).string()
p1_text = NSString.stringWithString_(first_page)
p1_text_length = p1_text.length()
p1_range = NSMakeRange(opt, p1_text_length)

matches = regex.matchesInString_options_range_(p1_text, 0, p1_range)

if not matches:
	# ret = max(1, ret)
	sys.exit()

for match in matches:
	result = p1_text.substringWithRange_(match.range())

if result:
	print(result)
EOF")
end extract_text


Sep 4, 2022 12:08 PM in response to Phillip Briggs

Phil,


Here is a script that replaces Python with AppleScript/Objective-C to perform the regular expression match. This example searches the first page of a PDF for a string like ABC_123456, and then renames the matched PDF with that string. I do not use Esko here, so you may need to tweak the code somewhat to return the appropriate ret value. Without the on main loop, I was able to have it match the string in a PDF and then rename it, and to catch the circumstance where there might be two PDFs with the identical string.


This is time consuming.


(*
 Eskoo_firstpage.applescript
 
 Process a list of PDFs (inputs) and attempt to locate a supplied
 search string on the first page of the PDF. If found, then rename
 the PDF using that found string name.

*)

use framework "Foundation"
use framework "AppKit"
use framework "PDFKit"
use scripting additions

property NSString : a reference to current application's NSString
property NSURL : a reference to current application's NSURL
property NSArray : a reference to current application's NSArray
property NSPredicate : a reference to current application's NSPredicate
property PDFDocument : a reference to current application's PDFDocument
property NSRegularExpression : a reference to current application's NSRegularExpression

property ret : 0

on main(inputs, outputFolder, params)
	(*
	 list inputs : POSIX paths of the input files
	 string outputFolder : output folder (not used here: each PDF is renamed where it is)
	 list params : item 1 is the pattern to look for on the first page
	 return string : "OK" | "Warning" | "Error"
	
	 Status codes collected in ret: 0 = every PDF renamed, 1 = pattern not found
	 in some PDF, 2 = a PDF could not be opened or renamed.
	*)
	-- ret is a script property; reset it so that a status left over from an
	-- earlier call cannot leak into this one
	set ret to 0
	
	-- keep only the inputs whose name ends with ".pdf" (case-insensitive)
	set pred to NSPredicate's predicateWithFormat:"self ENDSWITH[c] '.pdf'"
	set pdfArray to (NSArray's arrayWithArray:inputs)'s filteredArrayUsingPredicate:pred
	set searchStr to (item 1 of params) as text
	
	try
		repeat with pdf in pdfArray
			-- one-pass loop: "exit repeat" inside it moves on to the next PDF
			repeat 1 times
				set doc to (PDFDocument's alloc()'s initWithURL:(NSURL's fileURLWithPath:pdf))
				if doc = missing value then
					set ret to my max(2, ret)
					log "Cannot open PDF " & (pdf as text)
					exit repeat -- get next PDF
				end if
				set apath to doc's documentURL()'s |path|()
				set apage to ((doc's pageAtIndex:0)'s |string|()) as text
				
				set found to my search_firstpage_only(apage, searchStr)
				
				-- the search handler answers with the integer 0 when nothing matched
				if (class of found is integer) = true then
					set ret to my max(1, ret)
					log "Search String not Found in " & apath as text
					exit repeat -- get next PDF
				end if
				set newname to (found as text) & ".pdf"
				try
					tell application "Finder"
						-- rename to matched search string
						set oldname to (apath as text as POSIX file as alias)
						set name of oldname to newname
					end tell
				on error errs number errn
					-- e.g. another file already carries that name
					set ret to my max(2, ret)
					log (errn & ":" & errs) as text
					exit repeat
				end try
			end repeat
		end repeat
		-- hand the collected status on; it used to be replaced by a constant 0,
		-- which made the handler answer "OK" no matter what had happened
		set {r, err} to {"finished with status " & (ret as text), ret}
	on error errs number errn
		set {r, err} to {errs, errn}
	end try
	
	if err = 0 then
		return "OK"
	else if err = 1 then
		log r
		return "Warning"
	else
		log r
		return "Error"
	end if
	
end main


to max(n1, n2)
	-- Answer the larger of the two numbers; n2 when they are equal.
	if n1 > n2 then return n1
	return n2
end max

on search_firstpage_only(pageStr, searchStr)
	-- Look for searchStr in pageStr (the text of a PDF's first page).
	--   pageStr   : text to search
	--   searchStr : inserted into the pattern as is, so regular expression
	--               metacharacters in it are interpreted, not matched literally
	-- Returns the matched text, or the integer 0 when there is no match.
	set pstr to NSString's alloc()'s initWithString:pageStr
	-- wrap the search string in a capture group: (searchStr)
	set cgroup to "(" & searchStr & ")"
	set re_pattern to NSString's alloc()'s initWithString:cgroup
	set regex to NSRegularExpression's regularExpressionWithPattern:re_pattern options:0 |error|:0
	-- search the whole string, first match only
	set mrange to current application's NSMakeRange(0, pstr's |length|())
	set matches to (regex's firstMatchInString:pstr options:0 range:mrange)
	
	-- no match: answer 0 so that the caller can tell by the class of the result
	if (matches = "" or matches = missing value) = true then return (0 as integer)
	
	# want the first match group
	set matchrange to matches's rangeAtIndex:1
	return (pstr's substringWithRange:matchrange) as text
end search_firstpage_only


Aug 29, 2022 11:12 AM in response to VikingOSX

And a more succinct Python approach to regular expression capture of the pattern from the first page of the PDF:


use scripting additions

set f to ((path to desktop as text) & "briggs.pdf") as text as alias
set arg to (POSIX path of f as text)'s quoted form
display dialog (my extract_string(arg)) as text
return

-- extract_string(apdf)
--   apdf : path of the PDF, already quoted for the shell (it is appended to the
--          command line as is)
--   Returns the output of the embedded Python script: every occurrence of
--   three upper-case letters, an underscore and six digits found in the text of
--   the page at index 0, joined with single spaces (an empty line when there is
--   no occurrence).
on extract_string(apdf)
	return (do shell script "python <<'EOF' - " & apdf & "
#!/usr/bin/python
# coding: utf-8

from AppKit import NSURL
from Quartz.PDFKit import PDFDocument

import re
import sys

f = sys.argv[1]
pdf = PDFDocument.alloc().initWithURL_(NSURL.fileURLWithPath_(f))
first_page = pdf.pageAtIndex_(0).string()
regex = re.compile('[A-Z]{3}_[0-9]{6}', re.M)
match = regex.findall(first_page)
out = ' '.join(match).encode('utf-8')
print(out)
EOF")
end extract_string


Aug 30, 2022 8:26 AM in response to VikingOSX

I get this error:


File "<stdin>", line 12, in <module>

IndexError: list index is out of range



==


# coding: utf-8


import re

import os

import sys

from AppKit import NSURL

from Quartz.PDFKit import PDFDocument


uargv = [ a.decode('utf-8') for a in sys.argv ]

outdir = uargv[2].rstrip('/')

re_pattern = re.compile(re.escape(uargv[3]) + '\\S*', re.M | re.I)

ret = 0

for f in [ a for a in uargv[1].split(':') if a.lower.endswith('.pdf') ]:


os.rename(f, os.path.join(outdir, name + '.pdf'))

doc = PDFDocument.alloc().initWithURL_(NSURL.fileURLWithPath_(f))

path = doc.documentURL().path()

first_page = doc.pageAtIndex_(0).string()

match = re_pattern.search(first_page)

if not match:

ret = max(1,ret)

print 'no matching string in page %d of %s' % (i + 1, path.encode('utf-8'))

continue # ignore this page

name = match.group().encode('utf-8')

# rename the current PDF to the matched string name

try:

os.rename(f, os.path.join(os.path.split(f)[0], name + '.pdf'))

except:

ret = max(2, ret)

print 'failed to save page %d of %s' % (i + 1, path.encode('utf-8'))

sys.exit(ret)

EOF"

Aug 31, 2022 5:45 AM in response to VikingOSX

Here is the flow of the script. Since you are only processing the first page, there is no need for the i + 1 counter in the two print statements.


on main(inputs, outputFolder, params)

    (*

        list inputs : list of POSIX path of input files

        string outputFolder : POSIX path of output folder

        list params : optional parameters as list of strings
                      (item 1 is the literal text to search for on the first page)

        return string : "OK" | "Warning" | "Error"

        Each input PDF whose first page contains the search string is renamed,
        in its own folder, after the matched text (the search string plus the
        non-space characters that follow it). The PDF is not split.

        * to be invoked by Esko Automation Engine

        cf.

        https://docs.esko.com/docs/en-us/automationengine/16/userguide/pdf/ae_ScriptRunner.pdf

    *)

    -- "text 1 thru -2" below raises on an empty string, so stop early when there is nothing to do
    if (count of inputs) = 0 then
        log "no input files"
        return "Error"
    end if

    -- join the quoted input paths with ":" so that they reach Python as a single argument
    set a1 to ""

    repeat with a in inputs
        set a1 to a1 & a's quoted form & ":"
    end repeat

    set a1 to a1's text 1 thru -2 -- remove last excessive :
    set a2 to outputFolder's quoted form

    set ar to ""

    repeat with a in params
        set ar to ar & a's quoted form & space
    end repeat

    set args to a1 & space & a2 & space & ar

    try
        -- Python removed from macOS 12.3.1
        do shell script "/usr/bin/python <<'EOF' - " & args & "

# coding: utf-8

#   sys.argv[1]   : input files separated by :

#   sys.argv[2]   : output directory (accepted but unused: files are renamed in place)

#   sys.argv[3..] : additional parameters

#       sys.argv[3] => search string in page

#   exit status   : 0 = all renamed, 1 = some PDF had no match, 2 = a failure occurred

import sys
import os
import re
from AppKit import NSURL
from Quartz.PDFKit import PDFDocument

uargv = [ a.decode('utf-8') for a in sys.argv ]
if len(uargv) < 4:
    print 'missing search string parameter'
    sys.exit(2)

re_pattern = re.compile(re.escape(uargv[3]) + '\\S*', re.M | re.I)
ret = 0

# keep the paths exactly as given (lower-casing them breaks mixed-case paths)
# and test only the extension, case-insensitively
for f in [a for a in uargv[1].split(':') if a.lower().endswith('.pdf')]:

    doc = PDFDocument.alloc().initWithURL_(NSURL.fileURLWithPath_(f))
    if doc is None or doc.pageCount() == 0:
        ret = max(2, ret)
        print 'failed to open %s' % f.encode('utf-8')
        continue
    path = doc.documentURL().path()
    # a page without any text may yield None instead of a string
    first_page = doc.pageAtIndex_(0).string() or u''
    match = re_pattern.search(first_page)

    if not match:
        ret = max(1,ret)
        print 'no matching string in page %d of %s' % (1, path.encode('utf-8'))
        continue    # ignore this PDF

    # stay in unicode: mixing it with an encoded byte string fails on non-ASCII names
    name = match.group()

    try:
        # rename input PDF to matched name string from first page
        os.rename(f, os.path.join(os.path.dirname(f), name + u'.pdf'))
    except (OSError, UnicodeError):
        ret = max(2, ret)
        print 'failed to save page %d of %s' % (1, path.encode('utf-8'))

sys.exit(ret)
EOF"

        set {r, err} to {result, 0}

    on error errs number errn
        -- a non-zero exit status of the shell command arrives here as the error number
        set {r, err} to {errs, errn}
    end try

    if err = 0 then
        return "OK"
    else if err = 1 then
        log r
        return "Warning"
    else
        log r
        return "Error"
    end if
end main


Aug 30, 2022 7:01 AM in response to VikingOSX

Still didn't get it right as the code that I posted did not account for PDFs separated by colons, and the outdir path.


uargv = [ a.decode('utf-8') for a in sys.argv ]
outdir = uargv[2].rstrip('/')
re_pattern = re.compile(re.escape(uargv[3]) + '\\S*')
ret = 0

for f in [ a for a in uargv[1].split(':') if a.lower.endswith('.pdf') ]:

    os.rename(f, os.path.join(outdir, name + '.pdf'))


I am using a hardcoded regular expression to match the specific string format (ABC_123456) and should I be accommodating your passing in the string to be found, which would change this regular expression syntax?

Aug 30, 2022 1:10 PM in response to Phillip Briggs

Phil,


AppleScript passes a space separated cluster of strings to Python as the arg variable:


'a.pdf:b.pdf:c.pdf' 'outdir' 'searchStr'


The following list comprehension assigns each of these strings as array elements in uargv:


argv = [ a.decode('utf-8') for a in sys.argv[1:] ]
['a.pdf:b.pdf:c.pdf', 'outdir', 'searchStr']


While AppleScript lists are one-based, Python lists are zero-based, and to split the PDFs colon separated string into individual PDF names, one must operate on argv[0], not argv[1], and argv[1] is the outdir, and argv[2] is the searchStr.


outdir = uargv[1].rstrip('/')
re_pattern = re.compile(re.escape(uargv[2]) + '\\S*', re.M | re.I)

If, in fact, I do have these in the correct order, then the Python for loop should be:


for f in [a.lower() for a in uargv[0].split(':') if '.pdf' in a]:


Hopefully, you are not using the following at all prior to the for loop as it removes the colon separated PDFs list element from the uargv array and assigns that string to outdir. With only two elements left in the array the for loop would always fail to split on a colon.


outdir = argv.pop(0).rstrip('/')



Sep 11, 2022 12:44 PM in response to Phillip Briggs

Phil,


Here is the rewritten version that replaces the Python solution for processing each page of a PDF and for matched partial search string (e.g. ABC_) will write out the page as a new PDF using that full found string as the filename. Tested: macOS 11.6.8.


As an example, I have a three page PDF with the following text strings on pages:

  1. ABC_12345
  2. ABC_98765
  3. BSC_123_456


Two PDFs are written to the outdir folder location as ABC_12345.pdf and ABC_98765.pdf.


Code:


(*
 Eskoo_allPages.applescript
 
 Process a list of PDFs (inputs) and attempt to locate a partial
 string pattern on each page of the PDF and successive characters
 up to the next word boundary. If found, then write
 that specific PDF page to a new PDF document whose name is that
 captured string and to the designated output directory.
*)

use framework "Foundation"
use framework "AppKit"
use framework "PDFKit"
use scripting additions

property NSString : a reference to current application's NSString
property NSURL : a reference to current application's NSURL
property NSArray : a reference to current application's NSArray
property NSPredicate : a reference to current application's NSPredicate
property PDFDocument : a reference to current application's PDFDocument
property NSRegularExpression : a reference to current application's NSRegularExpression

property ret : 0

on main(inputs, outputFolder, params)
	(*
	 list inputs : POSIX paths of the input files
	 string outputFolder : folder that receives the single-page PDFs
	 list params : item 1 is the partial string to look for on every page
	 return string : "OK" | "Warning" | "Error"
	
	 Status codes collected in ret: 0 = every page saved, 1 = some page had no
	 match, 2 = a PDF could not be opened or a page could not be written.
	*)
	-- ret is a script property; reset it once per call so that a status left
	-- over from an earlier call cannot leak into this one
	set ret to 0
	
	-- just get the files ending with ".pdf" from inputs
	set pred to NSPredicate's predicateWithFormat:"self ENDSWITH[c] '.pdf'"
	set pdfArray to (NSArray's arrayWithArray:inputs)'s filteredArrayUsingPredicate:pred
	
	set outdir to POSIX path of outputFolder as text
	set searchStr to (item 1 of params) as text
	
	try
		repeat with pdf in pdfArray
			set doc to (PDFDocument's alloc()'s initWithURL:(NSURL's fileURLWithPath:pdf))
			if doc = missing value then
				set ret to my max(2, ret)
				log "Cannot open PDF " & (pdf as text)
			else
				set apath to doc's documentURL()'s |path|()
				set page_cnt to doc's pageCount()
				-- loop through the pages of each PDF (PDFKit page indexes start at 0)
				repeat with i from 0 to page_cnt - 1
					set thisPage to (doc's pageAtIndex:i)
					set textPage to thisPage's |string|() as text
					set found to my regex_search(textPage, searchStr)
					if (class of found is text) then log (found) as text
					
					if (class of found is integer) = true then
						-- integer 0 means: nothing matched on this page
						set ret to my max(1, ret)
						log "Search String not Found in page: " & ((i + 1) as text) & " of " & (apath as text)
					else if (class of found is text) = true then
						set page_name to (found as text) & ".pdf"
						set outPath to outdir & "/" & page_name
						-- a new one-page document built from this page only
						set doc1 to (PDFDocument's alloc()'s initWithData:(thisPage's dataRepresentation()))
						set status to (doc1's writeToFile:outPath) as boolean
						if not status then
							set ret to my max(2, ret)
							log "failed to save page " & ((i + 1) as text) & " of " & (apath as text)
						end if
					else
						log "regex failed."
					end if
				end repeat
			end if
		end repeat
		-- hand the collected status on; it used to be reset after each PDF and
		-- then replaced by a constant 0, so the handler always answered "OK"
		set {r, err} to {"finished with status " & (ret as text), ret}
	on error errs number errn
		set {r, err} to {errs, errn}
	end try
	
	if err = 0 then
		return "OK"
	else if err = 1 then
		log r
		return "Warning"
	else
		log r
		return "Error"
	end if
	
end main

to max(n1, n2)
	-- Answer the larger of the two numbers; n2 when they are equal.
	if n1 > n2 then return n1
	return n2
end max

on regex_search(pageStr, searchStr)
	-- Look for searchStr in the page text passed here.
	--   pageStr   : text of one PDF page
	--   searchStr : partial string to find; it is inserted into the pattern as is,
	--               so regular expression metacharacters in it are interpreted
	-- Returns the captured text, or the integer 0 if nothing was found.
	set pstr to NSString's alloc()'s initWithString:pageStr
	-- capture all contiguous word characters from the search string
	-- up to the next word boundary. (e.g. ABC_123456_98)
	-- \w* stops at that boundary; the former greedy .*\b ran on to the last
	-- word boundary of the line and dragged unrelated text into the file name
	set capture_group to "(\\b" & searchStr & "\\w*)" as text
	set re_pattern to NSString's alloc()'s initWithString:capture_group
	set regex to NSRegularExpression's regularExpressionWithPattern:re_pattern options:0 |error|:0
	set mrange to current application's NSMakeRange(0, pstr's |length|())
	set matches to (regex's firstMatchInString:pstr options:0 range:mrange)
	
	-- no match: answer 0 so that the caller can tell by the class of the result
	if (matches = "" or matches = missing value) = true then return (0 as integer)
	
	# want the first match group
	set matchrange to matches's rangeAtIndex:1
	return (pstr's substringWithRange:matchrange) as text
end regex_search


Sep 12, 2022 4:24 AM in response to VikingOSX

Yes, I don't know why I get the variable error when I run it through Esko ScriptRunner, which is the preferred method.


I added the select file bit to try and narrow down the issue by running it manually, but I still don't get any success, sorry:


===


(*


 Eskoo_allPages.applescript


 


 Process a list of PDFs (inputs) and attempt to locate a partial


 string pattern on each page of the PDF and successive characters


 up to the next word boundary. If found, then write


 that specific PDF page to a new PDF document whose name is that


 captured string and to the designated output directory.


*)




use framework "Foundation"


use framework "AppKit"


use framework "PDFKit"


use scripting additions




property NSString : a reference to current application's NSString


property NSURL : a reference to current application's NSURL


property NSArray : a reference to current application's NSArray


property NSPredicate : a reference to current application's NSPredicate


property PDFDocument : a reference to current application's PDFDocument


property NSRegularExpression : a reference to current application's NSRegularExpression




property ret : 0


set inputs to choose file with prompt ("Choose PDF Files.") of type {"com.adobe.pdf"} with multiple selections allowed


set outputFolder to choose folder with prompt ("Choose Destination Folder.")


set searchStr to text returned of (display dialog "Enter search string: " default answer "" with title "Search String Input")




repeat with i in inputs


set i's contents to i's POSIX path


end repeat


on main(inputs, outputFolder, params)


-- just get the files ending with ".pdf" from inputs


set pred to NSPredicate's predicateWithFormat:"self ENDSWITH[c] '.pdf'"


set posix_inputs to (NSArray's arrayWithArray:inputs)'s valueForKey:"path"


set pdfArray to (NSArray's arrayWithArray:posix_inputs)'s filteredArrayUsingPredicate:pred




set outdir to POSIX path of outputFolder as text


(* set searchStr to (item 1 of params) as text *)



try


repeat with pdf in pdfArray


set doc to (PDFDocument's alloc()'s initWithURL:(NSURL's fileURLWithPath:pdf))


set apath to doc's documentURL()'s |path|()


set page_cnt to doc's pageCount()


-- loop through the pages of each PDF


repeat with i from 0 to page_cnt - 1


set thisPage to (doc's pageAtIndex:i)


set textPage to thisPage's |string|() as text


set found to my regex_search(textPage, searchStr)


if (class of found is text) then log (found) as text



if (class of found is integer) = true then


set ret to max(1, ret)


log "Search String not Found in page: " & (i as text) & " of " & (apath as text)


else if (class of found is text) = true then


set page_name to (found as text) & ".pdf"


set outPath to outdir & "/" & page_name


set doc1 to (PDFDocument's alloc()'s initWithData:(thisPage's dataRepresentation()))


set status to (doc1's writeToFile:outPath) as boolean


if not status then


set ret to my max(2, ret)


log ("failed to save page" & i & " of " & apath) as text


end if


else


log "regex failed."


end if


end repeat


set ret to 0 -- for each PDF


end repeat


set result to ret


set {r, err} to {result, 0}


on error errs number errn


set {r, err} to {errs, errn}


end try



if err = 0 then


return "OK"


else if err = 1 then


log r


return "Warning"


else


log r


return "Error"


end if



end main




to max(n1, n2)


if n1 > n2 then


return n1


else


return n2


end if


end max




on regex_search(pageStr, searchStr)


-- look for searchStr on each page of the PDF passed here.


-- Return captured search string, or 0 if not found


set pstr to NSString's alloc()'s initWithString:pageStr


-- capture all contiguous content from the search string


-- up to the next word boundary. (e.g. ABC_123456_98)


set capture_group to "(\\b" & searchStr & ".*\\b)" as text


set re_pattern to NSString's alloc()'s initWithString:capture_group


set regex to NSRegularExpression's regularExpressionWithPattern:re_pattern options:0 |error|:0


set mrange to current application's NSMakeRange(0, pstr's |length|())


set matches to (regex's firstMatchInString:pstr options:0 range:mrange)



if (matches = "" or matches = missing value) = true then return (0 as integer)



# want the first match group


set matchrange to matches's rangeAtIndex:1


return (pstr's substringWithRange:matchrange) as text


end regex_search

Sep 12, 2022 7:25 AM in response to Phillip Briggs

Your variables are not getting passed into the main handler because you are not deliberately invoking the main handler with those variables. Once you do that, then the code in the script works correctly writing out PDF pages where the search string has matched.


Here is the business end of the solution you posted above with modifications that work. Notice I have dropped the parentheses around your choose with prompt text strings as they are not necessary.


set PDFs to choose file with prompt "Choose PDF Files." of type {"com.adobe.pdf"} with multiple selections allowed
set outdir to POSIX path of (choose folder with prompt "Choose Destination Folder.") as text
set str to text returned of (display dialog "Enter search string: " default answer "" with title "Search String Input") as text

my main(PDFs, outdir, str)

on main(inputs, outputFolder, params)
	-- just get the files ending with ".pdf" from inputs
	set pred to NSPredicate's predicateWithFormat:"self ENDSWITH[c] '.pdf'"
    -- convert HFS paths to POSIX paths
	set posix_inputs to (NSArray's arrayWithArray:inputs)'s valueForKey:"path"
	set pdfArray to (NSArray's arrayWithArray:posix_inputs)'s filteredArrayUsingPredicate:pred
	set outdir to outputFolder
	set searchStr to params
	
	try


I have a four-page PDF and have assigned the following strings in random places on each page:

  1. ABC_123456
  2. ABC_987654
  3. ABC_123456_25
  4. BCS_123456


The first three pages are written out as PDFs with those names when I supply the search string as ABC_


Aug 30, 2022 4:14 AM in response to VikingOSX

Viking_OSX - Thanks for your work


Sorry for being dim though - I can't make it work within the Esko Script Runner environment.


This simpler script shows more easily the required Applescript wrapper for it to work within Esko AE.


--APPLESCRIPT



(*

  for Esko Automation Engine Script Runner

*)

on main(inputs, outputFolder, params)
	(*
        list inputs : list of POSIX path of input files
        string outputFolder : POSIX path of output folder
        list params : optional parameters as list of strings (not used here)
        return string : "OK" | "Warning" | "Error"

        Every page whose text contains "BX_" is saved to outputFolder as a
        one-page PDF named after the matched text ("BX_" plus the non-space
        characters that follow it).

        * to be invoked by Esko Automation Engine
        https://docs.esko.com/docs/en-us/automationengine/16/userguide/pdf/ae_ScriptRunner.pdf
    *)
	script o
		-- output folder first, then the input files: the order the Python script expects
		property aa : {outputFolder} & inputs
		
		set args to ""
		repeat with a in my aa
			set args to args & a's quoted form & space
		end repeat
		
		try
			do shell script "/usr/bin/python <<'EOF' - " & args & "
# coding: utf-8
#   sys.argv[1]   : output directory
#   sys.argv[2..] : input files
#   exit status   : 0 = all pages saved, 1 = some page had no match, 2 = a failure occurred

import sys, re
from Foundation import NSURL
from Quartz.PDFKit import PDFDocument

argv = [ a.decode('utf-8') for a in sys.argv[1:] ]
outdir = argv.pop(0).rstrip('/')
ret = 0

for f in [ a for a in argv if re.search(r'\\.pdf$', a, re.I) ]:
    url = NSURL.fileURLWithPath_(f)
    doc = PDFDocument.alloc().initWithURL_(url)
    if doc is None:
        # unreadable or damaged file
        ret = max(2, ret)
        print 'failed to open %s' % f.encode('utf-8')
        continue
    path = doc.documentURL().path()
    pcnt = doc.pageCount()

    for i in range(0, pcnt):
        page = doc.pageAtIndex_(i)
        # Search for required string i.e. BX_123_ABC_21 up until the next space. This varies
        # (a page without any text may yield None instead of a string)
        m = re.search(r'BX_\\S*', page.string() or u'')
        if not m:
            ret = max(1, ret)
            print 'no matching string in page %d of %s' % (i + 1, path.encode('utf-8'))
            continue    # ignore this page
        name = m.group()
        doc1 = PDFDocument.alloc().initWithData_(page.dataRepresentation()) # doc for this page
        if not doc1.writeToFile_('%s/%s.pdf' % (outdir,name)):
            ret = max(2, ret)
            print 'failed to save page %d of %s' % (i + 1, path.encode('utf-8'))

sys.exit(ret)
EOF"
			set {r, err} to {result, 0}
		on error errs number errn
			-- a non-zero exit status of the shell command arrives here as the error number
			set {r, err} to {errs, errn}
		end try
		
		if err = 0 then
			return "OK"
		else if err = 1 then
			log r
			return "Warning"
		else
			log r
			return "Error"
		end if
	end script
	
	tell o to run
end main


--END OF APPLESCRIPT


So I need it to search for the string on the first page and if found, rename the whole PDF (without splitting it) with the found string and return back to the output folder. I can get the file back named with the found string but it's only the first page not the whole original document.


I can't seem to make your applescript work with the Esko requirements.

Aug 30, 2022 6:40 AM in response to Phillip Briggs

Give the following a try. It renames the current PDF to the matched string while in Python.


on main(inputs, outputFolder, params)
	
	(*

    list inputs : list of POSIX path of input files
    string outputFolder : POSIX path of output folder
        list params : optional parameters as list of strings (not used here)
        return string : "OK" | "Warning" | "Error"

        Each input PDF whose first page contains a code of the form ABC_123456
        is renamed, in its own folder, after that code. The PDF is not split.

        * to be invoked by Esko Automation Engine
    https://docs.esko.com/docs/en-us/automationengine/16/userguide/pdf/ae_ScriptRunner.pdf
    *)
	script o
		-- output folder first, then the input files: the order the Python script expects
		property aa : {outputFolder} & inputs
		set args to ""
		repeat with a in my aa
			set args to args & a's quoted form & space
		end repeat
		try
			do shell script "/usr/bin/python <<'EOF' - " & args & "

# coding: utf-8
#   sys.argv[1]   : output directory (accepted but unused: files are renamed in place)
#   sys.argv[2..] : input files
#   exit status   : 0 = all renamed, 1 = some PDF had no match, 2 = a failure occurred

import re
import os
import sys
from AppKit import NSURL
from Quartz.PDFKit import PDFDocument

argv = [ a.decode('utf-8') for a in sys.argv[1:] ]
outdir = argv.pop(0).rstrip('/')

ret = 0
regex = re.compile('[A-Z]{3}_[0-9]{6}', re.M)  # multiline is re.M

# lower() must be called; the bare attribute has no endswith
for f in [a for a in argv if a.lower().endswith('.pdf')]:
    doc = PDFDocument.alloc().initWithURL_(NSURL.fileURLWithPath_(f))
    if doc is None or doc.pageCount() == 0:
        ret = max(2, ret)
        print 'failed to open %s' % f.encode('utf-8')
        continue
    path = doc.documentURL().path()
    # a page without any text may yield None instead of a string
    first_page = doc.pageAtIndex_(0).string() or u''
    # search with the compiled pattern (re.search needs a pattern argument)
    match = regex.search(first_page)
    if not match:
        ret = max(1,ret)
        print 'no matching string in page %d of %s' % (1, path.encode('utf-8'))
        continue    # ignore this PDF

    # stay in unicode: mixing it with an encoded byte string fails on non-ASCII names
    name = match.group()
    # rename the current PDF to the matched string name
    try:
        os.rename(f, os.path.join(os.path.dirname(f), name + u'.pdf'))
    except (OSError, UnicodeError):
        ret = max(2, ret)
        print 'failed to rename PDF %s' % (path.encode('utf-8'))

# leave only after every input has been handled
sys.exit(ret)
EOF"

			set {r, err} to {result, 0}
		on error errs number errn
			-- a non-zero exit status of the shell command arrives here as the error number
			set {r, err} to {errs, errn}
		end try
		
		if err = 0 then
			return "OK"
		else if err = 1 then
			
			log r
			
			return "Warning"
		else
			log r
			return "Error"
		end if
	end script
	tell o to run
end main








Aug 30, 2022 7:38 AM in response to VikingOSX

Hi,


I can work with a hardcoded string, but as I use it for various strings I would pass an Esko argument in as the search string:

#    sys.argv[3] => search string in page, in this case "T22_"

So each of those script parameters is a sys.argv entry: [1] is the input files, [2] is the output directory, and [3] is what I use as the search string.



so here it takes the string, then adds \\S* so that it grabs the whole line of text up until a space.

uargv = [ a.decode('utf-8') for a in sys.argv ]

outdir = uargv[2].rstrip('/')

re_pattern = re.compile(re.escape(uargv[3]) + '\\S*')

origname = uargv[4]

dash = uargv[5]

ret = 0


This thread has been closed by the system or the community team. You may vote for any posts you find helpful, or search the Community for additional answers.

Applescript to find a text string in a PDF - continued

Welcome to Apple Support Community
A forum where Apple customers help each other with their products. Get started with your Apple Account.