Applescript to find a text string in a PDF

Hello,


I'm trying to find an Applescript to find a text string in a PDF - if it finds the string it succeeds, if not it fails.


Needs to work on single and multi-page PDFs.


Thanks to Hiroto from these forums I have a script that already finds a string, splits the PDF into single pages if required and saves each page named as the string.


I have tried to adapt the script so that it doesn't split the pages, and searches on all pages. But whatever I try it fails.


Is this script tweakable, or would a simpler script do it?




--APPLESCRIPT


on main(inputs, outputFolder, params)
    (*
        Search every page of each input PDF for a string and save each
        matching page as its own PDF, named after the matched text.

        list inputs : list of POSIX path of input files
        string outputFolder : POSIX path of output folder
        list params : optional parameters as list of strings
                item 1 => search string to look for in each page
        return string : "OK" | "Warning" | "Error"
                "OK"      every page matched and was saved
                "Warning" at least one page had no match
                "Error"   a file could not be opened or a page could not be saved,
                          or the required arguments were missing

        * to be invoked by Esko Automation Engine
        cf.
        https://docs.esko.com/docs/en-us/automationengine/16/userguide/pdf/ae_ScriptRunner.pdf

        NOTE: relies on /usr/bin/python (Python 2 with PyObjC), which Apple
        removed in macOS 12.3.
    *)

    -- With no inputs "text 1 thru -2" below would throw, and with no search
    -- string the embedded script would index past the end of sys.argv.
    if (count of inputs) = 0 or (count of params) = 0 then
        log "main: no input files or no search string given"
        return "Error"
    end if

    -- Build argv[1]: every input path shell-quoted, joined with ":"
    set a1 to ""
    repeat with a in inputs
        set a1 to a1 & a's quoted form & ":"
    end repeat
    set a1 to a1's text 1 thru -2 -- remove last excessive :

    -- argv[2]: output folder
    set a2 to outputFolder's quoted form

    -- argv[3..]: additional parameters, each shell-quoted
    set ar to ""
    repeat with a in params
        set ar to ar & a's quoted form & space
    end repeat

    set args to a1 & space & a2 & space & ar

    try
        -- The Python source must start in column 0 and the EOF terminator must
        -- stand alone at the start of its line, whatever the AppleScript indent is.
        do shell script "/usr/bin/python <<'EOF' - " & args & "
# coding: utf-8
#   sys.argv[1]   : input files separated by :
#   sys.argv[2]   : output directory
#   sys.argv[3..] : additional parameters
#       sys.argv[3] => search string in page

import sys, re
from Foundation import NSURL
from Quartz.PDFKit import PDFDocument

uargv = [ a.decode('utf-8') for a in sys.argv ]
outdir = uargv[2].rstrip('/')
# the match is the search string plus any non-space characters that follow it
re_pattern = re.compile(re.escape(uargv[3]) + '\\S*')
ret = 0     # 0 = ok, 1 = some page without match, 2 = open/save failure

for f in [ a for a in uargv[1].split(':') if re.search(r'\\.pdf$', a, re.I) ]:
    url = NSURL.fileURLWithPath_(f)
    doc = PDFDocument.alloc().initWithURL_(url)
    if doc is None:
        # file is missing or is not a readable PDF
        ret = max(2, ret)
        print 'failed to open %s' % f.encode('utf-8')
        continue
    path = doc.documentURL().path()
    pcnt = doc.pageCount()

    for i in range(0, pcnt):
        page = doc.pageAtIndex_(i)
        # a page without any text can yield None instead of a string
        m = re.search(re_pattern, page.string() or u'')
        if not m:
            ret = max(1, ret)
            print 'no matching string in page %d of %s' % (i + 1, path.encode('utf-8'))
            continue    # ignore this page
        name = m.group()
        doc1 = PDFDocument.alloc().initWithData_(page.dataRepresentation()) # doc for this page
        if not doc1.writeToFile_('%s/%s.pdf' % (outdir, name)):
            ret = max(2, ret)
            print 'failed to save page %d of %s' % (i + 1, path.encode('utf-8'))

sys.exit(ret)
EOF"
        set {r, err} to {result, 0}
    on error errs number errn
        -- a non-zero exit status of the shell command arrives here as errn
        set {r, err} to {errs, errn}
    end try

    if err = 0 then
        return "OK"
    else if err = 1 then
        log r
        return "Warning"
    else
        log r
        return "Error"
    end if
end main


--END OF APPLESCRIPT



Posted on Apr 1, 2022 8:03 AM

Reply
Question marked as Top-ranking reply

Posted on Apr 1, 2022 9:58 AM

Apple removed Python from macOS 12.3, so here is an AppleScript/Objective-C approach. It prompts for a search string, and then one or multiple PDFs. Each PDF is searched in its entirety for the search string and if found, a boolean true is returned, otherwise false. It does not loop through PDF pages, nor split those pages into separate PDFs.


use framework "Foundation"
use framework "PDFKit"
use AppleScript version "2.4"
use scripting additions

property NSString : a reference to current application's NSString
property NSURL : a reference to current application's NSURL
property PDFDocument : a reference to current application's PDFDocument

-- Ask for the text to look for; an empty answer ends the script.
set searchStr to text returned of (display dialog "Enter your search string: " default answer "") as text
if searchStr is "" or searchStr is missing value then return

-- Let the user pick one or more PDF files.
set thesePDF to (choose file of type {"PDF"} with multiple selections allowed) as list

-- Report one line per PDF: result, quoted search string, file path.
repeat with aPDF in thesePDF
	set truth to (my find_PDF_String(aPDF, searchStr)) as boolean
	-- Coerce the boolean to text first: with a non-text left operand, & builds
	-- a list, and log would print that list instead of a tab-separated line.
	log ((truth as text) & tab & searchStr's quoted form & tab & tab & (aPDF as text))
end repeat
return

on find_PDF_String(aPDF, thestring)
	-- Returns true when thestring occurs anywhere in the PDF file aPDF,
	-- false when it does not or when the file cannot be opened as a PDF.
	--   aPDF      : file reference (alias) whose POSIX path is searched
	--   thestring : text to search for
	set aurl to NSURL's fileURLWithPath:(NSString's stringWithString:(POSIX path of aPDF))
	set pdf to PDFDocument's alloc()'s initWithURL:aurl
	-- Without a document there is nothing to send findString: to.
	if pdf is missing value then return false
	# NSCaseInsensitiveSearch = 1, NSLiteralSearch = 2 and 1 | 2 = 3
	set options to 3
	# searches the entire PDF for thestring
	set found to pdf's findString:thestring withOptions:options
	-- An empty result list means no match anywhere in the document.
	return (found as list) is not {}
end find_PDF_String



9 replies
Question marked as Top-ranking reply

Apr 1, 2022 9:58 AM in response to Phillip Briggs

Apple removed Python from macOS 12.3, so here is an AppleScript/Objective-C approach. It prompts for a search string, and then one or multiple PDFs. Each PDF is searched in its entirety for the search string and if found, a boolean true is returned, otherwise false. It does not loop through PDF pages, nor split those pages into separate PDFs.


use framework "Foundation"
use framework "PDFKit"
use AppleScript version "2.4"
use scripting additions

property NSString : a reference to current application's NSString
property NSURL : a reference to current application's NSURL
property PDFDocument : a reference to current application's PDFDocument

-- Ask for the text to look for; an empty answer ends the script.
set searchStr to text returned of (display dialog "Enter your search string: " default answer "") as text
if searchStr is "" or searchStr is missing value then return

-- Let the user pick one or more PDF files.
set thesePDF to (choose file of type {"PDF"} with multiple selections allowed) as list

-- Report one line per PDF: result, quoted search string, file path.
repeat with aPDF in thesePDF
	set truth to (my find_PDF_String(aPDF, searchStr)) as boolean
	-- Coerce the boolean to text first: with a non-text left operand, & builds
	-- a list, and log would print that list instead of a tab-separated line.
	log ((truth as text) & tab & searchStr's quoted form & tab & tab & (aPDF as text))
end repeat
return

on find_PDF_String(aPDF, thestring)
	-- Returns true when thestring occurs anywhere in the PDF file aPDF,
	-- false when it does not or when the file cannot be opened as a PDF.
	--   aPDF      : file reference (alias) whose POSIX path is searched
	--   thestring : text to search for
	set aurl to NSURL's fileURLWithPath:(NSString's stringWithString:(POSIX path of aPDF))
	set pdf to PDFDocument's alloc()'s initWithURL:aurl
	-- Without a document there is nothing to send findString: to.
	if pdf is missing value then return false
	# NSCaseInsensitiveSearch = 1, NSLiteralSearch = 2 and 1 | 2 = 3
	set options to 3
	# searches the entire PDF for thestring
	set found to pdf's findString:thestring withOptions:options
	-- An empty result list means no match anywhere in the document.
	return (found as list) is not {}
end find_PDF_String



Apr 2, 2022 10:21 AM in response to Phillip Briggs

Phil,


The following is a revised version of the original Hiroto script you posted. It omits a regular expression search for the search string, on each page, and instead uses the PDFDocument's find method that searches all Pages of the PDF. If there is a match, then it places 0 in the ret variable and if no matches, it sets it to 1. I excised the entire section that splits out and writes the individual PDF pages.


on main(inputs, outputFolder, params)
    (*
        Search each input PDF, across all of its pages, for a text string.

        list inputs : list of POSIX path of input files
        string outputFolder : POSIX path of output folder (passed through, not used)
        list params : optional parameters as list of strings
                item 1 => search string
        return string : "OK" | "Warning" | "Error"
                "OK"      the string was found in every input PDF
                "Warning" at least one PDF does not contain the string
                "Error"   a PDF could not be opened, or arguments were missing

        * to be invoked by Esko Automation Engine
        cf.
        https://docs.esko.com/docs/en-us/automationengine/16/userguide/pdf/ae_ScriptRunner.pdf

        NOTE: relies on /usr/bin/python (Python 2 with PyObjC), which Apple
        removed in macOS 12.3.
    *)

    -- With no inputs "text 1 thru -2" below would throw, and with no search
    -- string the embedded script would index past the end of sys.argv.
    if (count of inputs) = 0 or (count of params) = 0 then
        log "main: no input files or no search string given"
        return "Error"
    end if

    -- Build argv[1]: every input path shell-quoted, joined with ":"
    set a1 to ""
    repeat with a in inputs
        set a1 to a1 & a's quoted form & ":"
    end repeat
    set a1 to a1's text 1 thru -2 -- remove last excessive :

    -- argv[2]: output folder
    set a2 to outputFolder's quoted form

    -- argv[3..]: additional parameters, each shell-quoted
    set ar to ""
    repeat with a in params
        set ar to ar & a's quoted form & space
    end repeat

    set args to a1 & space & a2 & space & ar

    try
        -- The Python source must start in column 0 and the EOF terminator must
        -- stand alone at the start of its line; indenting them inside this
        -- string literal breaks both the heredoc and the Python parse.
        do shell script "/usr/bin/python <<'EOF' - " & args & "
# coding: utf-8
#   sys.argv[1]   : input files separated by :
#   sys.argv[2]   : output directory (unused here)
#   sys.argv[3..] : additional parameters
#       sys.argv[3] => search string

import sys
import os
from Foundation import NSURL
from Quartz.PDFKit import PDFDocument

uargv = [ a.decode('utf-8') for a in sys.argv ]
search_string = uargv[3]
options = 3  # NSCaseInsensitiveSearch | NSLiteralSearch
ret = 0      # 0 = found everywhere, 1 = missing in some PDF, 2 = open failure

# accept .pdf in any letter case
for f in [ a for a in uargv[1].split(':') if os.path.splitext(a)[1].lower() == '.pdf' ]:
    url = NSURL.fileURLWithPath_(f)
    doc = PDFDocument.alloc().initWithURL_(url)
    if doc is None:
        # file is missing or is not a readable PDF
        ret = max(2, ret)
        print 'failed to open %s' % f.encode('utf-8')
        continue

    # found is an array of PDFSelections covering the whole document,
    # or an empty array if nothing matched
    found = doc.findString_withOptions_(search_string, options)
    if found is None or len(found) == 0:
        # keep the worst status seen so far instead of letting the
        # last file overwrite earlier results
        ret = max(1, ret)
        print 'no matching string in %s' % f.encode('utf-8')

sys.exit(ret)
EOF"
        set {r, err} to {result, 0}
    on error errs number errn
        -- a non-zero exit status of the shell command arrives here as errn
        set {r, err} to {errs, errn}
    end try

    if err = 0 then
        return "OK"
    else if err = 1 then
        log r
        return "Warning"
    else
        log r
        return "Error"
    end if
end main


Apr 2, 2022 3:41 AM in response to VikingOSX

Wow that's very good of you! With Esko I run the scripts on a Mac running Mojave, so if Python is being removed by Apple your original script might be the way forward? I just have to Esko'ify it like in Hiroto's original, where the Applescript 'wrapper' calls the python script. Esko Automation Engine passes the filename, path, search string etc. into the script, in this case the search string variable is: sys.argv[3] =>


--APPLESCRIPT

on main(inputs, outputFolder, params)

(*

        list inputs : list of POSIX path of input files

        string outputFolder : POSIX path of output folder

        list params : optional parameters as list of strings

        return string : "OK" | "Warning" | "Error"

     

        * to be invoked by Esko Automation Engine


        cf.


        https://docs.esko.com/docs/en-us/automationengine/16/userguide/pdf/ae_ScriptRunner.pdf


    *)


set a1 to ""

repeat with a in inputs

set a1 to a1 & a's quoted form & ":"

end repeat

set a1 to a1's text 1 thru -2 -- remove last excessive :

set a2 to outputFolder's quoted form

set ar to ""

repeat with a in params

set ar to ar & a's quoted form & space

end repeat

set args to a1 & space & a2 & space & ar


try


...

This thread has been closed by the system or the community team. You may vote for any posts you find helpful, or search the Community for additional answers.

Applescript to find a text string in a PDF

Welcome to Apple Support Community
A forum where Apple customers help each other with their products. Get started with your Apple Account.