diff --git a/src/test.py b/src/test.py deleted file mode 100644 index 1135c6a8cc9ccd80f4d020d92fa110ed2593eb82..0000000000000000000000000000000000000000 --- a/src/test.py +++ /dev/null @@ -1,1760 +0,0 @@ -#!/usr/bin/env python - -__description__ = 'pdf-parser, use it to parse a PDF document' -__author__ = 'Didier Stevens' -__version__ = '0.7.8' -__date__ = '2023/01/03' -__minimum_python_version__ = (2, 5, 1) -__maximum_python_version__ = (3, 11, 1) - -""" -Source code put in public domain by Didier Stevens, no Copyright -https://DidierStevens.com -Use at your own risk - -History: - 2008/05/02: continue - 2008/05/03: continue - 2008/06/02: streams - 2008/10/19: refactor, grep & extract functionality - 2008/10/20: reference - 2008/10/21: cleanup - 2008/11/12: V0.3 dictionary parser - 2008/11/13: option elements - 2008/11/14: continue - 2009/05/05: added /ASCIIHexDecode support (thanks Justin Prosco) - 2009/05/11: V0.3.1 updated usage, added --verbose and --extract - 2009/07/16: V0.3.2 Added Canonicalize (thanks Justin Prosco) - 2009/07/18: bugfix EqualCanonical - 2009/07/24: V0.3.3 Added --hash option - 2009/07/25: EqualCanonical for option --type, added option --nocanonicalizedoutput - 2009/07/28: V0.3.4 Added ASCII85Decode support - 2009/08/01: V0.3.5 Updated ASCIIHexDecode to support whitespace obfuscation - 2009/08/30: V0.3.6 TestPythonVersion - 2010/01/08: V0.3.7 Added RLE and LZW support (thanks pARODY); added dump option - 2010/01/09: Fixed parsing of incomplete startxref - 2010/09/22: V0.3.8 Changed dump option, updated PrettyPrint, added debug option - 2011/12/17: fixed bugs empty objects - 2012/03/11: V0.3.9 fixed bugs double nested [] in PrettyPrintSub (thanks kurt) - 2013/01/11: V0.3.10 Extract and dump bug fixes by Priit; added content option - 2013/02/16: Performance improvement in cPDFTokenizer by using StringIO for token building by Christophe Vandeplas; xrange replaced with range - 2013/02/16: V0.4.0 added http/https support; added error handling for 
missing file or URL; ; added support for ZIP file with password 'infected' - 2013/03/13: V0.4.1 fixes for Python 3 - 2013/04/11: V0.4.2 modified PrettyPrintSub for strings with unprintable characters - 2013/05/04: Added options searchstream, unfiltered, casesensitive, regex - 2013/09/18: V0.4.3 fixed regression bug -w option - 2014/09/25: V0.5.0 added option -g - 2014/09/29: Added PrintGenerateObject and PrintOutputObject - 2014/12/05: V0.6.0 Added YARA support - 2014/12/09: cleanup, refactoring - 2014/12/13: Python 3 fixes - 2015/01/11: Added support for multiple YARA rule files; added request to search in trailer - 2015/01/31: V0.6.1 Added optionyarastrings - 2015/02/09: Added decoders - 2015/04/05: V0.6.2 Added generateembedded - 2015/04/06: fixed bug reported by Kurt for stream produced by Ghostscript where endstream is not preceded by whitespace; fixed prettyprint bug - 2015/04/24: V0.6.3 when option dump's filename is -, content is dumped to stdout - 2015/08/12: V0.6.4 option hash now also calculates hashes of streams when selecting or searching objects; and displays hexasciidump first line - 2016/07/27: V0.6.5 bugfix whitespace 0x00 0x0C after stream 0x0D 0x0A reported by @mr_me - 2016/11/20: V0.6.6 added workaround zlib errors FlateDecode - 2016/12/17: V0.6.7 added option -k - 2017/01/07: V0.6.8 changed cPDFParseDictionary to handle strings () with % character - 2017/10/28: fixed bug - 2017/10/29: added # support for option -y - 2018/06/29: V0.6.9 added option --overridingfilters - 2018/10/20: added keywords to statistics - 2019/02/22: V0.7.0 added option -O --objstm to parse the stream of /ObjStm objects, inspired by a contributor wishing anonymity - 2019/03/01: V0.7.1 added ContainsName for correct keyword statistics (-a) - 2019/04/12: V0.7.2 Python 2.6.6 compatibility fix - 2019/07/30: bug fixes (including fixes Josef Hinteregger) - 2019/09/26: V0.7.3 added multiple id selection to option -o; added man page (-m); added environment variable 
PDFPARSER_OPTIONS; bug fixes - 2019/11/05: V0.7.4 fixed plugin path when compiled with pyinstaller, replaced eval with int - 2021/07/03: V0.7.5 bug fixes; fixed ASCII85Decode Python 3 bug thanks to R Primus - 2021/11/23: V0.7.6 Python 3 bug fixes - 2022/05/24: bug fixes - 2022/11/09: V0.7.7 added support for environment variable DSS_DEFAULT_HASH_ALGORITHMS - 2023/01/03: V0.7.8 added unreferenced objects to statistics - -Todo: - - handle printf todo - - support for JS hex string EC61C64349DB8D88AF0523C4C06E0F4D.pdf.vir - -""" - -import re -import optparse -import zlib -import binascii -import hashlib -import sys -import zipfile -import time -import os -import textwrap -if sys.version_info[0] >= 3: - from io import StringIO - import urllib.request - urllib23 = urllib.request - import configparser as ConfigParser -else: - from cStringIO import StringIO - import urllib2 - urllib23 = urllib2 - import ConfigParser -try: - import yara -except: - pass - -CHAR_WHITESPACE = 1 -CHAR_DELIMITER = 2 -CHAR_REGULAR = 3 - -CONTEXT_NONE = 1 -CONTEXT_OBJ = 2 -CONTEXT_XREF = 3 -CONTEXT_TRAILER = 4 - -PDF_ELEMENT_COMMENT = 1 -PDF_ELEMENT_INDIRECT_OBJECT = 2 -PDF_ELEMENT_XREF = 3 -PDF_ELEMENT_TRAILER = 4 -PDF_ELEMENT_STARTXREF = 5 -PDF_ELEMENT_MALFORMED = 6 - -dumplinelength = 16 - -def PrintManual(): - manual = ''' -Manual: - -This manual is a work in progress. - -There is a free PDF analysis book: -https://blog.didierstevens.com/2010/09/26/free-malicious-pdf-analysis-e-book/ - -Option -o is used to select objects by id. Provide a single id or multiple ids separated by a comma (,). - -When environment variable PDFPARSER_OPTIONS is defined, the options it defines are added implicitely to the command line arguments. -Use this to define options you want included with each use of pdf-parser.py. -Like option -O, to parse stream objects (/ObjStm). -By defining PDFPARSER_OPTIONS=-O, pdf-parser will always parse stream objects (when found). -PS: this feature is experimental. 
#Convert 2 Bytes If Python 3
def C2BIP3(string):
    """Return *string* as a bytes object when running on Python 3.

    On Python 2 the argument is returned unchanged. On Python 3 a bytes
    object is returned as-is; anything else is converted by taking the
    ordinal of every character, one byte per character.

    Raises ValueError when a character's ordinal does not fit in one byte.
    """
    if sys.version_info[0] <= 2:
        return string
    if isinstance(string, bytes):
        return string
    return bytes([ord(character) for character in string])

#Convert 2 String If Python 3
def C2SIP3(bytes):
    """Return *bytes* as a text string when running on Python 3.

    On Python 2 the argument is returned unchanged. On Python 3 every byte
    value becomes the character with the same ordinal. A value that is
    already text is returned as-is instead of raising TypeError.
    """
    if sys.version_info[0] <= 2:
        return bytes
    if isinstance(bytes, str):
        # Already text: chr() on a character would raise TypeError.
        return bytes
    return ''.join([chr(byte) for byte in bytes])

# CIC: Call If Callable
def CIC(expression):
    """Return expression() when *expression* is callable, else *expression* itself."""
    if callable(expression):
        return expression()
    return expression

# IFF: IF Function
def IFF(expression, valueTrue, valueFalse):
    """Return *valueTrue* or *valueFalse* depending on the truth of *expression*.

    The selected value is passed through CIC, so a callable is only invoked
    when its branch is the one chosen.
    """
    if expression:
        return CIC(valueTrue)
    return CIC(valueFalse)

def Timestamp(epoch=None):
    """Format a local time as 'YYYYMMDD-HHMMSS'.

    epoch: seconds since the epoch; the current time is used when None.
    """
    if epoch is None:
        localTime = time.localtime()
    else:
        localTime = time.localtime(epoch)
    return '%04d%02d%02d-%02d%02d%02d' % localTime[0:6]

def CopyWithoutWhiteSpace(content):
    """Return a new list with the tokens of *content* whose type is not CHAR_WHITESPACE."""
    return [token for token in content if token[0] != CHAR_WHITESPACE]

def Obj2Str(content):
    """Concatenate the repr (without surrounding quotes) of every non-whitespace token value."""
    return ''.join([repr(token[1])[1:-1] for token in CopyWithoutWhiteSpace(content)])
def CharacterClass(byte):
    """Classify a byte value as CHAR_WHITESPACE, CHAR_DELIMITER or CHAR_REGULAR.

    Whitespace is NUL, HT, LF, FF, CR and space; delimiters are
    ( ) < > [ ] { } / and %. Every other value is regular.
    """
    if byte in (0, 9, 10, 12, 13, 32):
        return CHAR_WHITESPACE
    if byte in (0x28, 0x29, 0x3C, 0x3E, 0x5B, 0x5D, 0x7B, 0x7D, 0x2F, 0x25):
        return CHAR_DELIMITER
    return CHAR_REGULAR

def IsNumeric(str):
    """Return a match object when *str* starts with one or more digits, else None.

    Only the leading characters are checked: a value such as '12abc' also
    produces a match.
    """
    leadingDigits = '^[0-9]+'
    return re.match(leadingDigits, str)
self.byte == 0x3C: - self.byte = self.oPDF.byte() - if self.byte == 0x3C: - return (CHAR_DELIMITER, '<<') - else: - self.oPDF.unget(self.byte) - return (CHAR_DELIMITER, '<') - elif self.byte == 0x3E: - self.byte = self.oPDF.byte() - if self.byte == 0x3E: - return (CHAR_DELIMITER, '>>') - else: - self.oPDF.unget(self.byte) - return (CHAR_DELIMITER, '>') - elif self.byte == 0x25: - file_str = StringIO() - while self.byte != None: - file_str.write(chr(self.byte)) - if self.byte == 10 or self.byte == 13: - self.byte = self.oPDF.byte() - break - self.byte = self.oPDF.byte() - if self.byte != None: - if self.byte == 10: - file_str.write(chr(self.byte)) - else: - self.oPDF.unget(self.byte) - else: - self.oPDF = None - self.token = file_str.getvalue() - return (CHAR_DELIMITER, self.token) - return (CHAR_DELIMITER, chr(self.byte)) - - def TokenIgnoreWhiteSpace(self): - token = self.Token() - while token != None and token[0] == CHAR_WHITESPACE: - token = self.Token() - return token - - def Tokens(self): - tokens = [] - token = self.Token() - while token != None: - tokens.append(token) - token = self.Token() - return tokens - - def unget(self, byte): - self.ungetted.append(byte) - -class cPDFParser: - def __init__(self, file, verbose=False, extract=None, objstm=None): - self.context = CONTEXT_NONE - self.content = [] - self.oPDFTokenizer = cPDFTokenizer(file) - self.verbose = verbose - self.extract = extract - self.objstm = objstm - - def GetObject(self): - while True: - if self.context == CONTEXT_OBJ: - self.token = self.oPDFTokenizer.Token() - else: - self.token = self.oPDFTokenizer.TokenIgnoreWhiteSpace() - if self.token: - if self.token[0] == CHAR_DELIMITER: - if self.token[1][0] == '%': - if self.context == CONTEXT_OBJ: - self.content.append(self.token) - else: - return cPDFElementComment(self.token[1]) - elif self.token[1] == '/': - self.token2 = self.oPDFTokenizer.Token() - if self.token2[0] == CHAR_REGULAR: - if self.context != CONTEXT_NONE: - 
self.content.append((CHAR_DELIMITER, self.token[1] + self.token2[1])) - elif self.verbose: - print('todo 1: %s' % (self.token[1] + self.token2[1])) - else: - self.oPDFTokenizer.unget(self.token2) - if self.context != CONTEXT_NONE: - self.content.append(self.token) - elif self.verbose: - print('todo 2: %d %s' % (self.token[0], repr(self.token[1]))) - elif self.context != CONTEXT_NONE: - self.content.append(self.token) - elif self.verbose: - print('todo 3: %d %s' % (self.token[0], repr(self.token[1]))) - elif self.token[0] == CHAR_WHITESPACE: - if self.context != CONTEXT_NONE: - self.content.append(self.token) - elif self.verbose: - print('todo 4: %d %s' % (self.token[0], repr(self.token[1]))) - else: - if self.context == CONTEXT_OBJ: - if self.token[1] == 'endobj': - self.oPDFElementIndirectObject = cPDFElementIndirectObject(self.objectId, self.objectVersion, self.content, self.objstm) - self.context = CONTEXT_NONE - self.content = [] - return self.oPDFElementIndirectObject - else: - self.content.append(self.token) - elif self.context == CONTEXT_TRAILER: - if self.token[1] == 'startxref' or self.token[1] == 'xref': - self.oPDFElementTrailer = cPDFElementTrailer(self.content) - self.oPDFTokenizer.unget(self.token) - self.context = CONTEXT_NONE - self.content = [] - return self.oPDFElementTrailer - else: - self.content.append(self.token) - elif self.context == CONTEXT_XREF: - if self.token[1] == 'trailer' or self.token[1] == 'xref': - self.oPDFElementXref = cPDFElementXref(self.content) - self.oPDFTokenizer.unget(self.token) - self.context = CONTEXT_NONE - self.content = [] - return self.oPDFElementXref - else: - self.content.append(self.token) - else: - if IsNumeric(self.token[1]): - self.token2 = self.oPDFTokenizer.TokenIgnoreWhiteSpace() - if IsNumeric(self.token2[1]): - self.token3 = self.oPDFTokenizer.TokenIgnoreWhiteSpace() - if self.token3[1] == 'obj': - self.objectId = int(self.token[1], 10) - self.objectVersion = int(self.token2[1], 10) - self.context = 
class cPDFElementComment:
    """A comment element; *comment* is the comment token text."""

    def __init__(self, comment):
        self.type = PDF_ELEMENT_COMMENT
        self.comment = comment

class cPDFElementXref:
    """An xref element; *content* is the list of (type, value) tokens collected for it."""

    def __init__(self, content):
        self.type = PDF_ELEMENT_XREF
        self.content = content

class cPDFElementTrailer:
    """A trailer element; *content* is the list of (type, value) tokens collected for it."""

    def __init__(self, content):
        self.type = PDF_ELEMENT_TRAILER
        self.content = content

    def Contains(self, keyword):
        """Case-insensitively search *keyword* in the canonicalized tokens preceding 'stream'."""
        parts = []
        for token in self.content:
            if token[1] == 'stream':
                break
            parts.append(Canonicalize(token[1]))
        return ''.join(parts).upper().find(keyword.upper()) != -1

def IIf(expr, truepart, falsepart):
    """Return *truepart* when *expr* is true, else *falsepart* (both are always evaluated by the caller)."""
    if expr:
        return truepart
    else:
        return falsepart

class cPDFElementIndirectObject:
    """An indirect object: id, version and the (type, value) tokens between 'obj' and 'endobj'."""

    def __init__(self, id, version, content, objstm=None):
        self.type = PDF_ELEMENT_INDIRECT_OBJECT
        self.id = id
        self.version = version
        self.content = content
        self.objstm = objstm
        #fix stream for Ghostscript bug reported by Kurt
        # When 'endstream' is glued to the end of the preceding token, split
        # it off so that it becomes a token of its own.
        if self.ContainsStream():
            position = len(self.content) - 1
            if position < 0:
                return
            # Bounds check first, so the index is never evaluated at -1.
            while position >= 0 and self.content[position][0] == CHAR_WHITESPACE:
                position -= 1
            if position < 0:
                return
            lastType, lastValue = self.content[position][0], self.content[position][1]
            if lastValue.endswith('endstream\n'):
                self.content = self.content[0:position] + [(lastType, lastValue[:-len('endstream\n')])] + [(CHAR_REGULAR, 'endstream')] + self.content[position+1:]
                return
            if lastType != CHAR_REGULAR:
                return
            if lastValue == 'endstream':
                return
            if not lastValue.endswith('endstream'):
                return
            self.content = self.content[0:position] + [(lastType, lastValue[:-len('endstream')])] + [(lastType, 'endstream')] + self.content[position+1:]

    def GetType(self):
        """Return the token following /Type in the top-level dictionary, or '' when absent."""
        content = CopyWithoutWhiteSpace(self.content)
        dictionary = 0
        for i in range(0, len(content)):
            if content[i][0] == CHAR_DELIMITER and content[i][1] == '<<':
                dictionary += 1
            if content[i][0] == CHAR_DELIMITER and content[i][1] == '>>':
                dictionary -= 1
            if dictionary == 1 and content[i][0] == CHAR_DELIMITER and EqualCanonical(content[i][1], '/Type') and i < len(content) - 1:
                return content[i+1][1]
        return ''

    def GetReferences(self):
        """Return a list of (id, version, 'R') string tuples for every '<num> <num> R' token triple."""
        content = CopyWithoutWhiteSpace(self.content)
        references = []
        for i in range(0, len(content)):
            if i > 1 and content[i][0] == CHAR_REGULAR and content[i][1] == 'R' and content[i-2][0] == CHAR_REGULAR and IsNumeric(content[i-2][1]) and content[i-1][0] == CHAR_REGULAR and IsNumeric(content[i-1][1]):
                references.append((content[i-2][1], content[i-1][1], content[i][1]))
        return references

    def References(self, index):
        """Return True when one of the references has *index* as its id (compared as string)."""
        for ref in self.GetReferences():
            if ref[0] == index:
                return True
        return False

    def ContainsStream(self):
        """Return the tokens preceding the first regular 'stream' token, or False when there is none."""
        for i in range(0, len(self.content)):
            if self.content[i][0] == CHAR_REGULAR and self.content[i][1] == 'stream':
                return self.content[0:i]
        return False

    def Contains(self, keyword):
        """Case-insensitively search *keyword* in the canonicalized tokens preceding 'stream'."""
        parts = []
        for token in self.content:
            if token[1] == 'stream':
                break
            parts.append(Canonicalize(token[1]))
        return ''.join(parts).upper().find(keyword.upper()) != -1

    def ContainsName(self, keyword):
        """Return True when a delimiter token before 'stream' canonicalizes to *keyword*."""
        for token in self.content:
            if token[1] == 'stream':
                return False
            if token[0] == CHAR_DELIMITER and EqualCanonical(token[1], keyword):
                return True
        return False

    def StreamContains(self, keyword, filter, casesensitive, regex, overridingfilters):
        """Search *keyword* in the stream data.

        The filtered stream is searched when *filter* is true, unless the
        stream has no filters, in which case the unfiltered data is used.
        Returns False when the object has no stream, a match object or None
        for a *regex* search, and a boolean otherwise.
        """
        if not self.ContainsStream():
            return False
        streamData = self.Stream(filter, overridingfilters)
        if filter and streamData == 'No filters':
            streamData = self.Stream(False, overridingfilters)
        if isinstance(streamData, bytes):
            keyword = keyword.encode()
        if regex:
            return re.search(keyword, streamData, IIf(casesensitive, 0, re.I))
        elif casesensitive:
            return keyword in streamData
        else:
            return keyword.lower() in streamData.lower()

    def Stream(self, filter=True, overridingfilters=''):
        """Extract the stream data of this object.

        filter: when true the data is passed through Decompress.
        overridingfilters: '' to use the filters found in the dictionary,
            'raw' to return the data undecoded, or a space separated list of
            filter names to use instead.

        Returns the data (or the message string produced by Decompress).
        When no complete 'stream' ... 'endstream' sequence is found, the list
        of filters collected so far is returned (an empty list when none).
        """
        state = 'start'
        countDirectories = 0
        parts = []  # pieces of stream data, joined once at the end
        filters = []
        for token in self.content:
            tokenType, tokenValue = token[0], token[1]
            if state == 'start':
                if tokenType == CHAR_DELIMITER and tokenValue == '<<':
                    countDirectories += 1
                if tokenType == CHAR_DELIMITER and tokenValue == '>>':
                    countDirectories -= 1
                if countDirectories == 1 and tokenType == CHAR_DELIMITER and EqualCanonical(tokenValue, '/Filter'):
                    state = 'filter'
                elif countDirectories == 0 and tokenType == CHAR_REGULAR and tokenValue == 'stream':
                    state = 'stream-whitespace'
            elif state == 'filter':
                if tokenType == CHAR_DELIMITER and tokenValue[0] == '/':
                    filters = [tokenValue]
                    state = 'search-stream'
                elif tokenType == CHAR_DELIMITER and tokenValue == '[':
                    state = 'filter-list'
            elif state == 'filter-list':
                if tokenType == CHAR_DELIMITER and tokenValue[0] == '/':
                    filters.append(tokenValue)
                elif tokenType == CHAR_DELIMITER and tokenValue == ']':
                    state = 'search-stream'
            elif state == 'search-stream':
                if tokenType == CHAR_REGULAR and tokenValue == 'stream':
                    state = 'stream-whitespace'
            elif state == 'stream-whitespace':
                if tokenType == CHAR_WHITESPACE:
                    # Drop the end-of-line that follows the 'stream' keyword,
                    # keep the whitespace after it as stream data.
                    if tokenValue.startswith('\x0D\x0A') and len(tokenValue) > 2:
                        parts.append(tokenValue[2:])
                    elif tokenValue.startswith('\x0A') and len(tokenValue) > 1:
                        parts.append(tokenValue[1:])
                else:
                    parts.append(tokenValue)
                state = 'stream-concat'
            elif state == 'stream-concat':
                if tokenType == CHAR_REGULAR and tokenValue == 'endstream':
                    data = ''.join(parts)
                    if filter:
                        if overridingfilters == '':
                            return self.Decompress(data, filters)
                        elif overridingfilters == 'raw':
                            return data
                        else:
                            return self.Decompress(data, overridingfilters.split(' '))
                    else:
                        return data
                else:
                    parts.append(tokenValue)
            else:
                return 'Unexpected filter state'
        return filters

    def Decompress(self, data, filters):
        """Apply *filters* in order to *data*.

        Returns the decoded data, 'No filters' when *filters* is empty, or a
        message string describing the filter that failed or is unsupported.
        """
        for filter in filters:
            if EqualCanonical(filter, '/FlateDecode') or EqualCanonical(filter, '/Fl'):
                try:
                    data = FlateDecode(data)
                except zlib.error as e:
                    message = 'FlateDecode decompress failed'
                    if len(data) > 0:
                        # A one-element slice works for text and for bytes.
                        firstByte = ord(data[0:1])
                        if firstByte & 0x0F != 8:
                            message += ', unexpected compression method: %02x' % firstByte
                    # Exceptions have no .message attribute on Python 3.
                    return message + '. zlib.error %s' % e
            elif EqualCanonical(filter, '/ASCIIHexDecode') or EqualCanonical(filter, '/AHx'):
                try:
                    data = ASCIIHexDecode(data)
                except Exception:
                    return 'ASCIIHexDecode decompress failed'
            elif EqualCanonical(filter, '/ASCII85Decode') or EqualCanonical(filter, '/A85'):
                try:
                    # A previous filter may have produced bytes.
                    if isinstance(data, str):
                        data = ASCII85Decode(data.rstrip('>'))
                    else:
                        data = ASCII85Decode(data.rstrip(b'>'))
                except Exception:
                    return 'ASCII85Decode decompress failed'
            elif EqualCanonical(filter, '/LZWDecode') or EqualCanonical(filter, '/LZW'):
                try:
                    data = LZWDecode(data)
                except Exception:
                    return 'LZWDecode decompress failed'
            elif EqualCanonical(filter, '/RunLengthDecode') or EqualCanonical(filter, '/R'):
                try:
                    data = RunLengthDecode(data)
                except Exception:
                    return 'RunLengthDecode decompress failed'
#            elif i.startswith('/CC')                        # CCITTFaxDecode
#            elif i.startswith('/DCT')                       # DCTDecode
            else:
                return 'Unsupported filter: %s' % repr(filters)
        if len(filters) == 0:
            return 'No filters'
        else:
            return data

    def StreamYARAMatch(self, rules, decoders, decoderoptions, filter, overridingfilters):
        """Run the YARA *rules* on the stream data and on the output of every decoder.

        Returns None when the object has no stream, otherwise a list of
        [decoder name, yara results] pairs for every non-empty result.
        """
        if not self.ContainsStream():
            return None
        streamData = self.Stream(filter, overridingfilters)
        if filter and streamData == 'No filters':
            streamData = self.Stream(False, overridingfilters)

        oDecoders = [cIdentity(streamData, None)]
        for cDecoder in decoders:
            try:
                oDecoder = cDecoder(streamData, decoderoptions)
                oDecoders.append(oDecoder)
            except Exception:
                print('Error instantiating decoder: %s' % cDecoder.name)
                raise  # bare raise keeps the original traceback
        results = []
        for oDecoder in oDecoders:
            while oDecoder.Available():
                yaraResults = rules.match(data=oDecoder.Decode())
                if yaraResults != []:
                    results.append([oDecoder.Name(), yaraResults])

        return results

class cPDFElementStartxref:
    """A startxref element; *index* is the integer that follows the keyword."""

    def __init__(self, index):
        self.type = PDF_ELEMENT_STARTXREF
        self.index = index

class cPDFElementMalformed:
    """Malformed content; *content* is the concatenated token text."""

    def __init__(self, content):
        self.type = PDF_ELEMENT_MALFORMED
        self.content = content

def TrimLWhiteSpace(data):
    """Return *data* without its leading CHAR_WHITESPACE tokens."""
    start = 0
    while start < len(data) and data[start][0] == CHAR_WHITESPACE:
        start += 1
    if start == 0:
        return data
    return data[start:]

def TrimRWhiteSpace(data):
    """Return *data* without its trailing CHAR_WHITESPACE tokens."""
    end = len(data)
    while end > 0 and data[end - 1][0] == CHAR_WHITESPACE:
        end -= 1
    if end == len(data):
        return data
    return data[:end]
value.append(tokens[0][1]) - dictionary.append((key, value)) - value = [] - state = 1 - elif value == [] and tokens[0][1] == '(': - value.append(tokens[0][1]) - elif value != [] and value[0] == '(' and tokens[0][1] != ')': - if tokens[0][1][0] == '%': - tokens = [tokens[0]] + cPDFTokenizer(StringIO(tokens[0][1][1:])).Tokens() + tokens[1:] - value.append('%') - else: - value.append(tokens[0][1]) - elif value != [] and value[0] == '(' and tokens[0][1] == ')': - value.append(tokens[0][1]) - balanced = 0 - for item in value: - if item == '(': - balanced += 1 - elif item == ')': - balanced -= 1 - if balanced < 0 and self.verbose: - print('todo 11: ' + repr(value)) - if balanced < 1: - dictionary.append((key, value)) - value = [] - state = 1 - elif value != [] and tokens[0][1][0] == '/': - dictionary.append((key, value)) - key = ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput) - value = [] - state = 2 - else: - value.append(ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput)) - tokens = tokens[1:] - return None, tokens - - def Retrieve(self): - return self.parsed - - def PrettyPrintSubElement(self, prefix, e): - if e[1] == []: - print('%s %s' % (prefix, e[0])) - elif type(e[1][0]) == type(''): - if len(e[1]) == 3 and IsNumeric(e[1][0]) and e[1][1] == '0' and e[1][2] == 'R': - joiner = ' ' - else: - joiner = '' - value = joiner.join(e[1]).strip() - reprValue = repr(value) - if "'" + value + "'" != reprValue: - value = reprValue - print('%s %s %s' % (prefix, e[0], value)) - else: - print('%s %s' % (prefix, e[0])) - self.PrettyPrintSub(prefix + ' ', e[1]) - - def PrettyPrintSub(self, prefix, dictionary): - if dictionary != None: - print('%s<<' % prefix) - for e in dictionary: - self.PrettyPrintSubElement(prefix, e) - print('%s>>' % prefix) - - def PrettyPrint(self, prefix): - self.PrettyPrintSub(prefix, self.parsed) - - def Get(self, select): - for key, value in self.parsed: - if key == select: - return value - return None - - def 
def FormatOutput(data, raw):
    """Format *data* for printing.

    raw: when true, a list of (type, value) tokens is joined into one string
        of its values and anything else is returned unchanged; when false
        the ascii() (Python 3) or repr() (Python 2) of *data* is returned.
    """
    if raw:
        if isinstance(data, list):
            return ''.join([token[1] for token in data])
        return data
    if sys.version_info[0] > 2:
        return ascii(data)
    return repr(data)

#Fix for http://bugs.python.org/issue11395
def StdoutWriteChunked(data):
    """Write *data* to stdout as binary data.

    On Python 3 the data goes to sys.stdout.buffer in one call; text is
    first converted with C2BIP3 because the buffer only accepts bytes.
    On Python 2 the data is written in chunks of 10000 characters, and the
    function returns silently when flushing raises IOError.
    """
    if sys.version_info[0] > 2:
        sys.stdout.buffer.write(C2BIP3(data))
        return
    while data != '':
        sys.stdout.write(data[0:10000])
        try:
            sys.stdout.flush()
        except IOError:
            return
        data = data[10000:]

def IfWIN32SetBinary(io):
    """Switch the file object *io* to binary mode when running on win32."""
    if sys.platform == 'win32':
        import msvcrt
        msvcrt.setmode(io.fileno(), os.O_BINARY)
def Canonicalize(sIn):
    """Decode the #xx hexadecimal escapes of a PDF name.

    Only strings that start with '/' are touched; everything else is
    returned unchanged. A '#' that is not followed by exactly two
    hexadecimal digits is kept literally (a sign or a space after '#' is
    not a valid escape).
    """
    if not sIn.startswith('/') or '#' not in sIn:
        return sIn
    return re.sub('#([0-9A-Fa-f]{2})', lambda oMatch: chr(int(oMatch.group(1), 16)), sIn)
def EqualCanonical(s1, s2):
    """Return True when the canonicalized form of *s1* equals *s2*."""
    return Canonicalize(s1) == s2

def ConditionalCanonicalize(sIn, nocanonicalizedoutput):
    """Return *sIn* unchanged when *nocanonicalizedoutput* is true, else its canonicalized form."""
    if nocanonicalizedoutput:
        return sIn
    else:
        return Canonicalize(sIn)

def _BytesToText(data):
    """Return *data* as text, mapping every byte to the character with the same ordinal."""
    if isinstance(data, str):
        return data
    return data.decode('latin-1')

# http://code.google.com/p/pdfminerr/source/browse/trunk/pdfminer/pdfminer/ascii85.py
def ASCII85Decode(data):
    """Decode ASCII85 encoded *data* (text or bytes) and return the result as bytes.

    Characters outside '!'..'u', 'z' and '~' are skipped. Decoding stops at
    the first '~'; a partial group that is not followed by '~' is dropped.

    Raises ValueError when 'z' appears inside a 5-character group.
    """
    import struct
    chunks = []
    n = b = 0
    for c in _BytesToText(data):
        if '!' <= c <= 'u':
            n += 1
            b = b*85+(ord(c)-33)
            if n == 5:
                chunks.append(struct.pack('>L', b))
                n = b = 0
        elif c == 'z':
            # Explicit check: an assert would be stripped under -O.
            if n != 0:
                raise ValueError("ASCII85Decode: 'z' inside a 5-character group")
            chunks.append(b'\0\0\0\0')
        elif c == '~':
            if n:
                # Pad the partial group with the highest digit, keep n-1 bytes.
                for _ in range(5-n):
                    b = b*85+84
                chunks.append(struct.pack('>L', b)[:n-1])
            break
    return b''.join(chunks)

def ASCIIHexDecode(data):
    """Decode ASCIIHex encoded *data* (text or bytes).

    PDF whitespace (NUL, HT, LF, FF, CR, space) is skipped and trailing '>'
    characters are removed. An odd number of digits is completed with a
    final '0'. Any other non-hexadecimal character makes binascii.unhexlify
    raise.
    """
    digits = ''.join([c for c in _BytesToText(data) if c not in '\x00\t\n\x0c\r ']).rstrip('>')
    if len(digits) % 2 == 1:
        digits += '0'
    return binascii.unhexlify(digits)

# if inflating fails, we try to inflate byte per byte (sample 4da299d6e52bbb79c0ac00bad6a1d51d4d5fe42965a8d94e88a359e5277117e2)
def FlateDecode(data):
    """Inflate zlib compressed *data*.

    When inflating the whole buffer fails and the data is longer than 10
    bytes, the data is fed to a decompression object one byte at a time;
    if at most 2 bytes could not be processed, the output produced so far
    is returned. Otherwise the original zlib.error is re-raised.
    """
    # bytes need no conversion; anything else goes through C2BIP3.
    if isinstance(data, bytes):
        raw = data
    else:
        raw = C2BIP3(data)
    try:
        return zlib.decompress(raw)
    except zlib.error:
        if len(raw) <= 10:
            raise
        oDecompress = zlib.decompressobj()
        chunks = []
        count = 0
        for position in range(len(raw)):
            try:
                # One-byte slices stay bytes on Python 3 (indexing yields int).
                chunks.append(oDecompress.decompress(raw[position:position + 1]))
            except zlib.error:
                break
            count += 1
        if len(raw) - count <= 2:
            return b''.join(chunks)
        raise

def RunLengthDecode(data):
    """Decode RunLength encoded *data* (text or bytes); the result has the same type.

    A length byte of 0..127 copies the next length+1 bytes, 129..255
    repeats the next byte 257-length times and 128 ends the data. Decoding
    also stops at the end of *data* when the end marker is missing.
    """
    pieces = []
    position = 0
    total = len(data)
    while position < total:
        runLength = ord(data[position:position + 1])
        position += 1
        if runLength == 128:
            break
        if runLength < 128:
            pieces.append(data[position:position + runLength + 1])
            position += runLength + 1
        else:
            pieces.append(data[position:position + 1] * (257 - runLength))
            position += 1
    # data[:0] is the empty value of the input type (text or bytes).
    return data[:0].join(pieces)

#### LZW code sourced from pdfminer
# Copyright (c) 2004-2009 Yusuke Shinyama <yusuke at cs dot nyu dot edu>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, -# and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -class LZWDecoder(object): - def __init__(self, fp): - self.fp = fp - self.buff = 0 - self.bpos = 8 - self.nbits = 9 - self.table = None - self.prevbuf = None - return - - def readbits(self, bits): - v = 0 - while 1: - # the number of remaining bits we can get from the current buffer. - r = 8-self.bpos - if bits <= r: - # |-----8-bits-----| - # |-bpos-|-bits-| | - # | |----r----| - v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1)) - self.bpos += bits - break - else: - # |-----8-bits-----| - # |-bpos-|---bits----... - # | |----r----| - v = (v<<r) | (self.buff & ((1<<r)-1)) - bits -= r - x = self.fp.read(1) - if not x: raise EOFError - self.buff = ord(x) - self.bpos = 0 - return v - - def feed(self, code): - x = '' - if code == 256: - self.table = [ chr(c) for c in range(256) ] # 0-255 - self.table.append(None) # 256 - self.table.append(None) # 257 - self.prevbuf = '' - self.nbits = 9 - elif code == 257: - pass - elif not self.prevbuf: - x = self.prevbuf = self.table[code] - else: - if code < len(self.table): - x = self.table[code] - self.table.append(self.prevbuf+x[0]) - else: - self.table.append(self.prevbuf+self.prevbuf[0]) - x = self.table[code] - l = len(self.table) - if l == 511: - self.nbits = 10 - elif l == 1023: - self.nbits = 11 - elif l == 2047: - self.nbits = 12 - self.prevbuf = x - return x - - def run(self): - while 1: - try: - code = self.readbits(self.nbits) - except EOFError: - break - x = self.feed(code) - yield x - return - -#### - -def LZWDecode(data): - return ''.join(LZWDecoder(StringIO(data)).run()) - -def PrintGenerateObject(object, options, newId=None): - if newId == None: - objectId = object.id - else: - objectId = newId - dataPrecedingStream = object.ContainsStream() - if 
dataPrecedingStream: - if options.filter: - decompressed = object.Stream(True, options.overridingfilters) - if decompressed == 'No filters' or decompressed.startswith('Unsupported filter: '): - print(' oPDF.stream(%d, %d, %s, %s)' % (objectId, object.version, repr(object.Stream(False, options.overridingfilters).rstrip()), repr(re.sub('/Length\s+\d+', '/Length %d', FormatOutput(dataPrecedingStream, True)).strip()))) - else: - dictionary = FormatOutput(dataPrecedingStream, True) - dictionary = re.sub(r'/Length\s+\d+', '', dictionary) - dictionary = re.sub(r'/Filter\s*/[a-zA-Z0-9]+', '', dictionary) - dictionary = re.sub(r'/Filter\s*\[.+\]', '', dictionary) - dictionary = re.sub(r'^\s*<<', '', dictionary) - dictionary = re.sub(r'>>\s*$', '', dictionary) - dictionary = dictionary.strip() - print(" oPDF.stream2(%d, %d, %s, %s, 'f')" % (objectId, object.version, repr(decompressed.rstrip()), repr(dictionary))) - else: - print(' oPDF.stream(%d, %d, %s, %s)' % (objectId, object.version, repr(object.Stream(False, options.overridingfilters).rstrip()), repr(re.sub('/Length\s+\d+', '/Length %d', FormatOutput(dataPrecedingStream, True)).strip()))) - else: - print(' oPDF.indirectobject(%d, %d, %s)' % (objectId, object.version, repr(FormatOutput(object.content, True).strip()))) - -def PrintObject(object, options): - if options.generate: - PrintGenerateObject(object, options) - else: - PrintOutputObject(object, options) - -def File2Strings(filename): - try: - f = open(filename, 'r') - except: - return None - try: - return map(lambda line:line.rstrip('\n'), f.readlines()) - except: - return None - finally: - f.close() - -def ProcessAt(argument): - if argument.startswith('@'): - strings = File2Strings(argument[1:]) - if strings == None: - raise Exception('Error reading %s' % argument) - else: - return strings - else: - return [argument] - -def YARACompile(ruledata): - if ruledata.startswith('#'): - if ruledata.startswith('#h#'): - rule = binascii.a2b_hex(ruledata[3:]) - elif 
ruledata.startswith('#b#'): - rule = binascii.a2b_base64(ruledata[3:]) - elif ruledata.startswith('#s#'): - rule = 'rule string {strings: $a = "%s" ascii wide nocase condition: $a}' % ruledata[3:] - elif ruledata.startswith('#q#'): - rule = ruledata[3:].replace("'", '"') - else: - rule = ruledata[1:] - return yara.compile(source=rule) - else: - dFilepaths = {} - if os.path.isdir(ruledata): - for root, dirs, files in os.walk(ruledata): - for file in files: - filename = os.path.join(root, file) - dFilepaths[filename] = filename - else: - for filename in ProcessAt(ruledata): - dFilepaths[filename] = filename - return yara.compile(filepaths=dFilepaths) - -def AddDecoder(cClass): - global decoders - - decoders.append(cClass) - -class cDecoderParent(): - pass - -def GetScriptPath(): - if getattr(sys, 'frozen', False): - return os.path.dirname(sys.executable) - else: - return os.path.dirname(sys.argv[0]) - -def LoadDecoders(decoders, verbose): - if decoders == '': - return - scriptPath = GetScriptPath() - for decoder in sum(map(ProcessAt, decoders.split(',')), []): - try: - if not decoder.lower().endswith('.py'): - decoder += '.py' - if os.path.dirname(decoder) == '': - if not os.path.exists(decoder): - scriptDecoder = os.path.join(scriptPath, decoder) - if os.path.exists(scriptDecoder): - decoder = scriptDecoder - exec(open(decoder, 'r').read(), globals(), globals()) - except Exception as e: - print('Error loading decoder: %s' % decoder) - if verbose: - raise e - -class cIdentity(cDecoderParent): - name = 'Identity function decoder' - - def __init__(self, stream, options): - self.stream = stream - self.options = options - self.available = True - - def Available(self): - return self.available - - def Decode(self): - self.available = False - return self.stream - - def Name(self): - return '' - -def DecodeFunction(decoders, options, stream): - if decoders == []: - return stream - return decoders[0](stream, options.decoderoptions).Decode() - -class cDumpStream(): - def 
__init__(self): - self.text = '' - - def Addline(self, line): - if line != '': - self.text += line + '\n' - - def Content(self): - return self.text - -def HexDump(data): - oDumpStream = cDumpStream() - hexDump = '' - for i, b in enumerate(data): - if i % dumplinelength == 0 and hexDump != '': - oDumpStream.Addline(hexDump) - hexDump = '' - hexDump += IFF(hexDump == '', '', ' ') + '%02X' % ord(b) - oDumpStream.Addline(hexDump) - return oDumpStream.Content() - -def CombineHexAscii(hexDump, asciiDump): - if hexDump == '': - return '' - return hexDump + ' ' + (' ' * (3 * (dumplinelength - len(asciiDump)))) + asciiDump - -def HexAsciiDump(data): - oDumpStream = cDumpStream() - hexDump = '' - asciiDump = '' - for i, b in enumerate(data): - if i % dumplinelength == 0: - if hexDump != '': - oDumpStream.Addline(CombineHexAscii(hexDump, asciiDump)) - hexDump = '%08X:' % i - asciiDump = '' - hexDump+= ' %02X' % ord(b) - asciiDump += IFF(ord(b) >= 32, b, '.') - oDumpStream.Addline(CombineHexAscii(hexDump, asciiDump)) - return oDumpStream.Content() - -def HexAsciiDumpLine(data): - return HexAsciiDump(data[0:16])[10:-1] - -def ParseINIFile(): - oConfigParser = ConfigParser.ConfigParser(allow_no_value=True) - oConfigParser.optionxform = str - oConfigParser.read(os.path.join(GetScriptPath(), 'pdfid.ini')) - keywords = [] - if oConfigParser.has_section('keywords'): - for key, value in oConfigParser.items('keywords'): - if not key in keywords: - keywords.append(key) - return keywords - -def MatchObjectID(id, selection): - return str(id) in selection.split(',') - -def GetArguments(): - arguments = sys.argv[1:] - envvar = os.getenv('PDFPARSER_OPTIONS') - if envvar == None: - return arguments - return envvar.split(' ') + arguments - -class cHashCRC32(): - def __init__(self): - self.crc32 = None - - def update(self, data): - self.crc32 = zlib.crc32(data) - - def hexdigest(self): - return '%08x' % (self.crc32 & 0xffffffff) - -class cHashChecksum8(): - def __init__(self): - self.sum = 0 - 
- def update(self, data): - if sys.version_info[0] >= 3: - self.sum += sum(data) - else: - self.sum += sum(map(ord, data)) - - def hexdigest(self): - return '%08x' % (self.sum) - -dSpecialHashes = {'crc32': cHashCRC32, 'checksum8': cHashChecksum8} - -def GetHashObjects(algorithms): - global dSpecialHashes - - dHashes = {} - - if algorithms == '': - algorithms = os.getenv('DSS_DEFAULT_HASH_ALGORITHMS', 'md5') - if ',' in algorithms: - hashes = algorithms.split(',') - else: - hashes = algorithms.split(';') - for name in hashes: - if not name in dSpecialHashes.keys() and not name in hashlib.algorithms_available: - print('Error: unknown hash algorithm: %s' % name) - print('Available hash algorithms: ' + ' '.join([name for name in list(hashlib.algorithms_available)] + list(dSpecialHashes.keys()))) - return [], {} - elif name in dSpecialHashes.keys(): - dHashes[name] = dSpecialHashes[name]() - else: - dHashes[name] = hashlib.new(name) - - return hashes, dHashes - -def CalculateChosenHash(data): - hashes, dHashes = GetHashObjects('') - dHashes[hashes[0]].update(data) - return dHashes[hashes[0]].hexdigest(), hashes[0] - -def Main(): - """pdf-parser, use it to parse a PDF document - """ - - global decoders - - oParser = optparse.OptionParser(usage='usage: %prog [options] pdf-file|zip-file|url\n' + __description__, version='%prog ' + __version__) - oParser.add_option('-m', '--man', action='store_true', default=False, help='Print manual') - oParser.add_option('-s', '--search', help='string to search in indirect objects (except streams)') - oParser.add_option('-f', '--filter', action='store_true', default=False, help='pass stream object through filters (FlateDecode, ASCIIHexDecode, ASCII85Decode, LZWDecode and RunLengthDecode only)') - oParser.add_option('-o', '--object', help='id(s) of indirect object(s) to select, use comma (,) to separate ids (version independent)') - oParser.add_option('-r', '--reference', help='id of indirect object being referenced (version 
independent)') - oParser.add_option('-e', '--elements', help='type of elements to select (cxtsi)') - oParser.add_option('-w', '--raw', action='store_true', default=False, help='raw output for data and filters') - oParser.add_option('-a', '--stats', action='store_true', default=False, help='display stats for pdf document') - oParser.add_option('-t', '--type', help='type of indirect object to select') - oParser.add_option('-O', '--objstm', action='store_true', default=False, help='parse stream of /ObjStm objects') - oParser.add_option('-v', '--verbose', action='store_true', default=False, help='display malformed PDF elements') - oParser.add_option('-x', '--extract', help='filename to extract malformed content to') - oParser.add_option('-H', '--hash', action='store_true', default=False, help='display hash of objects') - oParser.add_option('-n', '--nocanonicalizedoutput', action='store_true', default=False, help='do not canonicalize the output') - oParser.add_option('-d', '--dump', help='filename to dump stream content to') - oParser.add_option('-D', '--debug', action='store_true', default=False, help='display debug info') - oParser.add_option('-c', '--content', action='store_true', default=False, help='display the content for objects without streams or with streams without filters') - oParser.add_option('--searchstream', help='string to search in streams') - oParser.add_option('--unfiltered', action='store_true', default=False, help='search in unfiltered streams') - oParser.add_option('--casesensitive', action='store_true', default=False, help='case sensitive search in streams') - oParser.add_option('--regex', action='store_true', default=False, help='use regex to search in streams') - oParser.add_option('--overridingfilters', type=str, default='', help='override filters with given filters (use raw for the raw stream content)') - oParser.add_option('-g', '--generate', action='store_true', default=False, help='generate a Python program that creates the parsed PDF 
file') - oParser.add_option('--generateembedded', type=int, default=0, help='generate a Python program that embeds the selected indirect object as a file') - oParser.add_option('-y', '--yara', help='YARA rule (or directory or @file) to check streams (can be used with option --unfiltered)') - oParser.add_option('--yarastrings', action='store_true', default=False, help='Print YARA strings') - oParser.add_option('--decoders', type=str, default='', help='decoders to load (separate decoders with a comma , ; @file supported)') - oParser.add_option('--decoderoptions', type=str, default='', help='options for the decoder') - oParser.add_option('-k', '--key', help='key to search in dictionaries') - (options, args) = oParser.parse_args(GetArguments()) - - if options.man: - oParser.print_help() - PrintManual() - return 0 - - if len(args) != 1: - oParser.print_help() - print('') - print(' %s' % __description__) - print(' Source code put in the public domain by Didier Stevens, no Copyright') - print(' Use at your own risk') - print(' https://DidierStevens.com') - - else: - decoders = [] - LoadDecoders(options.decoders, True) - - oPDFParser = cPDFParser(args[0], options.verbose, options.extract) - cntComment = 0 - cntXref = 0 - cntTrailer = 0 - cntStartXref = 0 - cntIndirectObject = 0 - dicObjectTypes = {} - objectsAll = set() - objectsReferenced = set() - objectsWithStream = [] - keywords = ['/JS', '/JavaScript', '/AA', '/OpenAction', '/AcroForm', '/RichMedia', '/Launch', '/EmbeddedFile', '/XFA', '/URI'] - for extrakeyword in ParseINIFile(): - if not extrakeyword in keywords: - keywords.append(extrakeyword) - -# dKeywords = {keyword: [] for keyword in keywords} -# Done for compatibility with 2.6.6 - dKeywords = {} - for keyword in keywords: - dKeywords[keyword] = [] - - selectComment = False - selectXref = False - selectTrailer = False - selectStartXref = False - selectIndirectObject = False - if options.elements: - for c in options.elements: - if c == 'c': - selectComment = 
True - elif c == 'x': - selectXref = True - elif c == 't': - selectTrailer = True - elif c == 's': - selectStartXref = True - elif c == 'i': - selectIndirectObject = True - else: - print('Error: unknown --elements value %s' % c) - return - else: - selectIndirectObject = True - if not options.search and not options.object and not options.reference and not options.type and not options.searchstream and not options.key: - selectComment = True - selectXref = True - selectTrailer = True - selectStartXref = True - if options.search or options.key or options.reference: - selectTrailer = True - - if options.type == '-': - optionsType = '' - else: - optionsType = options.type - - if options.generate or options.generateembedded != 0: - savedRoot = ['1', '0', 'R'] - print('#!/usr/bin/python') - print('') - print('"""') - print('') - print('Program generated by pdf-parser.py by Didier Stevens') - print('https://DidierStevens.com') - print('Use at your own risk') - print('') - print('Input PDF file: %s' % args[0]) - print('This Python program was created on: %s' % Timestamp()) - print('') - print('"""') - print('') - print('import mPDF') - print('import sys') - print('') - print('def Main():') - print(' if len(sys.argv) != 2:') - print(" print('Usage: %s pdf-file' % sys.argv[0])") - print(' return') - print(' oPDF = mPDF.cPDF(sys.argv[1])') - - if options.generateembedded != 0: - print(" oPDF.header('1.1')") - print(r" oPDF.comment('\xd0\xd0\xd0\xd0')") - print(r" oPDF.indirectobject(1, 0, '<<\r\n /Type /Catalog\r\n /Outlines 2 0 R\r\n /Pages 3 0 R\r\n /Names << /EmbeddedFiles << /Names [(test.bin) 7 0 R] >> >>\r\n>>')") - print(r" oPDF.indirectobject(2, 0, '<<\r\n /Type /Outlines\r\n /Count 0\r\n>>')") - print(r" oPDF.indirectobject(3, 0, '<<\r\n /Type /Pages\r\n /Kids [4 0 R]\r\n /Count 1\r\n>>')") - print(r" oPDF.indirectobject(4, 0, '<<\r\n /Type /Page\r\n /Parent 3 0 R\r\n /MediaBox [0 0 612 792]\r\n /Contents 5 0 R\r\n /Resources <<\r\n /ProcSet [/PDF /Text]\r\n /Font << 
/F1 6 0 R >>\r\n >>\r\n>>')") - print(r" oPDF.stream(5, 0, 'BT /F1 12 Tf 70 700 Td 15 TL (This PDF document embeds file test.bin) Tj ET', '<< /Length %d >>')") - print(r" oPDF.indirectobject(6, 0, '<<\r\n /Type /Font\r\n /Subtype /Type1\r\n /Name /F1\r\n /BaseFont /Helvetica\r\n /Encoding /MacRomanEncoding\r\n>>')") - print(r" oPDF.indirectobject(7, 0, '<<\r\n /Type /Filespec\r\n /F (test.bin)\r\n /EF << /F 8 0 R >>\r\n>>')") - - if options.yara != None: - if not 'yara' in sys.modules: - print('Error: option yara requires the YARA Python module.') - return - rules = YARACompile(options.yara) - - oPDFParserOBJSTM = None - while True: - if oPDFParserOBJSTM == None: - object = oPDFParser.GetObject() - else: - object = oPDFParserOBJSTM.GetObject() - if object == None: - oPDFParserOBJSTM = None - object = oPDFParser.GetObject() - if options.objstm and hasattr(object, 'GetType') and EqualCanonical(object.GetType(), '/ObjStm') and object.ContainsStream(): - # parsing objects inside an /ObjStm object by extracting & parsing the stream content to create a synthesized PDF document, that is then parsed by cPDFParser - oPDFParseDictionary = cPDFParseDictionary(object.ContainsStream(), options.nocanonicalizedoutput) - numberOfObjects = int(oPDFParseDictionary.Get('/N')[0]) - offsetFirstObject = int(oPDFParseDictionary.Get('/First')[0]) - indexes = list(map(int, C2SIP3(object.Stream())[:offsetFirstObject].strip().split(' '))) - if len(indexes) % 2 != 0 or len(indexes) / 2 != numberOfObjects: - raise Exception('Error in index of /ObjStm stream') - streamObject = C2SIP3(object.Stream()[offsetFirstObject:]) - synthesizedPDF = '' - while len(indexes) > 0: - objectNumber = indexes[0] - offset = indexes[1] - indexes = indexes[2:] - if len(indexes) >= 2: - offsetNextObject = indexes[1] - else: - offsetNextObject = len(streamObject) - synthesizedPDF += '%d 0 obj\n%s\nendobj\n' % (objectNumber, streamObject[offset:offsetNextObject]) - oPDFParserOBJSTM = 
cPDFParser(StringIO(synthesizedPDF), options.verbose, options.extract, (object.id, object.version)) - if object != None: - if options.stats: - if object.type == PDF_ELEMENT_COMMENT: - cntComment += 1 - elif object.type == PDF_ELEMENT_XREF: - cntXref += 1 - elif object.type == PDF_ELEMENT_TRAILER: - cntTrailer += 1 - oPDFParseDictionary = cPDFParseDictionary(object.content[1:], options.nocanonicalizedoutput) - for keyTrailer, valueTrailer in oPDFParseDictionary.parsed: - if len(valueTrailer) == 3 and valueTrailer[2] == 'R' and IsNumeric(valueTrailer[0]) and IsNumeric(valueTrailer[1]): - objectsReferenced.add(tuple(valueTrailer)) - elif object.type == PDF_ELEMENT_STARTXREF: - cntStartXref += 1 - elif object.type == PDF_ELEMENT_INDIRECT_OBJECT: - cntIndirectObject += 1 - type1 = object.GetType() - if not type1 in dicObjectTypes: - dicObjectTypes[type1] = [object.id] - else: - dicObjectTypes[type1].append(object.id) - for keyword in dKeywords.keys(): - if object.ContainsName(keyword): - dKeywords[keyword].append(object.id) - if object.ContainsStream(): - objectsWithStream.append(object.id) - for reference in object.GetReferences(): - objectsReferenced.add(reference) - objectsAll.add((str(object.id), str(object.version), 'R')) - else: - if object.type == PDF_ELEMENT_COMMENT and selectComment: - if options.generate: - comment = object.comment[1:].rstrip() - if re.match('PDF-\d\.\d', comment): - print(" oPDF.header('%s')" % comment[4:]) - elif comment != '%EOF': - print(' oPDF.comment(%s)' % repr(comment)) - elif options.yara == None and options.generateembedded == 0: - print('PDF Comment %s' % FormatOutput(object.comment, options.raw)) - print('') - elif object.type == PDF_ELEMENT_XREF and selectXref: - if not options.generate and options.yara == None and options.generateembedded == 0: - if options.debug: - print('xref %s' % FormatOutput(object.content, options.raw)) - else: - print('xref') - print('') - elif object.type == PDF_ELEMENT_TRAILER and selectTrailer: - 
oPDFParseDictionary = cPDFParseDictionary(object.content[1:], options.nocanonicalizedoutput) - if options.generate: - result = oPDFParseDictionary.Get('/Root') - if result != None: - savedRoot = result - elif options.yara == None and options.generateembedded == 0: - if not options.search and not options.key and not options.reference or options.search and object.Contains(options.search): - if oPDFParseDictionary == None: - print('trailer %s' % FormatOutput(object.content, options.raw)) - else: - print('trailer') - oPDFParseDictionary.PrettyPrint(' ') - print('') - elif options.key: - if oPDFParseDictionary.parsed != None: - result = oPDFParseDictionary.GetNested(options.key) - if result != None: - print(result) - elif options.reference: - for key, value in oPDFParseDictionary.Retrieve(): - if value == [str(options.reference), '0', 'R']: - print('trailer') - oPDFParseDictionary.PrettyPrint(' ') - elif object.type == PDF_ELEMENT_STARTXREF and selectStartXref: - if not options.generate and options.yara == None and options.generateembedded == 0: - print('startxref %d' % object.index) - print('') - elif object.type == PDF_ELEMENT_INDIRECT_OBJECT and selectIndirectObject: - if options.search: - if object.Contains(options.search): - PrintObject(object, options) - elif options.key: - contentDictionary = object.ContainsStream() - if not contentDictionary: - contentDictionary = object.content[1:] - oPDFParseDictionary = cPDFParseDictionary(contentDictionary, options.nocanonicalizedoutput) - if oPDFParseDictionary.parsed != None: - result = oPDFParseDictionary.GetNested(options.key) - if result != None: - print(result) - elif options.object: - if MatchObjectID(object.id, options.object): - PrintObject(object, options) - elif options.reference: - if object.References(options.reference): - PrintObject(object, options) - elif options.type: - if EqualCanonical(object.GetType(), optionsType): - PrintObject(object, options) - elif options.hash: - print('obj %d %d' % (object.id, 
object.version)) - rawContent = FormatOutput(object.content, True) - hashHexdigest, hashAlgo = CalculateChosenHash(rawContent.encode('latin')) - print(' len: %d %s: %s' % (len(rawContent), hashAlgo, hashHexdigest)) - print('') - elif options.searchstream: - if object.StreamContains(options.searchstream, not options.unfiltered, options.casesensitive, options.regex, options.overridingfilters): - PrintObject(object, options) - elif options.yara != None: - results = object.StreamYARAMatch(rules, decoders, options.decoderoptions, not options.unfiltered, options.overridingfilters) - if results != None and results != []: - for result in results: - for yaraResult in result[1]: - print('YARA rule%s: %s (%s)' % (IFF(result[0] == '', '', ' (stream decoder: %s)' % result[0]), yaraResult.rule, yaraResult.namespace)) - if options.yarastrings: - for stringdata in yaraResult.strings: - print('%06x %s:' % (stringdata[0], stringdata[1])) - print(' %s' % binascii.hexlify(C2BIP3(stringdata[2]))) - print(' %s' % repr(stringdata[2])) - PrintObject(object, options) - elif options.generateembedded != 0: - if object.id == options.generateembedded: - PrintGenerateObject(object, options, 8) - else: - PrintObject(object, options) - elif object.type == PDF_ELEMENT_MALFORMED: - try: - fExtract = open(options.extract, 'wb') - try: - fExtract.write(C2BIP3(object.content)) - except: - print('Error writing file %s' % options.extract) - fExtract.close() - except: - print('Error writing file %s' % options.extract) - else: - break - - if options.stats: - print('Comment: %s' % cntComment) - print('XREF: %s' % cntXref) - print('Trailer: %s' % cntTrailer) - print('StartXref: %s' % cntStartXref) - print('Indirect object: %s' % cntIndirectObject) - print('Indirect objects with a stream: %s' % ', '.join([str(id) for id in objectsWithStream])) - objectsUnreferenced = objectsAll - objectsReferenced - for key in sorted(dicObjectTypes.keys()): - print(' %s %d: %s' % (key, len(dicObjectTypes[key]), ', 
'.join(map(lambda x: '%d' % x, dicObjectTypes[key])))) - if len(objectsUnreferenced) > 0: - print('Unreferenced indirect objects: %s' % ', '.join([' '.join(reference) for reference in sorted(objectsUnreferenced, key=lambda a: int(a[0]))])) - if '/ObjStm' in dicObjectTypes: - objectsUnreferencedMinusObjStm = set() - for unreferencedObject in objectsUnreferenced: - if not int(unreferencedObject[0]) in dicObjectTypes['/ObjStm']: - objectsUnreferencedMinusObjStm.add(unreferencedObject) - print('Unreferenced indirect objects without /ObjStm objects: %s' % ', '.join([' '.join(reference) for reference in sorted(objectsUnreferencedMinusObjStm, key=lambda a: int(a[0]))])) - if sum(map(len, dKeywords.values())) > 0: - print('Search keywords:') - for keyword in keywords: - if len(dKeywords[keyword]) > 0: - print(' %s %d: %s' % (keyword, len(dKeywords[keyword]), ', '.join(map(lambda x: '%d' % x, dKeywords[keyword])))) - - if options.generate or options.generateembedded != 0: - print(" oPDF.xrefAndTrailer('%s')" % ' '.join(savedRoot)) - print('') - print("if __name__ == '__main__':") - print(' Main()') - -def TestPythonVersion(enforceMaximumVersion=False, enforceMinimumVersion=False): - if sys.version_info[0:3] > __maximum_python_version__: - if enforceMaximumVersion: - print('This program does not work with this version of Python (%d.%d.%d)' % sys.version_info[0:3]) - print('Please use Python version %d.%d.%d' % __maximum_python_version__) - sys.exit() - else: - print('This program has not been tested with this version of Python (%d.%d.%d)' % sys.version_info[0:3]) - print('Should you encounter problems, please use Python version %d.%d.%d' % __maximum_python_version__) - if sys.version_info[0:3] < __minimum_python_version__: - if enforceMinimumVersion: - print('This program does not work with this version of Python (%d.%d.%d)' % sys.version_info[0:3]) - print('Please use Python version %d.%d.%d' % __maximum_python_version__) - sys.exit() - else: - print('This program has not 
been tested with this version of Python (%d.%d.%d)' % sys.version_info[0:3]) - print('Should you encounter problems, please use Python version %d.%d.%d' % __maximum_python_version__) - -if __name__ == '__main__': - TestPythonVersion() - Main() diff --git a/src/time_data.py b/src/time_data.py new file mode 100644 index 0000000000000000000000000000000000000000..b9b5081e07cdcf3cec894638dd9274af8e35532d --- /dev/null +++ b/src/time_data.py @@ -0,0 +1,126 @@ +# source: https://gitlab.com/pavanello-research-group/dftpy/-/blob/5da59bf8/src/dftpy/time_data.py +# released under MIT license: +# https://gitlab.com/pavanello-research-group/dftpy/-/blob/5da59bf8/LICENSE.txt + +import time +from dftpy.mpi import sprint +from functools import wraps + + +class TimeObj(object): + """ + """ + + def __init__(self, **kwargs): + self.reset(**kwargs) + + def reset(self, **kwargs): + self.labels = [] + self.tic = {} + self.toc = {} + self.cost = {} + self.number = {} + + def Begin(self, label): + if label in self.tic: + self.number[label] += 1 + else: + self.labels.append(label) + self.number[label] = 1 + self.cost[label] = 0.0 + + self.tic[label] = time.time() + + def Time(self, label): + if label not in self.tic: + print('!WARN : You should call "Begin" before this label : {}'.format(label)) + t = 0 + else: + t = time.time() - self.tic[label] + return t + + def End(self, label): + if label not in self.tic: + print('!WARN : You should call "Begin" before this label : {}'.format(label)) + t = 0 + else: + self.toc[label] = time.time() + t = time.time() - self.tic[label] + self.cost[label] += t + return t + + def output(self, config=None, sort=0, lprint=False, comm=None, **kwargs): + """ + sort : Label(0), Cost(1), Number(2), Avg(3) + """ + column = { + 'label': 0, + 'cost': 1, + 'number': 2, + 'avg': 3, + } + if isinstance(sort, str) : sort = sort.lower() + if sort in column: + idx = column[sort] + elif isinstance(sort, (int, float)): + idx = int(sort) + if idx < 0 or idx > 3: + idx = 0 + 
else: + idx = 0 + sprint(format("Time information", "-^80"), comm=comm) + lenk = max(max([len(x) for x in self.cost]), 28) + fmth = "{:"+str(lenk)+"s}{:24s}{:16s}{:24s}" + sprint(fmth.format("Label", "Cost(s)", "Number", "Avg. Cost(s)"), comm=comm) + fmt = "{:"+str(lenk)+"s}{:<24.4f}{:<16d}{:<24.4f}" + if config: + if isinstance(config, dict) and not config["OUTPUT"]["time"]: + lprint = False + else: + lprint = True + if lprint: + info = [] + for key, cost in self.cost.items(): + if key == 'TOTAL': continue + item = [key, cost, self.number[key], cost / self.number[key]] + info.append(item) + for item in sorted(info, key=lambda d: d[idx]): + sprint(fmt.format(*item), comm=comm) + key = "TOTAL" + if key in self.cost : + sprint(fmt.format(key, self.cost[key], self.number[key], + self.cost[key] / self.number[key]), comm=comm) + + +TimeData = TimeObj() + + +def timer(label: str = None): + """ + A decorator times the function + Parameters + ---------- + label + + Returns + ------- + + """ + + def decorator(function): + @wraps(function) + def wrapper(*args, **kwargs): + tag = label + if tag is None: + if hasattr(function, '__qualname__'): + tag = function.__qualname__ + else : + tag = function.__class__.__name__ + TimeData.Begin(tag) + results = function(*args, **kwargs) + TimeData.End(tag) + return results + + return wrapper + + return decorator