1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
|
% Copyright (C) 2001-2019 Artifex Software, Inc.
% All Rights Reserved.
%
% This software is provided AS-IS with no warranty, either express or
% implied.
%
% This software is distributed under license and may not be copied,
% modified or distributed except as expressly authorized under the terms
% of the license contained in the file LICENSE in this distribution.
%
% Refer to licensing information at http://www.artifex.com or contact
% Artifex Software, Inc., 1305 Grant Avenue - Suite 200, Novato,
% CA 94945, U.S.A., +1(415)492-9861, for further information.
%
% pdf_rbld.ps - Rebuilding of broken PDF files (xref errors)
% This module contains routines that are used if we detect an error
% while reading the xref tables. These routines will scan the file and
% build an xref table by finding the objects. We also need to find the
% appropriate trailer dictionary. Note: One procedure is also used
% even if we do not need to rebuild a PDF file.
%
% This module cannot rebuild a PDF file which has had errors created inside
% of objects or binary data streams. It often succeeds with files that
% have had its end of lines converted between unix and dos versions.
% if true --> we have an object with duplicate object and generation numbers.
/dup_obj_gen_num //false def
% Note: This procedure is also used by non-rebuild code.
% Store a line in the xref array (Actually Objects and Generations arrays)
% <obj num> (strm num> <obj loc> <gen num> <rebuild>
% setxrefentry <obj num> strm num> <obj loc> <gen num>
/setxrefentry
{
5 1 roll
dup 65535 or 65535 ne {
( **** Error: Generation number out of 0..65535 range, assuming 0.\n)
pdfformaterror
( Output may be incorrect.\n) pdfformaterror
pop 0
} if
% We store generation numbers as value + 1
% We reserve 0 to indicate an free xref entry
1 add % increment generation number
% To save space, generations numbers are stored in a string unless we
% find a generation number greater than 255. If so then transfer to
% an array.
dup 255 gt {
Generations type /stringtype eq { % Convert Generations to an array.
Generations length array dup % Create new array
0 1 2 index length 1 sub { % Copy from old string to new array
Generations 1 index get put dup
} for
pop
/Generations exch store % Save new Generations array
} if
} if
% Verify that the new values are for a new object. If the current
% entry is null then we have a new entry.
Objects 4 index get //null eq {
ObjectStream 4 index 4 index cvx put % Save ObjectStream object number
Objects 4 index 3 index cvx put % Save object location
Generations 4 index 2 index put % Save geenration number
} {
% Verify that the new entry has at least as high a generaton number
% We accept equal entry number because we have found PDF files in
% which there are multiple objects with the same object and entry
% numbers. The normal xref logic only accepts the first such
% entry that it finds. However the 'rebuild PDF' logic can find
% both such entries. The correct one is usually the last one.
Generations 4 index get 1 index le {
%% Check if the object we already found was at the locaton specified
%% in the original xref (now stored in Orig_Objects). If so, prefer
%% that offset, otherwise prefer the later object.
%% NB check first to see that the object number is in the range of the original
%% xref. If it isn't, set the 'original' Offset to 0.
3 index Orig_Objects length le {Orig_Objects 4 index get}{0}ifelse
Objects 5 index get eq not
{
ObjectStream 4 index 4 index cvx put % Save ObjectStream object number
Objects 4 index 3 index cvx put % Save object location
Generations 4 index 2 index put % Save geenration number
} if
} if
% Set error flag if we have equal object and generation numbers
Generations 4 index get 1 index eq { /dup_obj_gen_num //true def } if
} 8 -1 roll { ifelse } { pop if } ifelse % Run 'else' only when rebuilding.
} bind executeonly def
% Print the contents of the xref array. This actually consists of three
% arrays (Objects, Generations, and ObjectStream).
/print_xref % - print_xref -
{ 0 1 Objects length 1 sub % stack: 0 1 <number of objects - 1>
{ dup =only % print object number
( ) print
dup Generations exch get 1 sub =only % print Generation number
( ) print
dup ObjectStream exch get ==only % print ObjectStream object number
( ) print
Objects exch get === % print object location
} for
flush
} bind executeonly def
% Get token from string and check its type
% <string> <type> typed_token <false> % no token or not match
% <string> <type> typed_token <obj> <last> <true> % matching token type
% Where last is the string remainder
/typed_token
{ exch
token_nofail % get token
{
dup type % stack: type last token type
4 -1 roll eq { % stack: last token bool
exch //true % desired object found - set exit status
} {
pop pop //false % not type - clear stack, set exit status
} ifelse
} {
pop //false % no token - pop type, set exit status
} ifelse % check if we got token
} bind executeonly def
% Allocate space for post_eof_count to be bound into procedures below.
/post_eof_count 0 def
% We want the location of the trailer dictionary at the start of file.
% First we will find the xref. Then we will skip over the xref entries
% to the trailer.
/search_start_trailer % - search_start_trailer <trailer loc>
{ % Read the first 300 bytes and check for xref
PDFfile 0 setfileposition
PDFfile bytesavailable post_eof_count sub % location of end of data
300 .min % block size to read
dup string 0 1 4 -1 roll 1 sub
{ 2 copy PDFfile read pop put pop } for
(xref) search {
% found 'xref'
exch pop exch pop length 4 add PDFfile exch setfileposition
PDFfile token pop % get starting entry - or 'trailer'
(trailer) ne { % if we do not already have 'trailer'
PDFfile token pop % get number of entries
PDFfile token pop pop % this moves us into the middle of the first entry
25 string exch % define working string for readline
{ PDFfile 1 index readline pop pop
} repeat % skip entries
pop % pop working string
PDFfile token pop pop % get 'trailer'
PDFfile fileposition % get file position
} if
} {
pop 0 % no xref, should not happen, report it upstrem
} ifelse
} bind executeonly def
%% Searches backwards from a specified point looking for a 'trailer' keyword.
%% position search_earlier_trailer position or 0
%%
%% Its is just possible that a 'trailer' keyword could straddle a buffer, in which case
%% we wouldn't find it. Given that this only executes for broken files anyway I don't
%% propose to worry about it at the moment, if anyone ever turns up an example we may
%% choose to enhance this routine further.
%%
/search_earlier_trailer {
{ % position
dup 0 gt { % position bool
dup 65535 .min exch % block_size position
1 index sub % block_size position-block_size
dup % block_size new_position new_position
PDFfile exch setfileposition % block_size new position
exch dup % new position block_size block_size
dup string 0 1 4 -1 roll 1 sub %
{2 copy PDFfile read pop put pop } for %
% new_position block size (...string from file....)
(trailer) search {
pop
{ search not { exit } if pop } loop
% determine where the trailer is in the file
% trailer loc = end loc - remaing string length
length % new_position block size string length
3 1 roll % string length new_position block size
add exch sub % string length - (new_position + block size)
} {
pop % discard old block size
pop 0
} ifelse
} {
pop 0 exit
}ifelse
%% We either have a position for a trailer, or 0 if we failed to find one
dup 0 eq not {
exit
}if
pop % renove the zero leaving the new start position
} loop
} bind executeonly def
% We want the location of the trailer dictionary at the end of file.
% We will read the last block of data and search for the final occurance
% of the word 'trailer'
/search_end_trailer % - search_end_trailer <trailer loc>
{ % Position to read block of data from the end of the file. Note: We ignore
% anything past the last %%EOF since this is not PDF data.
PDFfile 0 setfileposition
PDFfile bytesavailable post_eof_count sub % location of end of data
dup 65535 .min % block size to read
% stack: <file end pos> <block size>
% move file position to the start of the block
2 copy sub PDFfile exch setfileposition
% read block of data
dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for
% search for last occurance of 'trailer'
(trailer) search {
pop
{ search not { exit } if pop } loop
% determine where the trailer is in the file
% trailer loc = end loc - remaing string length
length sub
} {
pop pop 0
} ifelse
} bind executeonly def
% We want to find the trailer dictionary. There is a trailer dictionary
% for each xref object list. We only want the trailer dictionary associated
% with the first xref object list. In theory this can be anywhere in the
% file. However since we are trying to repair a broken file, we cannot simply
% follow the xref links. So we are falling back to a simple strategy. We
% find the specified location of the first xref list. If its location is in
% the first half of the file then we search for the first trailer dictionary
% at the start of the file. Otherwise we search for the last trailer at the
% end of the file.
/search_trailer % - search_trailer -
{ % Find the 'startxref' and associated position at the end of the file.
% Position to read block of data from the end of the file. Note: We
% actually end at the end of the last %%EOF since this is the end of the
% useful PDF data. (Some files contain trailing garbage.)
PDFfile 0 setfileposition
PDFfile bytesavailable % size of file
post_eof_count sub dup % location of end of last %%EOF
dup 4096 .min % block size to read
% stack: <useful file size> <useful file size file> <block size>
% move file position to the start of the block
2 copy sub PDFfile exch setfileposition
% read block of data
dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for
% search for last occurance of 'startxref'
//false % Assume that startxref not present
exch (startxref) {
search not { exit } if % Exit loop when no more startxref's
pop 3 -1 roll pop //true 3 1 roll % Idicate that we have found starxref
} loop
exch % Exch last string and 'found' flag
{
% determine where the startxref is in the file
% 'startxref' loc = end loc - remaing string length - 9 bytes
length sub 9 sub
% move the file to this position and read startxref and position
% First, read the startxref token, this should not fail.
PDFfile exch setfileposition PDFfile token
{
% Discard the startxref token and read the following token, which should be the offset.
pop PDFfile token_no_close
{
% Now chck its an integer
dup type /integertype eq not {
pop
% startxref not followed by integer. We will search the end of the file for trailer.
PDFfilelen
} if
} {
% startxref not followed by any token. We will search the end of the file for trailer.
PDFfilelen
} ifelse
} {
% No startxref token ? We will search the end of the file for trailer.
PDFfilelen
} ifelse
} {
% startxref not found. We will search the end of the file for trailer.
pop pop PDFfilelen
} ifelse
% compare xref position to 1/2 the length of the file and search for trailer
exch 2 div lt {
search_start_trailer dup 0 eq { pop search_end_trailer } if
} {
search_end_trailer dup 0 eq { pop search_start_trailer } if
} ifelse
dup 0 eq {
pop
( **** Error: Trailer dictionary not found.\n) pdfformaterror
( Output may be incorrect.\n) pdfformaterror
}{
% get the trailer
dup
PDFfile exch setfileposition % set to the specified trailer location
/dictlevelcount 0 def
PDFfile traileropdict .pdfrun % read trailer info
{
dup /Root known not {
( **** Warning: This trailer dictionary does not contain a /Root entry\n searching for a prior trailer.\n) pdfformatwarning
%% remove trailer dict copy, duplicate file position, then remove the length of 'trailer' so we don't
%% find the same one again....
pop dup 7 sub
search_earlier_trailer
}{exit}ifelse
dup 0 eq {
pop
( **** Error: Valid trailer not found.\n) pdfformaterror
( Output may be incorrect.\n) pdfformaterror
}{
PDFfile exch setfileposition % set to the specified trailer location
/dictlevelcount 0 def
PDFfile traileropdict .pdfrun % read trailer info
} ifelse
} loop
/Trailer exch def
pop
} ifelse
} bind executeonly def
% This routine will determine if there is stuff after the %%EOF. There is
% supposed to be only a line termination. However many real life files
% contain some garbage. This routine checks how much. We then ignore this
% stuff when we are scanning for objects.
/determine_post_eof_count % - determine_post_eof_count <count>
{ % Position to read block of data from the end of the file.
PDFfilelen % size of file
dup 4096 .min % file_size block_size
dup 3 1 roll sub % block_size file_size-block_size
PDFfile exch setfileposition % block_size
string PDFfile exch readstring pop % ()
% search for last occurance of 'startxref', '%%EOF' is often damaged
(startxref) search {
pop
{ search not { exit } if pop
} loop
% how much is left = remaining string length
% Now search for %%EO or try to read a number after 'startxref'.
(%%EO) search {
pop pop
} {
% Look for a number after startxref
{ dup token { pop exch pop } if
} stopped pop
} ifelse
length
} {
% Can't even find startxref, assume it's all objects
pop 0
} ifelse
} bind executeonly def
% This routine will scan a file searaching for object locations to build
% an alternate version of the data in the xref tables.
% Its purpose is to provide a basis for an xref fixing facility.
/search_objects % - search_objects -
{ % Initialize the Objects, Generations, etc. arrays
Objects dup length array copy /Orig_Objects exch def
initPDFobjects
% reset duplicate object and generation numbers error flag
/dup_obj_gen_num //false def
% Determine how many bytes are in the file after the final %%EOF
/post_eof_count determine_post_eof_count def
% Start at the beginning of the file
PDFfile 0 setfileposition
% Create a working string (and also store its length on stack). We are
% using a maximum size string size the logic below wants a recovered object
% to fit into our working string.
65535 dup string
{ % Now loop through the entire file looking for objects
PDFfile fileposition % save current file position
% When we get near the end of the file, we use a smaller interval of
% our working string to prevent reading past the end. (See comments on
% EOF testing below.)
PDFfile bytesavailable post_eof_count sub 10 sub dup 4 index lt {
2 index 0 3 -1 roll getinterval % near EOF, use interval of string
} {
pop 1 index % not near end, use full working string
} ifelse
% Read a line from file. If the line does not fit into our working string,
% or any other error, then we will discard it.
PDFfile exch { readline } //.internalstopped exec
{ pop pop //false } if % indicate no string if we stopped
{ % stack: <length> <working_str> <loc> <string>
% Now that we have line, get obj num, ref num, and 'obj'. Verify that each
% of these is correct type.
dup (obj) search { % preliminary check for obj
pop pop pop
/integertype typed_token { % get obj number
/integertype typed_token { % get ref number
/nametype typed_token { % get 'obj' text
pop % pop remaining string
/obj eq { % verify name is 'obj'
% make sure we have room in the arrays. We work in increments
% of 20 each time we increase the size.
1 index 20 add 20 idiv 20 mul
growPDFobjects
% save xref parameters into ObjectStream, Objects and Generations
1 index 0 % rearrange parms for setxrefentry
4 index PDFoffset sub 3 index
//true setxrefentry % save parameters
pop pop pop pop % clear parameters
} if % check if name is 'obj'
} if % check if we got 'obj" string
pop % remove ref number
} if % check if we got ref number
pop % remove obj number
} if % check if we got object number
} {
pop pop
} ifelse
} if % check if got a string from readline
pop % remove location
% Check if we are approaching the end of the file. We do not want to
% read past the end of the file since that closes it. We actually stop
% 10-20 bytes early since there cannot be an object that close to the end.
% (There is a Trailer dictionary, etc. at the end of the file.)
PDFfile bytesavailable post_eof_count sub 20 lt { exit } if
} loop % loop through the entire file
pop pop % remove working string and its length
% Output warning if we have two objects with the same object and generation
% numbers.
dup_obj_gen_num {
( **** Warning: There are objects with matching object and generation\n)
pdfformatwarning
( **** numbers. The output may be incorrect.\n)
pdfformatwarning
} if
currentdict /Orig_Objects undef
} bind executeonly def
% Print warning message because we found a problem while reading the xref
% tables
/print_xref_warning
{ ( **** Error: An error occurred while reading an XREF table.\n)
pdfformaterror
( **** The file has been damaged. This may have been caused\n)
pdfformaterror
( **** by a problem while converting or transfering the file.\n)
pdfformaterror
( **** Ghostscript will attempt to recover the data.\n)
pdfformaterror
( **** However, the output may be incorrect.\n) pdfformaterror
} bind executeonly def
% Attempt to recover the XRef data. This is called if we have a failure
% while reading the normal XRef tables. This routine usually works
% only for pre PDF1.5 versions of PDF files.
/recover_xref_data % - recover_xref_data -
{ print_xref_warning % Print warning message
count pdfemptycount sub { pop } repeat % remove anything left by readxref
search_objects % Search for objects
} bind executeonly def
|