Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 41 additions & 3 deletions src/odr/internal/html/pdf_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,33 @@ std::string svg_path_fragment(const pdf::PathElement &path,
return std::move(f).str();
}

/// Render one image XObject as an SVG `<image>` element positioned in the page
/// viewBox.
///
/// The element is declared 1x1 with `preserveAspectRatio="none"`, so the whole
/// placement is carried by its `transform` matrix: the image's unit square
/// (ISO 32000-1 8.10.5) is flipped vertically (the image's first row is its
/// top, while SVG draws y-down), then taken through the image's CTM and
/// finally through `to_box` into the page box.
///
/// @param image   the image to serialize; its `data` and `mime` become the
///                `href`.
/// @param to_box  transform applied after the image's own transform.
/// @param clip_id id of a clip definition to reference via `clip-path`; an
///                empty id emits no `clip-path` attribute.
/// @return the `<image .../>` fragment, or an empty string when the image
///         carries no pass-through bytes.
std::string svg_image_fragment(const pdf::ImageElement &image,
                               const util::math::Transform2D &to_box,
                               const std::string &clip_id) {
  // No encoded bytes means there is nothing the browser could display.
  if (image.data.empty()) {
    return {};
  }

  // Composition order: image natural box [0,1] (y-down) -> PDF unit square
  // (y-up) -> user space -> page box.
  constexpr util::math::Transform2D flip =
      util::math::Transform2D::scaling_translation(1, -1, 0, 1);
  const util::math::Transform2D placement = flip * image.transform * to_box;

  std::ostringstream out;

  // Opening tag up to and including the `matrix(` of the transform attribute.
  out << R"(<image width="1" height="1" preserveAspectRatio="none" transform="matrix()";
  // The linear part is written as-is; only the translation goes through
  // `round2`.
  out << placement.a << ',' << placement.b << ',' << placement.c << ','
      << placement.d << ',';
  out << round2(placement.e) << ',' << round2(placement.f);
  out << ")\"";

  const bool has_clip = !clip_id.empty();
  if (has_clip) {
    out << " clip-path=\"url(#" << clip_id << ")\"";
  }

  const auto href = file_to_url(image.data, image.mime);
  out << " href=\"" << href << "\"/>";

  return std::move(out).str();
}

/// Registers a page's clip regions as nested `<clipPath>` defs, deduplicating
/// shared prefixes. PDF's current clip is the *intersection* of an ordered list
/// of regions; SVG expresses intersection by chaining `clip-path` from one
Expand Down Expand Up @@ -330,9 +357,9 @@ class HtmlServiceImpl final : public HtmlService {
std::string glyph_classes;
std::string glyph_text;
};
// One painted path, already serialized to an SVG `<path .../>` fragment in
// the page's viewBox (PDF points, y-down). Contiguous paths share one `<svg>`
// at write time.
// One vector item, already serialized to an SVG fragment in the page's
// viewBox (PDF points, y-down): a painted `<path>` or an `<image>`.
// Contiguous vector items share one `<svg>` at write time.
// Holder for one vector item that has already been serialized for output.
struct PathOut {
// The item's SVG fragment text (a `<path>` or an `<image>` element).
std::string svg;
};
Expand Down Expand Up @@ -501,6 +528,17 @@ class HtmlServiceImpl final : public HtmlService {
continue;
}

// An image XObject: an `<image>` placed by the CTM, in the page `<svg>`
// alongside the paths (so it layers by paint order).
if (const auto *image = std::get_if<pdf::ImageElement>(&element)) {
const std::string clip_id = clips.register_clip(image->clip, to_box);
std::string fragment = svg_image_fragment(*image, to_box, clip_id);
if (!fragment.empty()) {
page_out.items.push_back(PathOut{std::move(fragment)});
}
continue;
}

const pdf::TextElement &text = std::get<pdf::TextElement>(element);
// The font index is non-zero when an embedded font lets us render
// the actual glyphs; 0 falls through to the legacy path.
Expand Down
7 changes: 7 additions & 0 deletions src/odr/internal/pdf/pdf_document_element.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,13 @@ struct XObject final : Element {
/// Form XObject only: the decoded (filter-applied) content stream, read
/// eagerly at parse time so text extraction needs no parser handle.
std::string content;

/// Image XObject only: the encoded image bytes passed through to the browser
/// (stage 4.5: JPEG / `DCTDecode`), with `image_mime` naming the codec. Empty
/// for an image whose codec is not yet a pass-through (Flate/LZW raster,
/// image masks — later stages) and for non-image XObjects, so `Do` skips it.
std::string image_data;
std::string image_mime;
};

/// A non-owning view over a string of PDF character codes, splitting it into
Expand Down
25 changes: 22 additions & 3 deletions src/odr/internal/pdf/pdf_document_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -540,10 +540,29 @@ XObject *parse_x_object(State &state, const ObjectReference &reference) {
? dictionary["Subtype"].as_name()
: "";
if (subtype == "Image") {
// Image XObjects carry raster data, not a content stream: recognized but
// not decoded until stage 4 (and `read_decoded_stream` would throw on the
// image codec anyway).
x_object->subtype = XObject::Subtype::image;
// Stage 4.5: pass a JPEG (`DCTDecode`) image through to the browser
// undecoded. `/ImageMask` stencils, color-key masks and the non-JPEG raster
// codecs are later stages; leave their bytes empty so `Do` skips them.
const bool image_mask =
dictionary.get("ImageMask").as_bool_opt().value_or(false);
if (!image_mask) {
std::string raw = parser.read_object_stream(object);
Object filter;
Object decode_parms;
if (dictionary.has_key("Filter")) {
filter = parser.deep_resolve_object_copy(dictionary["Filter"]);
}
if (dictionary.has_key("DecodeParms")) {
decode_parms =
parser.deep_resolve_object_copy(dictionary["DecodeParms"]);
}
DecodeResult result = decode(filter, decode_parms, std::move(raw));

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Avoid decoding images that will be skipped

When a page resource contains a non-JPEG Image XObject with a filter chain the generic decoder does not support for raster images, this unconditional decode() throws during parse_resources even though the image is supposed to be skipped. For example, a /FlateDecode image using TIFF predictor parameters that apply_tiff_predictor rejects will now abort rendering of the whole PDF, including text and paths; before this change image XObjects were ignored, and the nearby comment says non-JPEG codecs should just leave image_data empty. Check the filter chain for a pass-through DCT case, or catch non-pass-through decode failures, before decoding skipped images.

Useful? React with 👍 / 👎.

if (result.stopped_at_filter == "DCTDecode") {
x_object->image_data = std::move(result.data);
x_object->image_mime = "image/jpeg";
}
}
return x_object;
}
if (subtype != "Form") {
Expand Down
19 changes: 16 additions & 3 deletions src/odr/internal/pdf/pdf_page_element.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,22 @@ struct PathElement {
double dash_phase{0};
};

/// A single page-content element in paint (z) order: a shown text segment or a
/// painted path. Images, shadings and patterns join this variant in later
/// One image XObject painted by `Do`, placed by the CTM in effect when it was
/// invoked (ISO 32000-1 8.10.5): the image fills the unit square in user space,
/// which `transform` maps. The encoded bytes pass straight through to the
/// browser (stage 4.5: JPEG / `DCTDecode`), `mime` naming the codec. The clip
/// is snapshotted as for a path.
struct ImageElement {
/// CTM at `Do` time: maps the image's unit square to user space.
util::math::Transform2D transform;
/// Clip regions in effect when the image was painted, copied at `Do` time.
std::vector<ClipPath> clip;
/// Encoded image bytes, kept undecoded (e.g. a JPEG).
std::string data;
/// MIME type describing `data`, e.g. "image/jpeg".
std::string mime;
};

/// A single page-content element in paint (z) order: a shown text segment, a
/// painted path or an image. Shadings and patterns join this variant in later
/// stage-4 PRs.
using PageElement = std::variant<TextElement, PathElement>;
using PageElement = std::variant<TextElement, PathElement, ImageElement>;

} // namespace odr::internal::pdf
21 changes: 18 additions & 3 deletions src/odr/internal/pdf/pdf_page_extractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -450,8 +450,9 @@ void begin_marked_content(const GraphicsOperator &op,
/// Invoke a form XObject named by `Do`: save the state, concatenate the form's
/// `/Matrix` onto the CTM, run its content with the form's own `/Resources`
/// (falling back to the enclosing scope), then restore (ISO 32000-1 8.10.1).
/// `/BBox` clipping is deferred (text-only). Image and unknown XObjects are
/// skipped, and a form already on the render stack is skipped (cycle guard).
/// `/BBox` clips the form's content. An image XObject emits an `ImageElement`
/// (when its codec passes through); unknown subtypes are skipped, and a form
/// already on the render stack is skipped (cycle guard).
void invoke_x_object(const std::string &name, const Resources &resources,
GraphicsState &state, std::vector<PageElement> &out,
const Logger &logger, std::set<std::string> &warned,
Expand All @@ -466,8 +467,22 @@ void invoke_x_object(const std::string &name, const Resources &resources,
}

const XObject *x_object = it->second;
if (x_object->subtype == XObject::Subtype::image) {
// An image is placed by the CTM in effect (its unit square maps to user
// space), under the current clip. Only codecs with bytes ready for the
// browser carry `image_data` (stage 4.5: JPEG); the rest are skipped.
if (!x_object->image_data.empty()) {
ImageElement image;
image.transform = state.current().general.transform_matrix;
image.clip = state.current().clip;
image.data = x_object->image_data;
image.mime = x_object->image_mime;
out.push_back(std::move(image));
}
return;
}
if (x_object->subtype != XObject::Subtype::form) {
return; // image XObjects are stage 4; unknown subtypes are inexecutable
return; // unknown subtypes are inexecutable
}
if (!active.insert(x_object).second) {
ODR_WARNING(logger, "pdf: cyclic form XObject invocation, skipping");
Expand Down
70 changes: 70 additions & 0 deletions test/src/internal/pdf/pdf_page_extractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,76 @@ TEST(PdfPageExtractor, device_color_clears_color_space) {
EXPECT_DOUBLE_EQ(p.fill_color.rgb[1], 0.0);
}

// --- stage 4.5: image XObjects (JPEG pass-through) ------------------------

namespace {

/// Test helper: build an image XObject that carries `data` as JPEG
/// pass-through bytes (subtype image, MIME type "image/jpeg").
XObject jpeg_x_object(std::string data) {
  XObject jpeg;
  jpeg.subtype = XObject::Subtype::image;
  jpeg.image_mime = "image/jpeg";
  jpeg.image_data = std::move(data);
  return jpeg;
}

} // namespace

// Invoking a pass-through image XObject with `Do` yields exactly one
// `ImageElement`: its transform is the CTM set up by the preceding `cm`, and
// its bytes and MIME type are those of the XObject, unchanged.
TEST(PdfPageExtractor, image_xobject_emitted_at_ctm) {
  XObject jpeg = jpeg_x_object("JFIF-bytes");
  Resources resources;
  resources.x_object["Im0"] = &jpeg;

  const auto elements =
      extract_page("q 2 0 0 3 10 20 cm /Im0 Do Q", resources, Logger::null());
  ASSERT_EQ(elements.size(), 1);

  const ImageElement &emitted = std::get<ImageElement>(elements[0]);

  // The encoded payload is carried verbatim.
  EXPECT_EQ(emitted.data, "JFIF-bytes");
  EXPECT_EQ(emitted.mime, "image/jpeg");

  // `2 0 0 3 10 20 cm`: the unit square becomes 2 wide and 3 tall, offset to
  // (10, 20).
  EXPECT_DOUBLE_EQ(emitted.transform.a, 2);
  EXPECT_DOUBLE_EQ(emitted.transform.d, 3);
  EXPECT_DOUBLE_EQ(emitted.transform.e, 10);
  EXPECT_DOUBLE_EQ(emitted.transform.f, 20);
}

// An image XObject that carries no pass-through bytes (empty `image_data`)
// produces no page element when invoked by `Do`.
TEST(PdfPageExtractor, image_xobject_without_data_skipped) {
  XObject bare; // image subtype, but `image_data` left empty
  bare.subtype = XObject::Subtype::image;
  Resources resources;
  resources.x_object["Im0"] = &bare;

  const auto elements = extract_page("/Im0 Do", resources, Logger::null());
  EXPECT_TRUE(elements.empty());
}

// The clip in effect at `Do` time travels with the image: after a single
// `re W n` clip, the emitted `ImageElement` records one clip region.
TEST(PdfPageExtractor, image_xobject_carries_clip) {
  XObject jpeg = jpeg_x_object("bytes");
  Resources resources;
  resources.x_object["Im0"] = &jpeg;

  const auto elements =
      extract_page("0 0 50 50 re W n /Im0 Do", resources, Logger::null());
  ASSERT_EQ(elements.size(), 1);

  const ImageElement &emitted = std::get<ImageElement>(elements[0]);
  EXPECT_EQ(emitted.clip.size(), 1);
}

// Paint order is preserved across element kinds: a fill, then an image, then
// a stroke come out as path, image, path.
TEST(PdfPageExtractor, image_in_paint_order) {
  XObject jpeg = jpeg_x_object("bytes");
  Resources resources;
  resources.x_object["Im0"] = &jpeg;

  const auto elements = extract_page("0 0 10 10 re f /Im0 Do 5 5 m 6 6 l S",
                                     resources, Logger::null());
  ASSERT_EQ(elements.size(), 3);

  const auto &filled = elements[0];
  const auto &placed = elements[1];
  const auto &stroked = elements[2];
  EXPECT_TRUE(std::holds_alternative<PathElement>(filled));
  EXPECT_TRUE(std::holds_alternative<ImageElement>(placed));
  EXPECT_TRUE(std::holds_alternative<PathElement>(stroked));
}

// --- stage 4.3: clipping --------------------------------------------------

// A `W n` clip rect limits a later fill: the fill carries the clip region (the
Expand Down
Loading