diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 953e4b23..7f1e06ff 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -177,6 +177,33 @@ std::string svg_path_fragment(const pdf::PathElement &path, return std::move(f).str(); } +/// Serialize an image XObject to an SVG `` fragment in the page viewBox, +/// or "" when it carries no pass-through bytes. The image fills the unit square +/// in user space (ISO 32000-1 8.10.5); the transform maps that square — through +/// a vertical flip (the image's first row is its top, SVG draws y-down) and the +/// CTM — into the page box. `clip_id` installs a clip via `clip-path`. +std::string svg_image_fragment(const pdf::ImageElement &image, + const util::math::Transform2D &to_box, + const std::string &clip_id) { + if (image.data.empty()) { + return {}; + } + // image natural box [0,1] (y-down) -> PDF unit square (y-up) -> user -> box. + constexpr util::math::Transform2D flip = + util::math::Transform2D::scaling_translation(1, -1, 0, 1); + const util::math::Transform2D m = flip * image.transform * to_box; + + std::ostringstream f; + f << R"("; + return std::move(f).str(); +} + /// Registers a page's clip regions as nested `` defs, deduplicating /// shared prefixes. PDF's current clip is the *intersection* of an ordered list /// of regions; SVG expresses intersection by chaining `clip-path` from one @@ -331,9 +358,9 @@ class HtmlServiceImpl final : public HtmlService { std::string glyph_classes; std::string glyph_text; }; - // One painted path, already serialized to an SVG `` fragment in - // the page's viewBox (PDF points, y-down). Contiguous paths share one `` - // at write time. + // One vector item, already serialized to an SVG fragment in the page's + // viewBox (PDF points, y-down): a painted `` or an ``. + // Contiguous vector items share one `` at write time. 
struct PathOut { std::string svg; }; @@ -562,6 +589,17 @@ class HtmlServiceImpl final : public HtmlService { continue; } + // An image XObject: an `` placed by the CTM, in the page `` + // alongside the paths (so it layers by paint order). + if (const auto *image = std::get_if(&element)) { + const std::string clip_id = clips.register_clip(image->clip, to_box); + std::string fragment = svg_image_fragment(*image, to_box, clip_id); + if (!fragment.empty()) { + page_out.items.push_back(PathOut{std::move(fragment)}); + } + continue; + } + const pdf::TextElement &text = std::get(element); // TODO(clip text): the active clip is not applied to text. Paths carry // a clip snapshot realized as an SVG ``, but text is emitted diff --git a/src/odr/internal/pdf/pdf_document_element.hpp b/src/odr/internal/pdf/pdf_document_element.hpp index 32119485..3f094a3e 100644 --- a/src/odr/internal/pdf/pdf_document_element.hpp +++ b/src/odr/internal/pdf/pdf_document_element.hpp @@ -123,6 +123,13 @@ struct XObject final : Element { /// Form XObject only: the decoded (filter-applied) content stream, read /// eagerly at parse time so text extraction needs no parser handle. std::string content; + + /// Image XObject only: the encoded image bytes passed through to the browser + /// (stage 4.5: JPEG / `DCTDecode`), with `image_mime` naming the codec. Empty + /// for an image whose codec is not yet a pass-through (Flate/LZW raster, + /// image masks — later stages) and for non-image XObjects, so `Do` skips it. + std::string image_data; + std::string image_mime; }; /// A non-owning view over a string of PDF character codes, splitting it into diff --git a/src/odr/internal/pdf/pdf_document_parser.cpp b/src/odr/internal/pdf/pdf_document_parser.cpp index 3e825787..35fd74eb 100644 --- a/src/odr/internal/pdf/pdf_document_parser.cpp +++ b/src/odr/internal/pdf/pdf_document_parser.cpp @@ -549,10 +549,34 @@ XObject *parse_x_object(State &state, const ObjectReference &reference) { ? 
dictionary["Subtype"].as_name() : ""; if (subtype == "Image") { - // Image XObjects carry raster data, not a content stream: recognized but - // not decoded until stage 4 (and `read_decoded_stream` would throw on the - // image codec anyway). x_object->subtype = XObject::Subtype::image; + // Stage 4.5: pass a JPEG (`DCTDecode`) image through to the browser + // undecoded. `/ImageMask` stencils, color-key masks and the non-JPEG raster + // codecs are later stages; leave their bytes empty so `Do` skips them. + const bool image_mask = + dictionary.get("ImageMask").as_bool_opt().value_or(false); + Object filter; + if (!image_mask && dictionary.has_key("Filter")) { + filter = parser.deep_resolve_object_copy(dictionary["Filter"]); + } + // Only a JPEG passes straight through to the browser. Gate on the chain's + // terminal codec so a non-pass-through raster (e.g. FlateDecode with a + // predictor) is left empty without inflating it — that decode is wasted for + // a skipped image and can throw on parameters we don't support, which would + // otherwise abort the whole document. `Do` skips an image with no bytes. 
+ if (!image_mask && terminal_image_codec(filter) == "DCTDecode") { + Object decode_parms; + if (dictionary.has_key("DecodeParms")) { + decode_parms = + parser.deep_resolve_object_copy(dictionary["DecodeParms"]); + } + std::string raw = parser.read_object_stream(object); + DecodeResult result = decode(filter, decode_parms, std::move(raw)); + if (result.stopped_at_filter == "DCTDecode") { + x_object->image_data = std::move(result.data); + x_object->image_mime = "image/jpeg"; + } + } return x_object; } if (subtype != "Form") { diff --git a/src/odr/internal/pdf/pdf_filter.cpp b/src/odr/internal/pdf/pdf_filter.cpp index b706643a..08d11086 100644 --- a/src/odr/internal/pdf/pdf_filter.cpp +++ b/src/odr/internal/pdf/pdf_filter.cpp @@ -241,6 +241,27 @@ pdf::DecodeResult pdf::decode(const Object &filter, const Object &decode_parms, return result; } +std::optional pdf::terminal_image_codec(const Object &filter) { + Object last; + if (filter.is_array()) { + const Array &array = filter.as_array(); + if (array.empty()) { + return std::nullopt; + } + last = array.back(); + } else if (!filter.is_null()) { + last = filter; + } else { + return std::nullopt; + } + + std::string name = canonical_filter_name(last.as_string()); + if (is_image_codec(name)) { + return name; + } + return std::nullopt; +} + std::string pdf::ascii_hex_decode(const std::string &input) { std::string result; result.reserve(input.size() / 2); diff --git a/src/odr/internal/pdf/pdf_filter.hpp b/src/odr/internal/pdf/pdf_filter.hpp index 32c76c40..2ca1fc67 100644 --- a/src/odr/internal/pdf/pdf_filter.hpp +++ b/src/odr/internal/pdf/pdf_filter.hpp @@ -24,6 +24,12 @@ struct DecodeResult { DecodeResult decode(const Object &filter, const Object &decode_parms, std::string data); +/// The image codec a `/Filter` chain terminates in — its last entry, when that +/// is an image codec such as DCTDecode (the filter `decode` would stop at) — +/// else nullopt for a chain that decodes fully. 
Lets a caller recognise a +/// pass-through image without inflating a raster it only means to skip. +std::optional terminal_image_codec(const Object &filter); + std::string ascii_hex_decode(const std::string &input); std::string ascii85_decode(const std::string &input); std::string lzw_decode(const std::string &input, Integer early_change = 1); diff --git a/src/odr/internal/pdf/pdf_object.hpp b/src/odr/internal/pdf/pdf_object.hpp index 13b109c9..b15d48cb 100644 --- a/src/odr/internal/pdf/pdf_object.hpp +++ b/src/odr/internal/pdf/pdf_object.hpp @@ -254,6 +254,7 @@ class Array final { [[nodiscard]] const Holder &holder() const { return m_holder; } [[nodiscard]] std::size_t size() const { return m_holder.size(); } + [[nodiscard]] bool empty() const { return m_holder.empty(); } [[nodiscard]] Holder::iterator begin() { return m_holder.begin(); } [[nodiscard]] Holder::iterator end() { return m_holder.end(); } [[nodiscard]] Holder::const_iterator begin() const { @@ -264,6 +265,11 @@ class Array final { Object &operator[](const std::size_t i) { return m_holder.at(i); } const Object &operator[](const std::size_t i) const { return m_holder.at(i); } + Object &front() { return m_holder.front(); } + [[nodiscard]] const Object &front() const { return m_holder.front(); } + Object &back() { return m_holder.back(); } + [[nodiscard]] const Object &back() const { return m_holder.back(); } + void to_stream(std::ostream &) const; [[nodiscard]] std::string to_string() const; diff --git a/src/odr/internal/pdf/pdf_page_element.hpp b/src/odr/internal/pdf/pdf_page_element.hpp index b071c22d..38e419e1 100644 --- a/src/odr/internal/pdf/pdf_page_element.hpp +++ b/src/odr/internal/pdf/pdf_page_element.hpp @@ -86,9 +86,22 @@ struct PathElement { double dash_phase{0}; }; -/// A single page-content element in paint (z) order: a shown text segment or a -/// painted path. 
Images, shadings and patterns join this variant in later +/// One image XObject painted by `Do`, placed by the CTM in effect when it was +/// invoked (ISO 32000-1 8.10.5): the image fills the unit square in user space, +/// which `transform` maps. The encoded bytes pass straight through to the +/// browser (stage 4.5: JPEG / `DCTDecode`), `mime` naming the codec. The clip +/// is snapshotted as for a path. +struct ImageElement { + /// CTM at `Do` time: maps the image's unit square to user space. + util::math::Transform2D transform; + std::vector clip; + std::string data; // encoded image bytes (e.g. a JPEG) + std::string mime; // e.g. "image/jpeg" +}; + +/// A single page-content element in paint (z) order: a shown text segment, a +/// painted path or an image. Shadings and patterns join this variant in later /// stage-4 PRs. -using PageElement = std::variant; +using PageElement = std::variant; } // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_page_extractor.cpp b/src/odr/internal/pdf/pdf_page_extractor.cpp index ce069371..0ed7b12d 100644 --- a/src/odr/internal/pdf/pdf_page_extractor.cpp +++ b/src/odr/internal/pdf/pdf_page_extractor.cpp @@ -450,8 +450,9 @@ void begin_marked_content(const GraphicsOperator &op, /// Invoke a form XObject named by `Do`: save the state, concatenate the form's /// `/Matrix` onto the CTM, run its content with the form's own `/Resources` /// (falling back to the enclosing scope), then restore (ISO 32000-1 8.10.1). -/// `/BBox` clipping is deferred (text-only). Image and unknown XObjects are -/// skipped, and a form already on the render stack is skipped (cycle guard). +/// `/BBox` clips the form's content. An image XObject emits an `ImageElement` +/// (when its codec passes through); unknown subtypes are skipped, and a form +/// already on the render stack is skipped (cycle guard). 
void invoke_x_object(const std::string &name, const Resources &resources, GraphicsState &state, std::vector &out, const Logger &logger, std::set &warned, @@ -466,8 +467,22 @@ void invoke_x_object(const std::string &name, const Resources &resources, } const XObject *x_object = it->second; + if (x_object->subtype == XObject::Subtype::image) { + // An image is placed by the CTM in effect (its unit square maps to user + // space), under the current clip. Only codecs with bytes ready for the + // browser carry `image_data` (stage 4.5: JPEG); the rest are skipped. + if (!x_object->image_data.empty()) { + ImageElement image; + image.transform = state.current().general.transform_matrix; + image.clip = state.current().clip; + image.data = x_object->image_data; + image.mime = x_object->image_mime; + out.push_back(std::move(image)); + } + return; + } if (x_object->subtype != XObject::Subtype::form) { - return; // image XObjects are stage 4; unknown subtypes are inexecutable + return; // unknown subtypes are inexecutable } if (!active.insert(x_object).second) { ODR_WARNING(logger, "pdf: cyclic form XObject invocation, skipping"); diff --git a/test/src/internal/pdf/pdf_filter.cpp b/test/src/internal/pdf/pdf_filter.cpp index eaa89a76..8600bca8 100644 --- a/test/src/internal/pdf/pdf_filter.cpp +++ b/test/src/internal/pdf/pdf_filter.cpp @@ -151,6 +151,26 @@ TEST(PdfFilter, decode_stops_at_image_codec) { 0); } +TEST(PdfFilter, terminal_image_codec_identifies_passthrough) { + // The codec a chain terminates in, recognised without decoding — its last + // entry when that is an image codec (abbreviations canonicalised). 
+ EXPECT_EQ(terminal_image_codec(name("DCTDecode")), "DCTDecode"); + EXPECT_EQ(terminal_image_codec(name("DCT")), "DCTDecode"); + EXPECT_EQ(terminal_image_codec(array({name("ASCII85Decode"), name("DCT")})), + "DCTDecode"); + EXPECT_EQ(terminal_image_codec(name("JPXDecode")), "JPXDecode"); +} + +TEST(PdfFilter, terminal_image_codec_none_for_decodable_chain) { + // A chain with no image codec (or none last) decodes fully: no pass-through. + EXPECT_FALSE(terminal_image_codec(Object()).has_value()); + EXPECT_FALSE(terminal_image_codec(name("FlateDecode")).has_value()); + EXPECT_FALSE(terminal_image_codec(array({})).has_value()); + EXPECT_FALSE( + terminal_image_codec(array({name("DCTDecode"), name("FlateDecode")})) + .has_value()); +} + TEST(PdfFilter, decode_crypt_identity_passes_through) { const DecodeResult result = decode(name("Crypt"), dictionary({{"Name", name("Identity")}}), "data"); diff --git a/test/src/internal/pdf/pdf_page_extractor.cpp b/test/src/internal/pdf/pdf_page_extractor.cpp index d6711abc..93a26003 100644 --- a/test/src/internal/pdf/pdf_page_extractor.cpp +++ b/test/src/internal/pdf/pdf_page_extractor.cpp @@ -852,3 +852,73 @@ TEST(PdfPageExtractor, device_color_clears_color_space) { EXPECT_DOUBLE_EQ(p.fill_color.rgb[0], 1.0); EXPECT_DOUBLE_EQ(p.fill_color.rgb[1], 0.0); } + +// --- stage 4.5: image XObjects (JPEG pass-through) ------------------------ + +namespace { + +XObject jpeg_x_object(std::string data) { + XObject x_object; + x_object.subtype = XObject::Subtype::image; + x_object.image_data = std::move(data); + x_object.image_mime = "image/jpeg"; + return x_object; +} + +} // namespace + +// `Do` on a pass-through image XObject emits an `ImageElement` placed by the +// CTM, carrying the encoded bytes verbatim. 
+TEST(PdfPageExtractor, image_xobject_emitted_at_ctm) {
+  XObject image = jpeg_x_object("JFIF-bytes");
+  Resources res;
+  res.x_object["Im0"] = &image;
+
+  const auto page =
+      extract_page("q 2 0 0 3 10 20 cm /Im0 Do Q", res, Logger::null());
+  ASSERT_EQ(page.size(), 1);
+  const ImageElement &img = std::get<ImageElement>(page[0]);
+  EXPECT_EQ(img.data, "JFIF-bytes");
+  EXPECT_EQ(img.mime, "image/jpeg");
+  EXPECT_DOUBLE_EQ(img.transform.a, 2); // unit square -> 2 wide
+  EXPECT_DOUBLE_EQ(img.transform.d, 3); // 3 tall
+  EXPECT_DOUBLE_EQ(img.transform.e, 10);
+  EXPECT_DOUBLE_EQ(img.transform.f, 20);
+}
+
+// An image whose codec is not a pass-through (no `image_data`) is skipped, as
+// is an unknown XObject — `Do` emits nothing.
+TEST(PdfPageExtractor, image_xobject_without_data_skipped) {
+  XObject image; // subtype image, but no decoded pass-through bytes
+  image.subtype = XObject::Subtype::image;
+  Resources res;
+  res.x_object["Im0"] = &image;
+
+  EXPECT_TRUE(extract_page("/Im0 Do", res, Logger::null()).empty());
+}
+
+// An image is clipped by the current clip, like a path.
+TEST(PdfPageExtractor, image_xobject_carries_clip) {
+  XObject image = jpeg_x_object("bytes");
+  Resources res;
+  res.x_object["Im0"] = &image;
+
+  const auto page =
+      extract_page("0 0 50 50 re W n /Im0 Do", res, Logger::null());
+  ASSERT_EQ(page.size(), 1);
+  EXPECT_EQ(std::get<ImageElement>(page[0]).clip.size(), 1);
+}
+
+// Images interleave with paths and text in paint order.
+TEST(PdfPageExtractor, image_in_paint_order) {
+  XObject image = jpeg_x_object("bytes");
+  Resources res;
+  res.x_object["Im0"] = &image;
+
+  const auto page =
+      extract_page("0 0 10 10 re f /Im0 Do 5 5 m 6 6 l S", res, Logger::null());
+  ASSERT_EQ(page.size(), 3);
+  EXPECT_TRUE(std::holds_alternative<PathElement>(page[0]));
+  EXPECT_TRUE(std::holds_alternative<ImageElement>(page[1]));
+  EXPECT_TRUE(std::holds_alternative<PathElement>(page[2]));
+}