diff --git a/CMakeLists.txt b/CMakeLists.txt index 7539b2cd..9b9797cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -183,6 +183,7 @@ set(ODR_SOURCE_FILES "src/odr/internal/pdf/pdf_cid.cpp" "src/odr/internal/pdf/pdf_cmap.cpp" "src/odr/internal/pdf/pdf_cmap_parser.cpp" + "src/odr/internal/pdf/pdf_color.cpp" "src/odr/internal/pdf/pdf_document.cpp" "src/odr/internal/pdf/pdf_document_parser.cpp" "src/odr/internal/pdf/pdf_encoding.cpp" @@ -192,6 +193,7 @@ set(ODR_SOURCE_FILES "src/odr/internal/pdf/pdf_file_object.cpp" "src/odr/internal/pdf/pdf_file_parser.cpp" "src/odr/internal/pdf/pdf_filter.cpp" + "src/odr/internal/pdf/pdf_function.cpp" "src/odr/internal/pdf/pdf_graphics_operator_parser.cpp" "src/odr/internal/pdf/pdf_graphics_state.cpp" "src/odr/internal/pdf/pdf_object.cpp" diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 07f2acd7..56f05db0 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -115,13 +115,19 @@ std::string svg_path_d(const std::vector &subpaths, /// viewBox, or "" when it paints nothing. Fill honours the even-odd rule; /// stroke carries width (CTM-scaled in user space), caps, joins, miter limit /// and the dash pattern. A zero stroke width renders as a thin hairline. +/// `clip_id`, when non-empty, references a `` installed via +/// `clip-path`. std::string svg_path_fragment(const pdf::PathElement &path, - const util::math::Transform2D &to_box) { + const util::math::Transform2D &to_box, + const std::string &clip_id) { if ((!path.fill && !path.stroke) || path.subpaths.empty()) { return {}; } std::ostringstream f; f << "` defs, deduplicating +/// shared prefixes. PDF's current clip is the *intersection* of an ordered list +/// of regions; SVG expresses intersection by chaining `clip-path` from one +/// `` to the next, so region i's clipPath references region i-1's and +/// the painted element references the last. Ids are namespaced per page +/// (`c_`); `defs()` is emitted once in a hidden `` for the page. +class ClipRegistry { +public: + explicit ClipRegistry(int page) : m_page{page} {} + + /// The clipPath id to reference on a path painted under `clip`, registering + /// any not-yet-seen regions. Empty when `clip` is empty (unclipped). + std::string register_clip(const std::vector &clip, + const util::math::Transform2D &to_box) { + std::string signature; + std::string parent; + for (const pdf::ClipPath ®ion : clip) { + const std::string d = svg_path_d(region.subpaths, to_box); + signature += region.even_odd ? 'E' : 'N'; + signature += d; + signature += ';'; + const auto [it, inserted] = m_id_by_signature.try_emplace(signature); + if (inserted) { + it->second = + "c" + std::to_string(m_page) + "_" + std::to_string(++m_count); + m_defs << "second << '"'; + if (!parent.empty()) { + m_defs << " clip-path=\"url(#" << parent << ")\""; + } + m_defs << ">"; + } + parent = it->second; + } + return parent; + } + + [[nodiscard]] std::string defs() const { return m_defs.str(); } + +private: + int m_page; + int m_count{0}; + std::unordered_map m_id_by_signature; + std::ostringstream m_defs; +}; + /// Deduplicates CSS declarations into atomic, single-property classes. PDF text /// emits one absolutely-positioned span per glyph run, and the same font sizes, /// offsets and spacings recur across the (potentially millions of) spans. @@ -289,6 +344,10 @@ class HtmlServiceImpl final : public HtmlService { double width{0}; // page box width, PDF points (for the SVG viewBox) double height{0}; // page box height, PDF points std::vector items; + // `` defs for this page's clipped paths, emitted once in a hidden + // ``; the path fragments reference them by id. Empty when no path on + // the page is clipped. + std::string clip_defs; }; HtmlResources write_document(HtmlWriter &out) const { @@ -427,12 +486,15 @@ class HtmlServiceImpl final : public HtmlService { util::math::Transform2D::translation(-box_x0, -box_y0) * util::math::Transform2D::scaling_translation(1, -1, 0, height); + ClipRegistry clips(static_cast(pages_out.size())); + for (const pdf::PageElement &element : pdf::extract_page(stream, *page->resources, *m_logger)) { // A painted path: serialize its subpaths to an SVG `` fragment in - // the page viewBox (fill and/or stroke). + // the page viewBox (fill and/or stroke), under any active clip. if (const auto *path = std::get_if(&element)) { - std::string fragment = svg_path_fragment(*path, to_box); + const std::string clip_id = clips.register_clip(path->clip, to_box); + std::string fragment = svg_path_fragment(*path, to_box, clip_id); if (!fragment.empty()) { page_out.items.push_back(PathOut{std::move(fragment)}); } @@ -568,6 +630,8 @@ class HtmlServiceImpl final : public HtmlService { SpanOut{std::move(classes), escape_text(text.text), {}, {}}); } } + + page_out.clip_defs = clips.defs(); } // Pass 2: write the document, now that the catalog is complete. @@ -604,7 +668,8 @@ class HtmlServiceImpl final : public HtmlService { // the page box, matching a PDF viewer: content drawn outside the MediaBox // (e.g. a background rectangle that bleeds past the left edge) is never // visible, and without this it spills into the centered page's margin. - // Arbitrary in-page clip paths still wait for stage 4.3. + // In-page clip paths are honoured via per-path `clip-path` (the page's + // `` defs are emitted in a hidden `` above). // `preserveAspectRatio:none` keeps the points->box mapping exact. // `pointer-events:none` so a full-page overlay painted after text // (paint order) does not swallow selection/clicks over its transparent @@ -640,6 +705,17 @@ class HtmlServiceImpl final : public HtmlService { for (const PageOut &page : pages_out) { out.write_element_begin("div", HtmlElementOptions().set_class(page.classes)); + // Clip-path defs for this page, in a hidden zero-size ``. They are + // referenced by id from the page's path fragments; `clipPathUnits` + // defaults to `userSpaceOnUse`, so the geometry is read in the user space + // of the referencing element (the page viewBox), not this ``. + if (!page.clip_defs.empty()) { + out.write_raw( + "" + ""); + out.write_raw(page.clip_defs); + out.write_raw(""); + } // Walk the page's elements in paint order, coalescing contiguous paths // into a single `` so spans and vector graphics layer by DOM order. bool svg_open = false; diff --git a/src/odr/internal/pdf/pdf_color.cpp b/src/odr/internal/pdf/pdf_color.cpp new file mode 100644 index 00000000..5300e3a8 --- /dev/null +++ b/src/odr/internal/pdf/pdf_color.cpp @@ -0,0 +1,284 @@ +#include + +#include + +#include +#include + +namespace odr::internal::pdf { + +namespace { + +double clamp01(const double v) { return std::clamp(v, 0.0, 1.0); } + +/// Naive DeviceCMYK -> RGB (no ICC), matching the stage-4.1 HTML path. +std::array cmyk_to_rgb(const double c, const double m, + const double y, const double k) { + return {(1 - c) * (1 - k), (1 - m) * (1 - k), (1 - y) * (1 - k)}; +} + +/// sRGB gamma encode of a linear component (IEC 61966-2-1). +double linear_to_srgb(const double c) { + const double v = clamp01(c); + return v <= 0.0031308 ? 12.92 * v : 1.055 * std::pow(v, 1 / 2.4) - 0.055; +} + +/// CIE L*a*b* -> sRGB through XYZ (ISO 32000-1 8.6.5.4), under the space's +/// white point. +std::array lab_to_rgb(const double l_star, const double a_star, + const double b_star, + const std::array &white) { + const double fy = (l_star + 16) / 116; + const double fx = fy + a_star / 500; + const double fz = fy - b_star / 200; + const auto g = [](const double t) { + constexpr double d = 6.0 / 29.0; + return t > d ? t * t * t : 3 * d * d * (t - 4.0 / 29.0); + }; + const double x = white[0] * g(fx); + const double y = white[1] * g(fy); + const double z = white[2] * g(fz); + // XYZ (D65) -> linear sRGB. + const double r = 3.2406 * x - 1.5372 * y - 0.4986 * z; + const double g_lin = -0.9689 * x + 1.8758 * y + 0.0415 * z; + const double b = 0.0557 * x - 0.2040 * y + 1.0570 * z; + return {linear_to_srgb(r), linear_to_srgb(g_lin), linear_to_srgb(b)}; +} + +std::shared_ptr device_space(const ColorSpaceKind kind, + const int components) { + auto def = std::make_shared(); + def->kind = kind; + def->components = components; + return def; +} + +std::vector read_numbers(const Object &object) { + std::vector result; + if (object.is_array()) { + for (const Object &item : object.as_array()) { + result.push_back(item.as_real()); + } + } + return result; +} + +/// Resolve a colour-space name to a device/pattern space, or a resource space +/// via `context.named`. +std::shared_ptr space_from_name(const std::string &name, + const ColorSpaceContext &ctx) { + if (name == "DeviceGray" || name == "G") { + return device_space(ColorSpaceKind::device_gray, 1); + } + if (name == "DeviceRGB" || name == "RGB") { + return device_space(ColorSpaceKind::device_rgb, 3); + } + if (name == "DeviceCMYK" || name == "CMYK") { + return device_space(ColorSpaceKind::device_cmyk, 4); + } + if (name == "Pattern") { + return device_space(ColorSpaceKind::pattern, 1); + } + if (ctx.named) { + return ctx.named(name); + } + return nullptr; +} + +} // namespace + +std::array +ColorSpaceDef::to_rgb(const std::vector &c) const { + const auto at = [&](const std::size_t i) { + return i < c.size() ? c[i] : 0.0; + }; + switch (kind) { + case ColorSpaceKind::device_gray: + case ColorSpaceKind::cal_gray: { + const double g = clamp01(at(0)); + return {g, g, g}; + } + case ColorSpaceKind::device_rgb: + case ColorSpaceKind::cal_rgb: + return {clamp01(at(0)), clamp01(at(1)), clamp01(at(2))}; + case ColorSpaceKind::device_cmyk: + return cmyk_to_rgb(at(0), at(1), at(2), at(3)); + case ColorSpaceKind::lab: + return lab_to_rgb(at(0), at(1), at(2), white_point); + case ColorSpaceKind::icc_based: + // No ICC engine: defer to the alternate, else pick a device space by the + // component count (ISO 32000-1 8.6.5.5). + if (alternate != nullptr) { + return alternate->to_rgb(c); + } + if (components == 1) { + const double g = clamp01(at(0)); + return {g, g, g}; + } + if (components == 4) { + return cmyk_to_rgb(at(0), at(1), at(2), at(3)); + } + return {clamp01(at(0)), clamp01(at(1)), clamp01(at(2))}; + case ColorSpaceKind::indexed: { + if (base == nullptr) { + return {0, 0, 0}; + } + const int n = base->components; + const auto index = static_cast(std::lround(at(0))); + const auto offset = static_cast(std::clamp(index, 0, hival)) * + static_cast(n); + std::vector base_components(static_cast(n), 0.0); + for (int j = 0; j < n; ++j) { + const std::size_t k = offset + static_cast(j); + base_components[static_cast(j)] = + k < lookup.size() ? static_cast(lookup[k]) / 255.0 + : 0.0; + } + return base->to_rgb(base_components); + } + case ColorSpaceKind::separation: + case ColorSpaceKind::device_n: { + if (tint == nullptr || alternate == nullptr) { + // Without the tint transform, approximate a Separation as additive ink + // over white (1 = full colorant -> black). + const double v = clamp01(1 - at(0)); + return {v, v, v}; + } + return alternate->to_rgb(tint->eval(c)); + } + case ColorSpaceKind::pattern: + case ColorSpaceKind::unknown: + return {0, 0, 0}; + } + return {0, 0, 0}; +} + +std::vector ColorSpaceDef::initial_components() const { + switch (kind) { + case ColorSpaceKind::separation: + case ColorSpaceKind::device_n: + // Initial tint is full colorant (ISO 32000-1 8.6.3). + return std::vector(static_cast(components), 1.0); + default: + return std::vector( + static_cast(std::max(components, 1)), 0.0); + } +} + +std::shared_ptr +parse_color_space(const Object &object, const ColorSpaceContext &context) { + const Object resolved = context.resolve(object); + + if (resolved.is_name()) { + return space_from_name(resolved.as_name(), context); + } + if (!resolved.is_array() || resolved.as_array().size() == 0) { + return nullptr; + } + + const Array &array = resolved.as_array(); + const Object family_object = context.resolve(array[0]); + if (!family_object.is_name()) { + return nullptr; + } + const std::string &family = family_object.as_name(); + + if (family == "ICCBased") { + if (array.size() < 2) { + return nullptr; + } + auto def = std::make_shared(); + def->kind = ColorSpaceKind::icc_based; + const Object stream_dict = context.resolve(array[1]); + if (stream_dict.is_dictionary()) { + const Dictionary &dict = stream_dict.as_dictionary(); + def->components = dict.get("N").is_integer() + ? static_cast(dict.get("N").as_integer()) + : 3; + if (dict.has_value("Alternate")) { + def->alternate = parse_color_space(dict.get("Alternate"), context); + } + } + return def; + } + if (family == "CalRGB") { + return device_space(ColorSpaceKind::cal_rgb, 3); + } + if (family == "CalGray") { + return device_space(ColorSpaceKind::cal_gray, 1); + } + if (family == "Lab") { + auto def = std::make_shared(); + def->kind = ColorSpaceKind::lab; + def->components = 3; + if (array.size() >= 2) { + const Object params = context.resolve(array[1]); + if (params.is_dictionary()) { + const Dictionary &dict = params.as_dictionary(); + const std::vector wp = read_numbers(dict.get("WhitePoint")); + if (wp.size() == 3) { + def->white_point = {wp[0], wp[1], wp[2]}; + } + const std::vector range = read_numbers(dict.get("Range")); + if (range.size() == 4) { + def->lab_range = {range[0], range[1], range[2], range[3]}; + } + } + } + return def; + } + if (family == "Indexed" || family == "I") { + if (array.size() < 4) { + return nullptr; + } + auto def = std::make_shared(); + def->kind = ColorSpaceKind::indexed; + def->components = 1; + def->base = parse_color_space(array[1], context); + def->hival = static_cast(context.resolve(array[2]).as_integer()); + const Object lookup = context.resolve(array[3]); + if (lookup.is_string()) { + def->lookup = lookup.as_string(); + } else { + def->lookup = context.load_stream(array[3]); + } + return def; + } + if (family == "Separation") { + if (array.size() < 4) { + return nullptr; + } + auto def = std::make_shared(); + def->kind = ColorSpaceKind::separation; + def->components = 1; + def->alternate = parse_color_space(array[2], context); + def->tint = parse_function( + array[3], FunctionContext{context.resolve, context.load_stream}); + return def; + } + if (family == "DeviceN") { + if (array.size() < 4) { + return nullptr; + } + auto def = std::make_shared(); + def->kind = ColorSpaceKind::device_n; + const Object names = context.resolve(array[1]); + def->components = + names.is_array() ? static_cast(names.as_array().size()) : 1; + def->alternate = parse_color_space(array[2], context); + def->tint = parse_function( + array[3], FunctionContext{context.resolve, context.load_stream}); + return def; + } + if (family == "Pattern") { + auto def = device_space(ColorSpaceKind::pattern, 1); + if (array.size() >= 2) { + def->base = parse_color_space(array[1], context); + } + return def; + } + + return nullptr; +} + +} // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_color.hpp b/src/odr/internal/pdf/pdf_color.hpp new file mode 100644 index 00000000..074802ee --- /dev/null +++ b/src/odr/internal/pdf/pdf_color.hpp @@ -0,0 +1,81 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace odr::internal::pdf { + +class Object; + +/// The colour-space families PDF content can select (ISO 32000-1 8.6). The +/// device families are also reachable through the dedicated operators +/// (`g`/`rg`/`k`); the rest arrive via `/ColorSpace` resources and `cs`/`scn`. +enum class ColorSpaceKind { + device_gray, + device_rgb, + device_cmyk, + cal_gray, + cal_rgb, + lab, + icc_based, + indexed, + separation, + device_n, + pattern, + unknown, +}; + +/// A resolved colour space: enough to convert a tuple of component values to +/// sRGB at emit time (ISO 32000-1 8.6.4 / 8.6.5 / 8.6.6). Non-device spaces are +/// approximated — ICC profiles by their alternate or component count, Cal* as +/// device, overprint ignored — per the stage-4 plan. +struct ColorSpaceDef { + ColorSpaceKind kind{ColorSpaceKind::unknown}; + /// Number of input components a colour in this space carries. + int components{1}; + + // Lab (8.6.5.4): the white point and the a*/b* component ranges. + std::array white_point{0.9505, 1.0, 1.089}; + std::array lab_range{-100, 100, -100, 100}; + + // Indexed (8.6.6.3): the base space, the packed palette and the max index. + std::shared_ptr base; + std::string lookup; + int hival{0}; + + // Separation / DeviceN (8.6.6.4): the alternate space and the tint transform. + std::shared_ptr alternate; + std::shared_ptr tint; + + /// Convert `components` of this space to sRGB in [0, 1]. A short/empty input + /// yields the space's default colour. + [[nodiscard]] std::array + to_rgb(const std::vector &components) const; + + /// The initial colour value of the space (ISO 32000-1 8.6.3): all-zero + /// components, except Indexed (index 0) and Separation/DeviceN (tint 1.0). + [[nodiscard]] std::vector initial_components() const; +}; + +/// How `parse_color_space` reaches indirect data: `resolve` dereferences, +/// `load_stream` decodes a stream's bytes (ICC profiles, an Indexed palette +/// given as a stream), and `named` looks up a colour space referenced by name +/// (a base space, the `/ColorSpace` resource table). +struct ColorSpaceContext { + std::function resolve; + std::function load_stream; + std::function(const std::string &)> named; +}; + +/// Build a colour space from its PDF object — a name (`/DeviceRGB`, …) or an +/// array (`[/ICCBased 5 0 R]`, `[/Separation …]`, …). Returns `nullptr` for an +/// unsupported or malformed definition. +std::shared_ptr +parse_color_space(const Object &object, const ColorSpaceContext &context); + +} // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_document_element.hpp b/src/odr/internal/pdf/pdf_document_element.hpp index e049841a..d57de541 100644 --- a/src/odr/internal/pdf/pdf_document_element.hpp +++ b/src/odr/internal/pdf/pdf_document_element.hpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -27,6 +28,7 @@ struct Annotation; struct Resources; struct Font; struct XObject; +struct ColorSpaceDef; struct Element { virtual ~Element() = default; @@ -83,6 +85,12 @@ struct Annotation final : Element {}; struct Resources final : Element { std::unordered_map font; std::unordered_map x_object; + /// The `/ColorSpace` subdictionary (ISO 32000-1 8.6.3): named colour spaces + /// referenced by `cs`/`CS`. Resolved eagerly (ICC alternates, Separation tint + /// transforms, …) so extraction can convert `sc`/`scn` colours to RGB without + /// a parser handle. The device spaces (`/DeviceRGB`, …) are not stored here — + /// they resolve by name at use time. + std::unordered_map> color_space; /// The `/Properties` subdictionary (ISO 32000-1 7.8.3): named property lists /// referenced by `BDC`. Each value is the resolved property-list dictionary /// `Object`; used to recover `/ActualText` for a `BDC /Tag /Name` sequence. @@ -105,6 +113,10 @@ struct XObject final : Element { /// Form XObject only: the `/Matrix` (default identity), concatenated onto the /// CTM when the form is invoked (8.10.1). util::math::Transform2D matrix; + /// Form XObject only: the `/BBox` `[x0 y0 x1 y1]` in form space, clipping the + /// form's content (8.10.2). `nullopt` when the form declares none (lenient; + /// the spec requires it). + std::optional> bbox; /// Form XObject only: the form's own `/Resources`, or `nullptr` to inherit /// the invoking scope's resources (7.8.3). Resources *resources{nullptr}; diff --git a/src/odr/internal/pdf/pdf_document_parser.cpp b/src/odr/internal/pdf/pdf_document_parser.cpp index 2c8e1661..24981eb8 100644 --- a/src/odr/internal/pdf/pdf_document_parser.cpp +++ b/src/odr/internal/pdf/pdf_document_parser.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -554,6 +555,14 @@ XObject *parse_x_object(State &state, const ObjectReference &reference) { if (dictionary.has_value("Matrix")) { x_object->matrix = parse_matrix(parser, dictionary["Matrix"]); } + if (dictionary.has_value("BBox")) { + const Array box = parser.resolve_object_copy(dictionary["BBox"]).as_array(); + if (box.size() == 4) { + x_object->bbox = + std::array{box[0].as_real(), box[1].as_real(), + box[2].as_real(), box[3].as_real()}; + } + } // Read the content eagerly so text extraction needs no parser handle. x_object->content = parser.read_decoded_stream(object); @@ -591,6 +600,40 @@ Resources *parse_resources(State &state, const Object &object) { } } + if (dictionary.has_value("ColorSpace")) { + const Dictionary color_space_table = + parser.resolve_object_copy(dictionary["ColorSpace"]).as_dictionary(); + ColorSpaceContext context; + context.resolve = [&parser](const Object &object) { + return parser.resolve_object_copy(object); + }; + context.load_stream = [&parser](const Object &object) { + return object.is_reference() + ? parser.read_decoded_stream(object.as_reference()) + : std::string{}; + }; + // A base/alternate space may be named (referencing another `/ColorSpace` + // entry); resolve it lazily from the same table, caching the result. + context.named = + [&](const std::string &name) -> std::shared_ptr { + if (const auto it = resources->color_space.find(name); + it != resources->color_space.end()) { + return it->second; + } + if (color_space_table.has_value(name)) { + auto def = parse_color_space(color_space_table.get(name), context); + resources->color_space[name] = def; + return def; + } + return nullptr; + }; + for (const auto &[key, value] : color_space_table) { + if (resources->color_space.find(key) == resources->color_space.end()) { + resources->color_space[key] = parse_color_space(value, context); + } + } + } + if (dictionary.has_key("Properties") && !dictionary["Properties"].is_null()) { // Named property lists for `BDC`; resolved eagerly so text extraction can // recover `/ActualText` without a parser handle (cf. form XObjects). diff --git a/src/odr/internal/pdf/pdf_function.cpp b/src/odr/internal/pdf/pdf_function.cpp new file mode 100644 index 00000000..76fc9e7e --- /dev/null +++ b/src/odr/internal/pdf/pdf_function.cpp @@ -0,0 +1,632 @@ +#include + +#include + +#include +#include +#include +#include +#include + +namespace odr::internal::pdf { + +namespace { + +/// Read an array of numbers from a dictionary entry, or `{}` when absent. +std::vector read_numbers(const Dictionary &dict, const char *key) { + std::vector result; + if (const Object &value = dict.get(key); value.is_array()) { + const Array &array = value.as_array(); + result.reserve(array.size()); + for (const Object &item : array) { + result.push_back(item.as_real()); + } + } + return result; +} + +double clamp(const double v, const double lo, const double hi) { + return std::clamp(v, std::min(lo, hi), std::max(lo, hi)); +} + +/// Affine map of `x` from `[a0, a1]` onto `[b0, b1]` (ISO 32000-1's +/// "Interpolate" — used for the sampled-function encode/decode and the +/// stitching encode). +double interpolate(const double x, const double a0, const double a1, + const double b0, const double b1) { + if (a1 == a0) { + return b0; + } + return b0 + (x - a0) * (b1 - b0) / (a1 - a0); +} + +// --- type 2: exponential interpolation (ISO 32000-1 7.10.3) ---------------- + +class ExponentialFunction final : public Function { +public: + ExponentialFunction(std::vector domain, std::vector range, + std::vector c0, std::vector c1, double n) + : Function(std::move(domain), std::move(range)), m_c0{std::move(c0)}, + m_c1{std::move(c1)}, m_n{n} {} + + std::vector compute(const std::vector &in) const override { + const double x = in.empty() ? 0.0 : in[0]; + const double xn = std::pow(x, m_n); + std::vector out(m_c0.size()); + for (std::size_t j = 0; j < m_c0.size(); ++j) { + out[j] = m_c0[j] + xn * (m_c1[j] - m_c0[j]); + } + return out; + } + +private: + std::vector m_c0; + std::vector m_c1; + double m_n; +}; + +// --- type 3: stitching (ISO 32000-1 7.10.4) -------------------------------- + +class StitchingFunction final : public Function { +public: + StitchingFunction(std::vector domain, std::vector range, + std::vector> functions, + std::vector bounds, std::vector encode) + : Function(std::move(domain), std::move(range)), + m_functions{std::move(functions)}, m_bounds{std::move(bounds)}, + m_encode{std::move(encode)} {} + + std::vector compute(const std::vector &in) const override { + if (m_functions.empty()) { + return {}; + } + const double d0 = m_domain[0]; + const double d1 = m_domain[1]; + const double x = in.empty() ? d0 : in[0]; + + // Locate the subinterval: bounds[i-1] <= x < bounds[i]. + std::size_t k = 0; + while (k < m_bounds.size() && x >= m_bounds[k]) { + ++k; + } + const double lo = k == 0 ? d0 : m_bounds[k - 1]; + const double hi = k < m_bounds.size() ? m_bounds[k] : d1; + const double e0 = m_encode[2 * k]; + const double e1 = m_encode[2 * k + 1]; + const double encoded = interpolate(x, lo, hi, e0, e1); + + if (m_functions[k] == nullptr) { + return {}; + } + return m_functions[k]->eval({encoded}); + } + +private: + std::vector> m_functions; + std::vector m_bounds; + std::vector m_encode; +}; + +// --- type 0: sampled (ISO 32000-1 7.10.2) ---------------------------------- + +class SampledFunction final : public Function { +public: + SampledFunction(std::vector domain, std::vector range, + std::vector size, int bits_per_sample, + std::vector encode, std::vector decode, + std::string samples) + : Function(std::move(domain), std::move(range)), m_size{std::move(size)}, + m_bits{bits_per_sample}, m_encode{std::move(encode)}, + m_decode{std::move(decode)}, m_samples{std::move(samples)} {} + + std::vector compute(const std::vector &in) const override { + const int m = static_cast(m_size.size()); + const int n = output_arity(); + if (m == 0 || n == 0) { + return {}; + } + + // Encode each input coordinate into its sample grid [0, size_i - 1]. + std::vector e(m); + for (int i = 0; i < m; ++i) { + const std::size_t d = 2 * static_cast(i); + const double x = i < static_cast(in.size()) ? in[i] : m_domain[d]; + const double enc = interpolate(x, m_domain[d], m_domain[d + 1], + m_encode[d], m_encode[d + 1]); + e[i] = clamp(enc, 0.0, m_size[i] - 1.0); + } + + // Multilinear interpolation across the 2^m surrounding grid corners. + std::vector base(m); + std::vector frac(m); + for (int i = 0; i < m; ++i) { + const int floor_i = + std::min(static_cast(std::floor(e[i])), m_size[i] - 1); + base[i] = floor_i; + frac[i] = e[i] - floor_i; + } + + std::vector out(n, 0.0); + const int corners = 1 << m; + for (int c = 0; c < corners; ++c) { + double weight = 1.0; + std::vector coord(m); + for (int i = 0; i < m; ++i) { + const bool high = (c >> i) & 1; + int ci = base[i] + (high ? 1 : 0); + ci = std::min(ci, m_size[i] - 1); + coord[i] = ci; + weight *= high ? frac[i] : (1.0 - frac[i]); + } + if (weight == 0.0) { + continue; + } + const std::size_t index = sample_index(coord); + for (int j = 0; j < n; ++j) { + out[j] += weight * raw_sample(index * n + j); + } + } + + // Decode each output from [0, 2^bits - 1] onto its Decode range. + const double max_value = std::ldexp(1.0, m_bits) - 1.0; + for (int j = 0; j < n; ++j) { + const std::size_t d = 2 * static_cast(j); + out[j] = + interpolate(out[j], 0.0, max_value, m_decode[d], m_decode[d + 1]); + } + return out; + } + +private: + [[nodiscard]] std::size_t sample_index(const std::vector &coord) const { + std::size_t index = 0; + std::size_t stride = 1; + for (std::size_t i = 0; i < coord.size(); ++i) { + index += static_cast(coord[i]) * stride; + stride *= static_cast(m_size[i]); + } + return index; + } + + /// The `k`-th `m_bits`-wide unsigned sample, MSB-first (ISO 32000-1 7.10.2). + [[nodiscard]] double raw_sample(const std::size_t k) const { + const std::size_t bit_offset = k * static_cast(m_bits); + std::uint64_t value = 0; + for (int i = 0; i < m_bits; ++i) { + const std::size_t bit = bit_offset + i; + const std::size_t byte = bit / 8; + int sample_bit = 0; + if (byte < m_samples.size()) { + const auto b = static_cast(m_samples[byte]); + sample_bit = (b >> (7 - bit % 8)) & 1; + } + value = (value << 1) | static_cast(sample_bit); + } + return static_cast(value); + } + + std::vector m_size; + int m_bits; + std::vector m_encode; + std::vector m_decode; + std::string m_samples; +}; + +// --- type 4: PostScript calculator (ISO 32000-1 7.10.5) -------------------- + +/// One token of a type-4 program: a literal number, an operator name, or a +/// nested `{ ... }` procedure block (used by `if`/`ifelse`). +struct PostScriptItem { + enum class Kind { number, op, block }; + Kind kind{Kind::op}; + double number{0}; + std::string op; + std::vector block; +}; + +class PostScriptFunction final : public Function { +public: + PostScriptFunction(std::vector domain, std::vector range, + std::vector program) + : Function(std::move(domain), std::move(range)), + m_program{std::move(program)} {} + + std::vector compute(const std::vector &in) const override { + std::vector stack; + stack.reserve(in.size()); + for (const double x : in) { + stack.emplace_back(x); + } + try { + run(m_program, stack); + } catch (const std::exception &) { + return std::vector(output_arity(), 0.0); + } + + const int n = output_arity(); + std::vector out(static_cast(n), 0.0); + // The function leaves n results on the stack, the last output on top. + for (int j = n - 1; j >= 0 && !stack.empty(); --j) { + out[static_cast(j)] = std::get(stack.back()); + stack.pop_back(); + } + return out; + } + +private: + using Item = std::variant *>; + + static double pop_number(std::vector &s) { + if (s.empty()) { + throw std::runtime_error("stack underflow"); + } + const double v = std::get(s.back()); + s.pop_back(); + return v; + } + + static const std::vector *pop_block(std::vector &s) { + if (s.empty()) { + throw std::runtime_error("stack underflow"); + } + const auto *b = std::get *>(s.back()); + s.pop_back(); + return b; + } + + static constexpr double deg = 180.0 / M_PI; + + static void run(const std::vector &program, + std::vector &s) { + for (const PostScriptItem &item : program) { + switch (item.kind) { + case PostScriptItem::Kind::number: + s.emplace_back(item.number); + break; + case PostScriptItem::Kind::block: + s.emplace_back(&item.block); + break; + case PostScriptItem::Kind::op: + run_op(item.op, s); + break; + } + } + } + + static void run_op(const std::string &op, std::vector &s) { + const auto unary = [&](double (*f)(double)) { + s.emplace_back(f(pop_number(s))); + }; + const auto binary = [&](double (*f)(double, double)) { + const double b = pop_number(s); + const double a = pop_number(s); + s.emplace_back(f(a, b)); + }; + + if (op == "add") { + binary([](double a, double b) { return a + b; }); + } else if (op == "sub") { + binary([](double a, double b) { return a - b; }); + } else if (op == "mul") { + binary([](double a, double b) { return a * b; }); + } else if (op == "div") { + binary([](double a, double b) { return b == 0 ? 0.0 : a / b; }); + } else if (op == "idiv") { + binary([](double a, double b) { + if (b == 0) { + return 0.0; + } + // NOLINTNEXTLINE(bugprone-integer-division): idiv is integer division + return static_cast(static_cast(a) / static_cast(b)); + }); + } else if (op == "mod") { + binary([](double a, double b) { + return b == 0 ? 0.0 + : static_cast(static_cast(a) % + static_cast(b)); + }); + } else if (op == "neg") { + unary([](double a) { return -a; }); + } else if (op == "abs") { + unary([](double a) { return std::abs(a); }); + } else if (op == "sqrt") { + unary([](double a) { return std::sqrt(std::max(0.0, a)); }); + } else if (op == "sin") { + unary([](double a) { return std::sin(a / deg); }); + } else if (op == "cos") { + unary([](double a) { return std::cos(a / deg); }); + } else if (op == "atan") { + const double den = pop_number(s); + const double num = pop_number(s); + double a = std::atan2(num, den) * deg; + if (a < 0) { + a += 360; + } + s.emplace_back(a); + } else if (op == "exp") { + binary([](double a, double b) { return std::pow(a, b); }); + } else if (op == "ln") { + unary([](double a) { return std::log(std::max(1e-12, a)); }); + } else if (op == "log") { + unary([](double a) { return std::log10(std::max(1e-12, a)); }); + } else if (op == "cvi" || op == "truncate") { + unary([](double a) { return std::trunc(a); }); + } else if (op == "cvr") { + // no-op: every value is already real + } else if (op == "floor") { + unary([](double a) { return std::floor(a); }); + } else if (op == "ceiling") { + unary([](double a) { return std::ceil(a); }); + } else if (op == "round") { + unary([](double a) { return std::round(a); }); + } else if (op == "eq") { + binary([](double a, double b) { return a == b ? 1.0 : 0.0; }); + } else if (op == "ne") { + binary([](double a, double b) { return a != b ? 1.0 : 0.0; }); + } else if (op == "gt") { + binary([](double a, double b) { return a > b ? 1.0 : 0.0; }); + } else if (op == "ge") { + binary([](double a, double b) { return a >= b ? 1.0 : 0.0; }); + } else if (op == "lt") { + binary([](double a, double b) { return a < b ? 1.0 : 0.0; }); + } else if (op == "le") { + binary([](double a, double b) { return a <= b ? 1.0 : 0.0; }); + } else if (op == "and") { + bitwise_or_logical( + s, [](long a, long b) { return a & b; }, + [](bool a, bool b) { return a && b; }); + } else if (op == "or") { + bitwise_or_logical( + s, [](long a, long b) { return a | b; }, + [](bool a, bool b) { return a || b; }); + } else if (op == "xor") { + bitwise_or_logical( + s, [](long a, long b) { return a ^ b; }, + [](bool a, bool b) { return a != b; }); + } else if (op == "not") { + const double a = pop_number(s); + if (a == 0.0 || a == 1.0) { + s.emplace_back(a == 0.0 ? 1.0 : 0.0); + } else { + s.emplace_back(static_cast(~static_cast(a))); + } + } else if (op == "bitshift") { + const auto shift = static_cast(pop_number(s)); + const auto value = static_cast(pop_number(s)); + s.emplace_back( + static_cast(shift >= 0 ? value << shift : value >> -shift)); + } else if (op == "true") { + s.emplace_back(1.0); + } else if (op == "false") { + s.emplace_back(0.0); + } else if (op == "pop") { + pop_number(s); + } else if (op == "exch") { + const Item b = s.at(s.size() - 1); + const Item a = s.at(s.size() - 2); + s[s.size() - 1] = a; + s[s.size() - 2] = b; + } else if (op == "dup") { + s.push_back(s.back()); + } else if (op == "copy") { + const auto count = static_cast(pop_number(s)); + const std::size_t start = s.size() - count; + for (std::size_t i = 0; i < count; ++i) { + s.push_back(s[start + i]); + } + } else if (op == "index") { + const auto i = static_cast(pop_number(s)); + s.push_back(s.at(s.size() - 1 - i)); + } else if (op == "roll") { + const auto j = static_cast(pop_number(s)); + const auto count = static_cast(pop_number(s)); + if (count > 0) { + const auto first = s.end() - count; + long shift = ((j % count) + count) % count; + std::rotate(first, s.end() - shift, s.end()); + } + } else if (op == "if") { + const auto *proc = pop_block(s); + const double cond = pop_number(s); + if (cond != 0.0) { + run(*proc, s); + } + } else if (op == "ifelse") { + const auto *proc2 = pop_block(s); + const auto *proc1 = pop_block(s); + const double cond = pop_number(s); + run(cond != 0.0 ? *proc1 : *proc2, s); + } else { + throw std::runtime_error("unknown PostScript operator: " + op); + } + } + + template + static void bitwise_or_logical(std::vector &s, IntOp int_op, + BoolOp bool_op) { + const double b = pop_number(s); + const double a = pop_number(s); + const bool boolean = (a == 0.0 || a == 1.0) && (b == 0.0 || b == 1.0); + if (boolean) { + s.emplace_back(bool_op(a != 0.0, b != 0.0) ? 1.0 : 0.0); + } else { + s.emplace_back(static_cast( + int_op(static_cast(a), static_cast(b)))); + } + } + + std::vector m_program; +}; + +/// Tokenize a type-4 program body into a nested item tree. `pos` advances past +/// the consumed text; a `{` recurses and the matching `}` returns. +std::vector parse_postscript(const std::string &text, + std::size_t &pos) { + std::vector items; + while (pos < text.size()) { + const char c = text[pos]; + if (std::isspace(static_cast(c)) != 0) { + ++pos; + } else if (c == '{') { + ++pos; + PostScriptItem item; + item.kind = PostScriptItem::Kind::block; + item.block = parse_postscript(text, pos); + items.push_back(std::move(item)); + } else if (c == '}') { + ++pos; + return items; + } else if ((std::isdigit(static_cast(c)) != 0) || c == '-' || + c == '+' || c == '.') { + std::size_t end = pos; + while (end < text.size() && + (std::isdigit(static_cast(text[end])) != 0 || + text[end] == '-' || text[end] == '+' || text[end] == '.' || + text[end] == 'e' || text[end] == 'E')) { + ++end; + } + PostScriptItem item; + item.kind = PostScriptItem::Kind::number; + item.number = std::stod(text.substr(pos, end - pos)); + items.push_back(std::move(item)); + pos = end; + } else { + std::size_t end = pos; + while (end < text.size() && + std::isalpha(static_cast(text[end])) != 0) { + ++end; + } + if (end == pos) { + ++pos; // skip an unexpected character + continue; + } + PostScriptItem item; + item.kind = PostScriptItem::Kind::op; + item.op = text.substr(pos, end - pos); + items.push_back(std::move(item)); + pos = end; + } + } + return items; +} + +} // namespace + +std::vector Function::eval(std::vector in) const { + const int m = input_arity(); + in.resize(static_cast(m), 0.0); + for (int i = 0; i < m; ++i) { + const std::size_t d = 2 * static_cast(i); + in[i] = clamp(in[i], m_domain[d], m_domain[d + 1]); + } + std::vector out = compute(in); + const int n = output_arity(); + if (n != 0) { + out.resize(static_cast(n), 0.0); + for (int j = 0; j < n; ++j) { + const std::size_t d = 2 * static_cast(j); + out[j] = clamp(out[j], m_range[d], m_range[d + 1]); + } + } + return out; +} + +std::shared_ptr parse_function(const Object &object, + const FunctionContext &context) { + const Object resolved = context.resolve(object); + if (!resolved.is_dictionary()) { + return nullptr; + } + const Dictionary &dict = resolved.as_dictionary(); + if (!dict.get("FunctionType").is_integer()) { + return nullptr; + } + const int type = static_cast(dict.get("FunctionType").as_integer()); + + std::vector domain = read_numbers(dict, "Domain"); + std::vector range = read_numbers(dict, "Range"); + + switch (type) { + case 2: { + std::vector c0 = read_numbers(dict, "C0"); + std::vector c1 = read_numbers(dict, "C1"); + if (c0.empty()) { + c0 = {0.0}; + } + if (c1.empty()) { + c1 = {1.0}; + } + const double n = dict.get("N").as_real(); + if (domain.empty()) { + domain = {0.0, 1.0}; + } + return std::make_shared( + std::move(domain), std::move(range), std::move(c0), std::move(c1), n); + } + case 3: { + std::vector> functions; + if (const Object &fns = dict.get("Functions"); fns.is_array()) { + for (const Object &fn : fns.as_array()) { + functions.push_back(parse_function(fn, context)); + } + } + std::vector bounds = read_numbers(dict, "Bounds"); + std::vector encode = read_numbers(dict, "Encode"); + if (domain.empty()) { + domain = {0.0, 1.0}; + } + return std::make_shared( + std::move(domain), std::move(range), std::move(functions), + std::move(bounds), std::move(encode)); + } + case 0: { + std::vector size; + if (const Object &s = dict.get("Size"); s.is_array()) { + for (const Object &item : s.as_array()) { + size.push_back(static_cast(item.as_integer())); + } + } + const int bits = static_cast(dict.get("BitsPerSample").as_integer()); + std::vector encode = read_numbers(dict, "Encode"); + if (encode.empty()) { + for (const int dim : size) { + encode.push_back(0.0); + encode.push_back(dim - 1.0); + } + } + std::vector decode = read_numbers(dict, "Decode"); + if (decode.empty()) { + decode = range; + } + std::string samples = context.load_stream(object); + if (size.empty() || bits == 0 || range.empty()) { + return nullptr; + } + return std::make_shared( + std::move(domain), std::move(range), std::move(size), bits, + std::move(encode), std::move(decode), std::move(samples)); + } + case 4: { + const std::string program = context.load_stream(object); + std::size_t pos = 0; + std::vector items = parse_postscript(program, pos); + // Unwrap the outer `{ ... }` block so the program runs at top level. Move + // the inner block out to a local first: assigning it back into `items` + // directly would free the vector the source still lives in. + if (items.size() == 1 && + items.front().kind == PostScriptItem::Kind::block) { + std::vector body = std::move(items.front().block); + items = std::move(body); + } + if (domain.empty() || range.empty()) { + return nullptr; + } + return std::make_shared( + std::move(domain), std::move(range), std::move(items)); + } + default: + return nullptr; + } +} + +} // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_function.hpp b/src/odr/internal/pdf/pdf_function.hpp new file mode 100644 index 00000000..1a50fc26 --- /dev/null +++ b/src/odr/internal/pdf/pdf_function.hpp @@ -0,0 +1,62 @@ +#pragma once + +#include +#include +#include + +namespace odr::internal::pdf { + +class Object; + +/// A PDF function object (ISO 32000-1 7.10): a black-box mapping from `m` input +/// values to `n` output values. The four sampled/analytic flavours +/// (type 0 sampled, type 2 exponential, type 3 stitching, type 4 PostScript +/// calculator) share one interface. Inputs are clipped to `/Domain` and outputs +/// to `/Range` (when declared) by `eval`; the type-specific math lives in the +/// concrete subclasses. +class Function { +public: + virtual ~Function() = default; + + /// Evaluate the function: clip `in` to the domain, compute, clip the result + /// to the range (when one is declared). `in` should carry `input_arity` + /// values; a short input is zero-padded, a long one truncated. + [[nodiscard]] std::vector eval(std::vector in) const; + + [[nodiscard]] int input_arity() const { + return static_cast(m_domain.size() / 2); + } + /// Declared output arity, or 0 when the function carries no `/Range` (only + /// type 0 and 4 must; type 2/3 may omit it). + [[nodiscard]] int output_arity() const { + return static_cast(m_range.size() / 2); + } + +protected: + Function(std::vector domain, std::vector range) + : m_domain{std::move(domain)}, m_range{std::move(range)} {} + + [[nodiscard]] virtual std::vector + compute(const std::vector &in) const = 0; + + std::vector m_domain; // [min0 max0 min1 max1 ...] + std::vector m_range; // [min0 max0 ...], possibly empty +}; + +/// How `parse_function` reaches data behind indirect references: `resolve` +/// dereferences an object to a direct value (identity for a direct one); +/// `load_stream` returns the filter-decoded bytes of a stream object (used by +/// the sampled type 0 and the PostScript type 4). Decoupling these from the +/// document parser keeps `Function` pure and unit-testable. +struct FunctionContext { + std::function resolve; + std::function load_stream; +}; + +/// Build a function from its PDF object (a dictionary, or a stream for type +/// 0/4; may be an indirect reference). Returns `nullptr` for an unsupported or +/// malformed function type. +std::shared_ptr parse_function(const Object &object, + const FunctionContext &context); + +} // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_graphics_state.cpp b/src/odr/internal/pdf/pdf_graphics_state.cpp index ea8eb246..0c0e6d87 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.cpp +++ b/src/odr/internal/pdf/pdf_graphics_state.cpp @@ -1,24 +1,11 @@ #include #include -#include - -#include namespace odr::internal::pdf { namespace { -ColorSpace color_space_name_to_enum(const std::string &name) { - static std::unordered_map mapping{ - {"grey", ColorSpace::device_grey}, - {"rgb", ColorSpace::device_rgb}, - {"cmyk", ColorSpace::device_cmyk}, - }; - - return util::map::lookup_default(mapping, name, ColorSpace::unknown); -} - util::math::Transform2D matrix_from_args(const GraphicsOperator &op) { return {op.arguments.at(0).as_real(), op.arguments.at(1).as_real(), op.arguments.at(2).as_real(), op.arguments.at(3).as_real(), @@ -129,6 +116,28 @@ void GraphicsState::clear_path() { m_subpath_start = {0, 0}; } +void GraphicsState::set_pending_clip(const bool even_odd) { + m_pending_clip = even_odd ? PendingClip::even_odd : PendingClip::nonzero; +} + +void GraphicsState::commit_clip() { + if (m_pending_clip == PendingClip::none) { + return; + } + current().clip.push_back( + ClipPath{path, m_pending_clip == PendingClip::even_odd}); + m_pending_clip = PendingClip::none; +} + +void GraphicsState::clip_bounding_box(const double x0, const double y0, + const double x1, const double y1) { + Subpath rect{to_user(x0, y0), {}, true}; + rect.segments.push_back({PathSegment::Kind::line, {}, {}, to_user(x1, y0)}); + rect.segments.push_back({PathSegment::Kind::line, {}, {}, to_user(x1, y1)}); + rect.segments.push_back({PathSegment::Kind::line, {}, {}, to_user(x0, y1)}); + current().clip.push_back(ClipPath{{std::move(rect)}, false}); +} + void GraphicsState::save() { stack.push_back(stack.back()); } void GraphicsState::restore() { stack.pop_back(); } @@ -216,6 +225,13 @@ void GraphicsState::execute(const GraphicsOperator &op) { op.arguments.at(2).as_real(), op.arguments.at(3).as_real()); break; + case GraphicsOperatorType::set_clipping_nonzero: // W + set_pending_clip(false); + break; + case GraphicsOperatorType::set_clipping_evenodd: // W* + set_pending_clip(true); + break; + case GraphicsOperatorType::set_text_char_spacing: current().text.char_spacing = op.arguments.at(0).as_real(); break; @@ -256,10 +272,9 @@ void GraphicsState::execute(const GraphicsOperator &op) { current().text.matrix = matrix_from_args(op); current().text.line_matrix = current().text.matrix; break; - case GraphicsOperatorType::text_next_line: // T* - next_line(0, -current().text.leading); - break; - case GraphicsOperatorType::show_text_next_line: // ' : T* then show + case GraphicsOperatorType::text_next_line: // T* + case GraphicsOperatorType::show_text_next_line: // ' : T* then show (in + // extractor) next_line(0, -current().text.leading); break; case GraphicsOperatorType::show_text_next_line_set_spacing: @@ -269,54 +284,49 @@ void GraphicsState::execute(const GraphicsOperator &op) { next_line(0, -current().text.leading); break; - case GraphicsOperatorType::set_stroke_color_space: - current().stroke_color.space = - color_space_name_to_enum(op.arguments.at(0).as_string()); - break; - case GraphicsOperatorType::set_stroke_color: - case GraphicsOperatorType::set_stroke_color_name: - // SC/SCN over a named/ICC/Separation/Pattern color space: stage 4.4. - break; + // The color-space (`cs`/`CS`) and general color (`sc`/`scn`/`SC`/`SCN`) + // operators need the `/ColorSpace` resources to resolve, so they are handled + // in `pdf_page_extractor` (which has the `Resources`), not here. The device + // color operators below carry their components inline and clear any active + // non-device color space. case GraphicsOperatorType::set_stroke_grey_color: current().stroke_color.space = ColorSpace::device_grey; current().stroke_color.grey = op.arguments.at(0).as_real(); + current().stroke_color.def = nullptr; break; case GraphicsOperatorType::set_stroke_rgb_color: current().stroke_color.space = ColorSpace::device_rgb; for (int i = 0; i < 3; ++i) { current().stroke_color.rgb.at(i) = op.arguments.at(i).as_real(); } + current().stroke_color.def = nullptr; break; case GraphicsOperatorType::set_stroke_cmyk_color: current().stroke_color.space = ColorSpace::device_cmyk; for (int i = 0; i < 4; ++i) { current().stroke_color.cmyk.at(i) = op.arguments.at(i).as_real(); } + current().stroke_color.def = nullptr; break; - case GraphicsOperatorType::set_other_color_space: - current().other_color.space = - color_space_name_to_enum(op.arguments.at(0).as_string()); - break; - case GraphicsOperatorType::set_other_color: - case GraphicsOperatorType::set_other_color_name: - // sc/scn over a named/ICC/Separation/Pattern color space: stage 4.4. - break; case GraphicsOperatorType::set_other_grey_color: current().other_color.space = ColorSpace::device_grey; current().other_color.grey = op.arguments.at(0).as_real(); + current().other_color.def = nullptr; break; case GraphicsOperatorType::set_other_rgb_color: current().other_color.space = ColorSpace::device_rgb; for (int i = 0; i < 3; ++i) { current().other_color.rgb.at(i) = op.arguments.at(i).as_real(); } + current().other_color.def = nullptr; break; case GraphicsOperatorType::set_other_cmyk_color: current().other_color.space = ColorSpace::device_cmyk; for (int i = 0; i < 4; ++i) { current().other_color.cmyk.at(i) = op.arguments.at(i).as_real(); } + current().other_color.def = nullptr; break; case GraphicsOperatorType::set_glyph_width: diff --git a/src/odr/internal/pdf/pdf_graphics_state.hpp b/src/odr/internal/pdf/pdf_graphics_state.hpp index 0e9c3c2e..9bf32662 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.hpp +++ b/src/odr/internal/pdf/pdf_graphics_state.hpp @@ -9,6 +9,7 @@ namespace odr::internal::pdf { struct GraphicsOperator; +struct ColorSpaceDef; enum class ColorSpace { unknown, @@ -37,6 +38,15 @@ struct Subpath { bool closed{false}; }; +/// One clipping region (ISO 32000-1 8.5.4): the path established by a `W`/`W*` +/// operator, in user space, plus the winding rule that fills it. The current +/// clip is the *intersection* of an ordered list of these — a path is visible +/// only where it lies inside every one. +struct ClipPath { + std::vector subpaths; + bool even_odd{false}; +}; + struct GraphicsState { /// Dash pattern (`d`): the on/off lengths and the starting phase, in user /// space. An empty array is a solid line (ISO 32000-1 8.4.3.6). @@ -82,6 +92,12 @@ struct GraphicsState { double grey{}; std::array rgb{}; std::array cmyk{}; + /// The active non-device colour space set by `cs`/`CS` (a `/ColorSpace` + /// resource: ICCBased, Separation, Indexed, …), owned by `Resources`. When + /// set, `sc`/`scn` components are converted through it to the `rgb` above + /// at the time the operator runs; null for a device colour space. Cleared + /// by the device colour operators (`g`/`rg`/`k`). + const ColorSpaceDef *def{nullptr}; }; struct State { @@ -89,6 +105,10 @@ struct GraphicsState { Text text; Color stroke_color; Color other_color; + /// Current clipping path: the intersection of these regions (empty = the + /// whole page). Part of the saved/restored state (ISO 32000-1 8.5.4), so + /// `q`/`Q` and form-XObject invocation scope it like the CTM. + std::vector clip; }; std::vector stack; @@ -122,6 +142,18 @@ struct GraphicsState { /// the accumulated path, as every path-painting operator does on completion. void clear_path(); + /// Clipping (`W`/`W*`, ISO 32000-1 8.5.4). `set_pending_clip` records that + /// the current path is to *become* a clip; `commit_clip` then installs it — + /// as the intersection with the current clip — when the next painting (or + /// `n`) operator completes. The path painted by that operator is still + /// clipped by the *old* clip, so the caller snapshots the clip before calling + /// `commit_clip`. + void set_pending_clip(bool even_odd); + void commit_clip(); + /// Intersect a rectangle (in the current CTM's space, e.g. a form's `/BBox`) + /// into the current clip; the corners are mapped through the CTM. + void clip_bounding_box(double x0, double y0, double x1, double y1); + /// Push a copy of the current state (`q`). void save(); /// Pop the current state (`Q`). @@ -154,6 +186,12 @@ struct GraphicsState { std::array m_current_point{0, 0}; // user space std::array m_subpath_start{0, 0}; // user space, for `h`/close + + /// A pending `W`/`W*` between path construction and the painting operator + /// that installs it. Not part of the saved state: a `W` is always followed by + /// a painting/`n` operator before any `q`/`Q` (ISO 32000-1 8.5.4). + enum class PendingClip { none, nonzero, even_odd }; + PendingClip m_pending_clip{PendingClip::none}; }; } // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_page_element.hpp b/src/odr/internal/pdf/pdf_page_element.hpp index 80cd4be7..b071c22d 100644 --- a/src/odr/internal/pdf/pdf_page_element.hpp +++ b/src/odr/internal/pdf/pdf_page_element.hpp @@ -59,11 +59,15 @@ struct TextElement { /// space. The geometry is fully resolved (the CTM applied at construction), so /// a renderer maps it through the page transform alone. The paint intent and /// the paint-relevant graphics state are snapshotted; colors are kept as PDF -/// device colors and converted to RGB by the renderer. `/Pattern` color and -/// clipping are stage 4.3+ and not yet represented. +/// device colors and converted to RGB by the renderer. `/Pattern` color is +/// stage 4.9+ and not yet represented. struct PathElement { /// The subpaths to paint, in user space. std::vector subpaths; + /// The clip in force when this path was painted: the intersection of these + /// regions (empty = unclipped). Snapshotted from the graphics state at paint + /// time so the renderer can install it without replaying the clip stack. + std::vector clip; bool fill{false}; bool stroke{false}; /// Fill rule: false = nonzero winding, true = even-odd. diff --git a/src/odr/internal/pdf/pdf_page_extractor.cpp b/src/odr/internal/pdf/pdf_page_extractor.cpp index 4b265aac..ce069371 100644 --- a/src/odr/internal/pdf/pdf_page_extractor.cpp +++ b/src/odr/internal/pdf/pdf_page_extractor.cpp @@ -2,6 +2,7 @@ #include +#include #include #include #include @@ -307,6 +308,70 @@ void show(std::vector &out, GraphicsState &state, pen = Pen{{after.e, after.f}, direction, text.size * basis, trailing_space}; } +/// Resolve a colour-space operator (`cs`/`CS`): set the active colour space on +/// `color` from the resource `/ColorSpace` table (or a device space named +/// inline) and reset it to the space's initial colour (ISO 32000-1 8.6.8). +void set_color_space(GraphicsState::Color &color, const std::string &name, + const Resources &resources) { + color = GraphicsState::Color{}; + if (name == "DeviceGray" || name == "G") { + color.space = ColorSpace::device_grey; + } else if (name == "DeviceRGB" || name == "RGB") { + color.space = ColorSpace::device_rgb; + } else if (name == "DeviceCMYK" || name == "CMYK") { + color.space = ColorSpace::device_cmyk; + color.cmyk = {0, 0, 0, 1}; // initial DeviceCMYK colour is black + } else if (const auto it = resources.color_space.find(name); + it != resources.color_space.end() && it->second != nullptr) { + color.def = it->second.get(); + color.space = ColorSpace::device_rgb; + color.rgb = color.def->to_rgb(color.def->initial_components()); + } else { + color.space = ColorSpace::unknown; + } +} + +/// Resolve a general colour operator (`sc`/`scn`/`SC`/`SCN`): convert the +/// operand components through the active colour space to RGB. With no resource +/// colour space, interpret the components as a device colour by their count +/// (ISO 32000-1 8.6.8). A trailing name operand (a `/Pattern`) carries no +/// convertible components — left as-is (stage 4.9/4.10). +void set_color(GraphicsState::Color &color, const GraphicsOperator &op) { + std::vector components; + bool has_pattern_name = false; + for (const Object &argument : op.arguments) { + if (argument.is_name()) { + has_pattern_name = true; + } else if (argument.is_real()) { + components.push_back(argument.as_real()); + } + } + if (color.def != nullptr) { + color.space = ColorSpace::device_rgb; + color.rgb = color.def->to_rgb(components); + return; + } + if (has_pattern_name) { + return; + } + switch (components.size()) { + case 1: + color.space = ColorSpace::device_grey; + color.grey = components[0]; + break; + case 3: + color.space = ColorSpace::device_rgb; + color.rgb = {components[0], components[1], components[2]}; + break; + case 4: + color.space = ColorSpace::device_cmyk; + color.cmyk = {components[0], components[1], components[2], components[3]}; + break; + default: + break; + } +} + /// Emit a path-painting element from the path accumulated in `state` and the /// current paint state, then clear the path (as every painting operator does). /// `close` first closes the current subpath (the `s`/`b`/`b*` variants). @@ -318,6 +383,9 @@ void paint_path(std::vector &out, GraphicsState &state, bool fill, const GraphicsState::State &s = state.current(); PathElement element; element.subpaths = state.path; + // The path is painted under the clip in force *before* a pending `W`/`W*` is + // installed (ISO 32000-1 8.5.4): snapshot the current clip, then commit. + element.clip = s.clip; element.fill = fill; element.stroke = stroke; element.even_odd = even_odd; @@ -339,6 +407,9 @@ void paint_path(std::vector &out, GraphicsState &state, bool fill, } element.dash_phase = s.general.dash.phase * scale; out.push_back(std::move(element)); + // Install a pending `W`/`W*` now (it uses the just-painted path) so it scopes + // the *following* content, then drop the path. + state.commit_clip(); state.clear_path(); } @@ -405,6 +476,13 @@ void invoke_x_object(const std::string &name, const Resources &resources, state.save(); state.concat_matrix(x_object->matrix); + // `/BBox` clips the form's content to its bounding box (ISO 32000-1 8.10.2), + // mapped through the (now form-matrix-concatenated) CTM. Scoped by the + // surrounding save/restore. + if (x_object->bbox.has_value()) { + const std::array &b = *x_object->bbox; + state.clip_bounding_box(b[0], b[1], b[2], b[3]); + } const Resources &scope = x_object->resources != nullptr ? *x_object->resources : resources; // A form's marked content must be self-balanced; truncate back to the entry @@ -487,10 +565,37 @@ void run_content(const std::string &content, const Resources &resources, paint_path(out, state, true, true, true, true); break; case GraphicsOperatorType::end_path: // n - // Path painted with no marks (used after a clip operator, stage 4.3); - // discard the geometry. + // Path painted with no marks — its only role is to install a pending + // `W`/`W*` clip (ISO 32000-1 8.5.4); commit it, then discard the + // geometry. + state.commit_clip(); state.clear_path(); break; + + // Colour-space and general-colour operators are resolved here (not in + // `GraphicsState::execute`) because they consult the `/ColorSpace` + // resources. The device colour operators stay in `execute`. + case GraphicsOperatorType::set_other_color_space: // cs + if (!op.arguments.empty()) { + set_color_space(state.current().other_color, + op.arguments.at(0).as_string(), resources); + } + break; + case GraphicsOperatorType::set_stroke_color_space: // CS + if (!op.arguments.empty()) { + set_color_space(state.current().stroke_color, + op.arguments.at(0).as_string(), resources); + } + break; + case GraphicsOperatorType::set_other_color: // sc + case GraphicsOperatorType::set_other_color_name: // scn + set_color(state.current().other_color, op); + break; + case GraphicsOperatorType::set_stroke_color: // SC + case GraphicsOperatorType::set_stroke_color_name: // SCN + set_color(state.current().stroke_color, op); + break; + case GraphicsOperatorType::begin_marked_content_seq: // BMC case GraphicsOperatorType::begin_marked_content_seq_props: // BDC begin_marked_content(op, resources, marked); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 415fa6e0..65452585 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -41,6 +41,7 @@ add_executable(odr_test "src/internal/pdf/pdf_cid.cpp" "src/internal/pdf/pdf_cmap.cpp" + "src/internal/pdf/pdf_color.cpp" "src/internal/pdf/pdf_document_parser.cpp" "src/internal/pdf/pdf_encoding.cpp" "src/internal/pdf/pdf_encryption.cpp" @@ -48,6 +49,7 @@ add_executable(odr_test "src/internal/pdf/pdf_file_parser.cpp" "src/internal/pdf/pdf_filter.cpp" "src/internal/pdf/pdf_font.cpp" + "src/internal/pdf/pdf_function.cpp" "src/internal/util/math_util_test.cpp" "src/internal/pdf/pdf_object.cpp" "src/internal/pdf/pdf_object_parser.cpp" diff --git a/test/src/internal/pdf/pdf_color.cpp b/test/src/internal/pdf/pdf_color.cpp new file mode 100644 index 00000000..41e539b2 --- /dev/null +++ b/test/src/internal/pdf/pdf_color.cpp @@ -0,0 +1,140 @@ +#include + +#include + +#include +#include +#include + +#include + +using namespace odr::internal::pdf; + +namespace { + +ColorSpaceContext context() { + ColorSpaceContext ctx; + ctx.resolve = [](const Object &object) { return object; }; + ctx.load_stream = [](const Object &) { return std::string{}; }; + ctx.named = nullptr; + return ctx; +} + +Object reals(std::initializer_list values) { + std::vector holder; + for (const double value : values) { + holder.emplace_back(Real{value}); + } + return Object(Array(std::move(holder))); +} + +ColorSpaceDef device(const ColorSpaceKind kind, const int components) { + ColorSpaceDef def; + def.kind = kind; + def.components = components; + return def; +} + +} // namespace + +// The device spaces convert as expected (CMYK by the naive formula). +TEST(PdfColor, device_spaces) { + EXPECT_EQ(device(ColorSpaceKind::device_gray, 1).to_rgb({0.5}), + (std::array{0.5, 0.5, 0.5})); + EXPECT_EQ(device(ColorSpaceKind::device_rgb, 3).to_rgb({0.2, 0.4, 0.6}), + (std::array{0.2, 0.4, 0.6})); + EXPECT_EQ(device(ColorSpaceKind::device_cmyk, 4).to_rgb({0, 0, 0, 0}), + (std::array{1, 1, 1})); + EXPECT_EQ(device(ColorSpaceKind::device_cmyk, 4).to_rgb({0, 0, 0, 1}), + (std::array{0, 0, 0})); +} + +// L*a*b* maps the lightness extremes to white and black under the default +// (D65) white point. +TEST(PdfColor, lab_extremes) { + ColorSpaceDef lab = device(ColorSpaceKind::lab, 3); + const std::array white = lab.to_rgb({100, 0, 0}); + EXPECT_NEAR(white[0], 1.0, 0.02); + EXPECT_NEAR(white[1], 1.0, 0.02); + EXPECT_NEAR(white[2], 1.0, 0.02); + const std::array black = lab.to_rgb({0, 0, 0}); + EXPECT_NEAR(black[0], 0.0, 0.02); + EXPECT_NEAR(black[1], 0.0, 0.02); + EXPECT_NEAR(black[2], 0.0, 0.02); +} + +// ICCBased approximates by its component count when no engine is present: +// N = 3 behaves as DeviceRGB. +TEST(PdfColor, iccbased_by_component_count) { + Dictionary stream_dict; + stream_dict["N"] = Object(Integer{3}); + std::vector array{Object(Name{"ICCBased"}), Object(stream_dict)}; + + const auto def = + parse_color_space(Object(Array(std::move(array))), context()); + ASSERT_NE(def, nullptr); + EXPECT_EQ(def->kind, ColorSpaceKind::icc_based); + EXPECT_EQ(def->components, 3); + EXPECT_EQ(def->to_rgb({0.2, 0.4, 0.6}), + (std::array{0.2, 0.4, 0.6})); +} + +// Indexed looks an index up in the palette and converts through the base space. +TEST(PdfColor, indexed_palette) { + // hival 1, base DeviceRGB, palette = [255,0,0, 0,255,0]. + const std::string palette("\xff\x00\x00\x00\xff\x00", 6); + std::vector array{Object(Name{"Indexed"}), Object(Name{"DeviceRGB"}), + Object(Integer{1}), + Object(StandardString(palette))}; + + const auto def = + parse_color_space(Object(Array(std::move(array))), context()); + ASSERT_NE(def, nullptr); + EXPECT_EQ(def->kind, ColorSpaceKind::indexed); + EXPECT_EQ(def->to_rgb({0}), (std::array{1, 0, 0})); + EXPECT_EQ(def->to_rgb({1}), (std::array{0, 1, 0})); +} + +// Separation samples its tint transform, then converts through the alternate. +TEST(PdfColor, separation_tint_transform) { + // tint: type 2, C0 = white, C1 = red, N = 1 -> tint(t) = (1, 1-t, 1-t). + Dictionary tint; + tint["FunctionType"] = Object(Integer{2}); + tint["Domain"] = reals({0, 1}); + tint["C0"] = reals({1, 1, 1}); + tint["C1"] = reals({1, 0, 0}); + tint["N"] = Object(Real{1}); + + std::vector array{Object(Name{"Separation"}), Object(Name{"Spot"}), + Object(Name{"DeviceRGB"}), Object(tint)}; + + const auto def = + parse_color_space(Object(Array(std::move(array))), context()); + ASSERT_NE(def, nullptr); + EXPECT_EQ(def->kind, ColorSpaceKind::separation); + EXPECT_EQ(def->components, 1); + // full tint -> C1 = red + EXPECT_EQ(def->to_rgb({1.0}), (std::array{1, 0, 0})); + // half tint -> (1, 0.5, 0.5) + const std::array half = def->to_rgb({0.5}); + EXPECT_NEAR(half[0], 1.0, 1e-9); + EXPECT_NEAR(half[1], 0.5, 1e-9); + EXPECT_NEAR(half[2], 0.5, 1e-9); +} + +// A colour space's initial component values (ISO 32000-1 8.6.3): zero for the +// device families, full tint for Separation/DeviceN. +TEST(PdfColor, initial_components) { + EXPECT_EQ(device(ColorSpaceKind::device_rgb, 3).initial_components(), + (std::vector{0, 0, 0})); + ColorSpaceDef sep = device(ColorSpaceKind::separation, 1); + EXPECT_EQ(sep.initial_components(), (std::vector{1.0})); +} + +// A name resolves to the matching device space. +TEST(PdfColor, name_resolves_device_space) { + const auto def = parse_color_space(Object(Name{"DeviceCMYK"}), context()); + ASSERT_NE(def, nullptr); + EXPECT_EQ(def->kind, ColorSpaceKind::device_cmyk); + EXPECT_EQ(def->components, 4); +} diff --git a/test/src/internal/pdf/pdf_function.cpp b/test/src/internal/pdf/pdf_function.cpp new file mode 100644 index 00000000..cc61432f --- /dev/null +++ b/test/src/internal/pdf/pdf_function.cpp @@ -0,0 +1,183 @@ +#include + +#include + +#include +#include + +#include + +using namespace odr::internal::pdf; + +namespace { + +// A context with no indirection: objects resolve to themselves, and every +// stream returns the captured bytes (the function under test is the only one). +FunctionContext context(std::string stream = {}) { + FunctionContext ctx; + ctx.resolve = [](const Object &object) { return object; }; + ctx.load_stream = [data = std::move(stream)](const Object &) { return data; }; + return ctx; +} + +Object reals(std::initializer_list values) { + std::vector holder; + for (const double value : values) { + holder.emplace_back(Real{value}); + } + return Object(Array(std::move(holder))); +} + +Object integers(std::initializer_list values) { + std::vector holder; + for (const int value : values) { + holder.emplace_back(Integer{value}); + } + return Object(Array(std::move(holder))); +} + +} // namespace + +// Type 2: linear interpolation between C0 and C1 (N = 1). +TEST(PdfFunction, exponential_linear) { + Dictionary dict; + dict["FunctionType"] = Object(Integer{2}); + dict["Domain"] = reals({0, 1}); + dict["C0"] = reals({0, 0, 0}); + dict["C1"] = reals({1, 0.5, 0}); + dict["N"] = Object(Real{1}); + + const auto fn = parse_function(Object(dict), context()); + ASSERT_NE(fn, nullptr); + const std::vector out = fn->eval({0.5}); + ASSERT_EQ(out.size(), 3); + EXPECT_DOUBLE_EQ(out[0], 0.5); + EXPECT_DOUBLE_EQ(out[1], 0.25); + EXPECT_DOUBLE_EQ(out[2], 0.0); +} + +// Type 2 with N = 2: the input is raised to the exponent before interpolation. +TEST(PdfFunction, exponential_power) { + Dictionary dict; + dict["FunctionType"] = Object(Integer{2}); + dict["Domain"] = reals({0, 1}); + dict["C0"] = reals({0}); + dict["C1"] = reals({1}); + dict["N"] = Object(Real{2}); + + const auto fn = parse_function(Object(dict), context()); + EXPECT_DOUBLE_EQ(fn->eval({0.5})[0], 0.25); +} + +// Inputs outside the domain are clipped before evaluation. +TEST(PdfFunction, clips_domain) { + Dictionary dict; + dict["FunctionType"] = Object(Integer{2}); + dict["Domain"] = reals({0, 1}); + dict["C0"] = reals({0}); + dict["C1"] = reals({1}); + dict["N"] = Object(Real{1}); + + const auto fn = parse_function(Object(dict), context()); + EXPECT_DOUBLE_EQ(fn->eval({2.0})[0], 1.0); // clipped to 1 + EXPECT_DOUBLE_EQ(fn->eval({-1.0})[0], 0.0); // clipped to 0 +} + +// Type 3: stitch two subfunctions at the bound, each over its encoded +// subdomain. +TEST(PdfFunction, stitching) { + Dictionary sub0; + sub0["FunctionType"] = Object(Integer{2}); + sub0["Domain"] = reals({0, 1}); + sub0["C0"] = reals({0}); + sub0["C1"] = reals({1}); + sub0["N"] = Object(Real{1}); + Dictionary sub1; + sub1["FunctionType"] = Object(Integer{2}); + sub1["Domain"] = reals({0, 1}); + sub1["C0"] = reals({1}); + sub1["C1"] = reals({0}); + sub1["N"] = Object(Real{1}); + + std::vector functions{Object(sub0), Object(sub1)}; + Dictionary dict; + dict["FunctionType"] = Object(Integer{3}); + dict["Domain"] = reals({0, 1}); + dict["Functions"] = Object(Array(std::move(functions))); + dict["Bounds"] = reals({0.5}); + dict["Encode"] = reals({0, 1, 0, 1}); + + const auto fn = parse_function(Object(dict), context()); + ASSERT_NE(fn, nullptr); + // 0.25 -> sub0, encoded 0.25/0.5 = 0.5 -> 0.5. + EXPECT_DOUBLE_EQ(fn->eval({0.25})[0], 0.5); + // 0.75 -> sub1, encoded (0.75-0.5)/0.5 = 0.5 -> 1 + 0.5*(0-1) = 0.5. + EXPECT_DOUBLE_EQ(fn->eval({0.75})[0], 0.5); + // endpoints of each segment + EXPECT_DOUBLE_EQ(fn->eval({0.0})[0], 0.0); + EXPECT_DOUBLE_EQ(fn->eval({1.0})[0], 0.0); +} + +// Type 0: a 1-D, 8-bit sample table, linearly interpolated and decoded. +TEST(PdfFunction, sampled_linear) { + Dictionary dict; + dict["FunctionType"] = Object(Integer{0}); + dict["Domain"] = reals({0, 1}); + dict["Range"] = reals({0, 1}); + dict["Size"] = integers({2}); + dict["BitsPerSample"] = Object(Integer{8}); + + const std::string samples("\x00\xff", 2); // sample[0]=0, sample[1]=255 + const auto fn = parse_function(Object(dict), context(samples)); + ASSERT_NE(fn, nullptr); + EXPECT_DOUBLE_EQ(fn->eval({0.0})[0], 0.0); + EXPECT_DOUBLE_EQ(fn->eval({1.0})[0], 1.0); + EXPECT_NEAR(fn->eval({0.5})[0], 0.5, 1e-6); // (0 + 255)/2 / 255 +} + +// Type 4: a PostScript calculator program, one input and one output. +TEST(PdfFunction, postscript_invert) { + Dictionary dict; + dict["FunctionType"] = Object(Integer{4}); + dict["Domain"] = reals({0, 1}); + dict["Range"] = reals({0, 1}); + + const auto fn = parse_function(Object(dict), context("{ 1 exch sub }")); + ASSERT_NE(fn, nullptr); + EXPECT_DOUBLE_EQ(fn->eval({0.25})[0], 0.75); + EXPECT_DOUBLE_EQ(fn->eval({1.0})[0], 0.0); +} + +// Type 4 with two inputs and arithmetic; outputs clipped to the range. +TEST(PdfFunction, postscript_two_inputs) { + Dictionary dict; + dict["FunctionType"] = Object(Integer{4}); + dict["Domain"] = reals({0, 1, 0, 1}); + dict["Range"] = reals({0, 2}); + + const auto fn = parse_function(Object(dict), context("{ add }")); + ASSERT_NE(fn, nullptr); + EXPECT_DOUBLE_EQ(fn->eval({0.3, 0.4})[0], 0.7); +} + +// Type 4 control flow: ifelse selects a branch on a boolean. +TEST(PdfFunction, postscript_ifelse) { + Dictionary dict; + dict["FunctionType"] = Object(Integer{4}); + dict["Domain"] = reals({0, 1}); + dict["Range"] = reals({0, 10}); + + // x 0.5 gt { 9 } { 1 } ifelse -> 9 when x > 0.5, else 1 + const auto fn = + parse_function(Object(dict), context("{ 0.5 gt { 9 } { 1 } ifelse }")); + ASSERT_NE(fn, nullptr); + EXPECT_DOUBLE_EQ(fn->eval({0.8})[0], 9.0); + EXPECT_DOUBLE_EQ(fn->eval({0.2})[0], 1.0); +} + +// An unsupported function type yields a null function rather than throwing. +TEST(PdfFunction, unsupported_type_is_null) { + Dictionary dict; + dict["FunctionType"] = Object(Integer{9}); + EXPECT_EQ(parse_function(Object(dict), context()), nullptr); +} diff --git a/test/src/internal/pdf/pdf_page_extractor.cpp b/test/src/internal/pdf/pdf_page_extractor.cpp index bf3a4e6e..7d6f1fce 100644 --- a/test/src/internal/pdf/pdf_page_extractor.cpp +++ b/test/src/internal/pdf/pdf_page_extractor.cpp @@ -2,8 +2,10 @@ #include +#include #include +#include #include #include #include @@ -695,3 +697,158 @@ TEST(PdfPageExtractor, stroke_dash_reset_to_solid) { const auto page = run_page("[3 2] 0 d [] 0 d 0 0 m 10 0 l S"); EXPECT_TRUE(path_at(page, 0).dash_array.empty()); } + +// --- stage 4.4: colour spaces --------------------------------------------- + +namespace { + +std::shared_ptr rgb_space() { + auto def = std::make_shared(); + def->kind = ColorSpaceKind::device_rgb; + def->components = 3; + return def; +} + +} // namespace + +// `cs`/`scn` over a named resource colour space resolve the fill colour to RGB. +TEST(PdfPageExtractor, scn_resolves_named_color_space) { + Resources res; + res.color_space["CS0"] = rgb_space(); + + const auto page = extract_page("/CS0 cs 0.2 0.4 0.6 scn 0 0 10 10 re f", res, + Logger::null()); + ASSERT_EQ(page.size(), 1); + const PathElement &p = std::get(page[0]); + EXPECT_EQ(p.fill_color.space, ColorSpace::device_rgb); + EXPECT_DOUBLE_EQ(p.fill_color.rgb[0], 0.2); + EXPECT_DOUBLE_EQ(p.fill_color.rgb[1], 0.4); + EXPECT_DOUBLE_EQ(p.fill_color.rgb[2], 0.6); +} + +// A Separation space samples its tint transform to RGB at `scn` time. +TEST(PdfPageExtractor, scn_separation_through_tint) { + // tint: type 2, C0 = white, C1 = red -> tint(t) = (1, 1-t, 1-t). + Dictionary tint; + tint["FunctionType"] = Object(Integer{2}); + tint["Domain"] = [] { + std::vector h{Object(Real{0}), Object(Real{1})}; + return Object(Array(std::move(h))); + }(); + tint["C0"] = [] { + std::vector h{Object(Real{1}), Object(Real{1}), Object(Real{1})}; + return Object(Array(std::move(h))); + }(); + tint["C1"] = [] { + std::vector h{Object(Real{1}), Object(Real{0}), Object(Real{0})}; + return Object(Array(std::move(h))); + }(); + tint["N"] = Object(Real{1}); + std::vector array{Object(Name{"Separation"}), Object(Name{"Spot"}), + Object(Name{"DeviceRGB"}), Object(tint)}; + ColorSpaceContext ctx; + ctx.resolve = [](const Object &o) { return o; }; + ctx.load_stream = [](const Object &) { return std::string{}; }; + ctx.named = nullptr; + + Resources res; + res.color_space["Sep"] = + parse_color_space(Object(Array(std::move(array))), ctx); + + const auto page = + extract_page("/Sep cs 1.0 scn 0 0 10 10 re f", res, Logger::null()); + ASSERT_EQ(page.size(), 1); + const PathElement &p = std::get(page[0]); + EXPECT_EQ(p.fill_color.space, ColorSpace::device_rgb); + EXPECT_DOUBLE_EQ(p.fill_color.rgb[0], 1.0); // full tint -> red + EXPECT_DOUBLE_EQ(p.fill_color.rgb[1], 0.0); + EXPECT_DOUBLE_EQ(p.fill_color.rgb[2], 0.0); +} + +// A device colour operator (`rg`) clears a previously set resource colour +// space, so a following device colour is not mis-resolved through it. +TEST(PdfPageExtractor, device_color_clears_color_space) { + Resources res; + res.color_space["CS0"] = rgb_space(); + + const auto page = extract_page("/CS0 cs 0.1 0.2 0.3 scn 1 0 0 rg " + "0 0 10 10 re f", + res, Logger::null()); + ASSERT_EQ(page.size(), 1); + const PathElement &p = std::get(page[0]); + EXPECT_EQ(p.fill_color.space, ColorSpace::device_rgb); + EXPECT_DOUBLE_EQ(p.fill_color.rgb[0], 1.0); + EXPECT_DOUBLE_EQ(p.fill_color.rgb[1], 0.0); +} + +// --- stage 4.3: clipping -------------------------------------------------- + +// A `W n` clip rect limits a later fill: the fill carries the clip region (the +// rect), nonzero rule, while the clip itself paints nothing. +TEST(PdfPageExtractor, clip_rect_limits_later_fill) { + const auto page = run_page("0 0 100 100 re W n 10 10 200 200 re f"); + ASSERT_EQ(page.size(), 1); // only the fill emits an element + const PathElement &p = path_at(page, 0); + EXPECT_TRUE(p.fill); + ASSERT_EQ(p.clip.size(), 1); + EXPECT_FALSE(p.clip[0].even_odd); + ASSERT_EQ(p.clip[0].subpaths.size(), 1); + EXPECT_DOUBLE_EQ(p.clip[0].subpaths[0].start[0], 0); + EXPECT_DOUBLE_EQ(p.clip[0].subpaths[0].segments[0].end[0], 100); // 0 + 100 +} + +// `W*` selects the even-odd clip rule. +TEST(PdfPageExtractor, clip_evenodd_rule) { + const auto page = run_page("0 0 10 10 re W* n 0 0 5 5 re f"); + ASSERT_EQ(page.size(), 1); + ASSERT_EQ(path_at(page, 0).clip.size(), 1); + EXPECT_TRUE(path_at(page, 0).clip[0].even_odd); +} + +// The painting operator that installs a clip is itself clipped only by the +// *previous* clip, not the one it establishes (ISO 32000-1 8.5.4). +TEST(PdfPageExtractor, clip_excludes_its_own_paint) { + // `re W f`: the fill paints under no clip; the rect becomes a clip + // afterwards. + const auto page = run_page("0 0 10 10 re W f 0 0 5 5 re f"); + ASSERT_EQ(page.size(), 2); + EXPECT_TRUE(path_at(page, 0).clip.empty()); // the clip-establishing fill + ASSERT_EQ(path_at(page, 1).clip.size(), 1); // the next fill is clipped +} + +// Nested clips intersect: a second `W n` adds a region, so a later fill carries +// both, in order. +TEST(PdfPageExtractor, clip_nested_intersect) { + const auto page = + run_page("0 0 100 100 re W n 10 10 50 50 re W n 20 20 10 10 re f"); + ASSERT_EQ(page.size(), 1); + ASSERT_EQ(path_at(page, 0).clip.size(), 2); +} + +// The clip is part of the saved/restored graphics state: a clip set inside +// `q`/`Q` does not leak to content after the `Q`. +TEST(PdfPageExtractor, clip_save_restore) { + const auto page = run_page("q 0 0 10 10 re W n 0 0 5 5 re f Q 0 0 5 5 re f"); + ASSERT_EQ(page.size(), 2); + EXPECT_EQ(path_at(page, 0).clip.size(), 1); // inside q/Q: clipped + EXPECT_TRUE(path_at(page, 1).clip.empty()); // after Q: clip gone +} + +// A form XObject's `/BBox` clips its content (ISO 32000-1 8.10.2), mapped +// through the form `/Matrix` + CTM; the clip is scoped to the form. +TEST(PdfPageExtractor, form_bbox_clips_content) { + XObject form = form_x_object("0 0 100 100 re f"); + form.bbox = std::array{10, 20, 30, 40}; + Resources res; + res.x_object["Fm0"] = &form; + + const auto page = extract_page("/Fm0 Do", res, Logger::null()); + ASSERT_EQ(page.size(), 1); + const PathElement &p = std::get(page[0]); + ASSERT_EQ(p.clip.size(), 1); + const Subpath &s = p.clip[0].subpaths[0]; + EXPECT_DOUBLE_EQ(s.start[0], 10); + EXPECT_DOUBLE_EQ(s.start[1], 20); + EXPECT_DOUBLE_EQ(s.segments[1].end[0], 30); // x1 + EXPECT_DOUBLE_EQ(s.segments[1].end[1], 40); // y1 +}