-
Notifications
You must be signed in to change notification settings - Fork 624
[client-v2] Binary String support #2887
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,169 @@ | ||
| package com.clickhouse.client.api.data_formats; | ||
|
|
||
| import java.nio.ByteBuffer; | ||
| import java.nio.charset.Charset; | ||
| import java.nio.charset.StandardCharsets; | ||
| import java.util.Objects; | ||
|
|
||
| /** | ||
| * Holder for ClickHouse {@code String} or {@code FixedString} values that preserves raw bytes | ||
| * to avoid lossy decoding and unnecessary allocations. | ||
| * <p> | ||
| * <b>This is a mutable structure and must be used with care.</b> To avoid copying, it does not | ||
| * duplicate the bytes it is given: the constructor wraps the supplied array/buffer instead of | ||
| * copying it, and {@link #toByteArray()} returns a direct reference to the backing array rather | ||
| * than a defensive copy. Consequently, mutating the source array, the array returned by | ||
| * {@link #toByteArray()}, or reading the same value concurrently while it is being modified will | ||
| * change the observed value. Callers that need an independent snapshot must copy the bytes | ||
| * themselves. | ||
| * <p> | ||
| * Backed by a {@link ByteBuffer} for a richer API and future off-heap memory support. Only heap | ||
| * buffers (with an accessible backing array) are supported today; constructing a value from a | ||
| * direct (off-heap) buffer is rejected. The decoded {@link String} produced by {@link #asString()} | ||
| * is cached. | ||
| */ | ||
| public class StringValue { | ||
|
|
||
| /** Charset used by {@link #asString()} and {@link #toString()} when no charset is provided. */ | ||
| public static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8; | ||
|
|
||
| private final ByteBuffer buffer; | ||
|
|
||
| private final Charset defaultCharset; | ||
|
|
||
| private volatile String cached; | ||
|
|
||
| /** | ||
| * Creates a value backed by the given bytes. The array is wrapped, not copied, so it must not be | ||
| * modified after being passed in. | ||
| * | ||
| * @param bytes raw value bytes (not null) | ||
| */ | ||
| public StringValue(byte[] bytes) { | ||
| this(bytes, DEFAULT_CHARSET); | ||
| } | ||
|
|
||
| /** | ||
| * Creates a value backed by the given bytes using the provided default charset. The array is wrapped, | ||
| * not copied, so it must not be modified after being passed in. | ||
| * | ||
| * @param bytes raw value bytes (not null) | ||
| * @param defaultCharset charset used by {@link #asString()} and {@link #toString()} (not null) | ||
| */ | ||
| public StringValue(byte[] bytes, Charset defaultCharset) { | ||
| this(ByteBuffer.wrap(bytes), defaultCharset); | ||
| } | ||
|
|
||
| /** | ||
| * Creates a value backed by the remaining content of the given buffer using the provided default charset. | ||
| * The buffer is referenced, not copied, so its content must not be modified afterwards. | ||
| * | ||
| * @param buffer backing heap buffer (not null); its remaining bytes define the value | ||
| * @param defaultCharset charset used by {@link #asString()} and {@link #toString()} (not null) | ||
| * @throws IllegalArgumentException if the buffer is a direct (off-heap) buffer with no accessible array | ||
| */ | ||
| public StringValue(ByteBuffer buffer, Charset defaultCharset) { | ||
| Objects.requireNonNull(buffer, "buffer cannot be null"); | ||
| Objects.requireNonNull(defaultCharset, "charset is required to convert buffer to String"); | ||
|
|
||
| if (!buffer.hasArray()) { | ||
| throw new IllegalArgumentException("Can work only with heap buffer."); | ||
| } | ||
|
|
||
| // Keep an independent view so external position/limit changes do not affect this value. | ||
| this.buffer = buffer.slice(); | ||
| this.defaultCharset = defaultCharset; | ||
| } | ||
|
|
||
| /** | ||
| * Returns a read-only view over the raw bytes of this value. The returned buffer is independent | ||
| * (its own position/limit) and shares no mutable state with this value. | ||
| * | ||
| * @return read-only buffer positioned at the first byte of the value | ||
| */ | ||
| public ByteBuffer asByteBuffer() { | ||
| return buffer.asReadOnlyBuffer(); | ||
| } | ||
|
|
||
| /** | ||
| * Returns a direct reference to the backing byte array of this value (no copy is made). | ||
| * <p> | ||
| * The returned array is the live backing storage: mutating it mutates this value, and any change | ||
| * to the underlying bytes is reflected here. Callers that need an independent, immutable snapshot | ||
| * must copy the result themselves. | ||
| * | ||
| * @return the backing array holding the value bytes | ||
| */ | ||
| public byte[] toByteArray() { | ||
| return buffer.array(); | ||
| } | ||
|
|
||
| /** | ||
| * @return number of bytes in this value | ||
| */ | ||
| public int size() { | ||
| return buffer.remaining(); | ||
| } | ||
|
|
||
| /** | ||
| * @return {@code true} if the value has no bytes | ||
| */ | ||
| public boolean isEmpty() { | ||
| return buffer.remaining() == 0; | ||
| } | ||
|
|
||
| /** | ||
| * Decodes the value using the default charset (UTF-8 unless another was provided at construction). | ||
| * The result is cached so repeated calls do not allocate a new string. | ||
| * | ||
| * @return decoded string | ||
| */ | ||
| public String asString() { | ||
| String s = cached; | ||
| if (s == null) { | ||
| s = decode(defaultCharset); | ||
| cached = s; | ||
| } | ||
| return s; | ||
| } | ||
|
|
||
| /** | ||
| * Decodes the value using the given charset. The result is cached only when the charset matches the | ||
| * default charset of this value. | ||
| * | ||
| * @param charset charset to decode with (not null) | ||
| * @return decoded string | ||
| */ | ||
| public String asString(Charset charset) { | ||
| Objects.requireNonNull(charset, "charset cannot be null"); | ||
| if (charset.equals(defaultCharset)) { | ||
| return asString(); | ||
| } | ||
| return decode(charset); | ||
| } | ||
|
|
||
| private String decode(Charset charset) { | ||
| return new String(buffer.array(), charset); | ||
| } | ||
|
|
||
| @Override | ||
| public String toString() { | ||
| return asString(); | ||
| } | ||
|
|
||
| @Override | ||
| public boolean equals(Object o) { | ||
| if (this == o) { | ||
| return true; | ||
| } | ||
| if (!(o instanceof StringValue)) { | ||
| return false; | ||
| } | ||
| return buffer.equals(((StringValue) o).buffer); | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() { | ||
| return buffer.hashCode(); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,6 +2,7 @@ | |
|
|
||
| import com.clickhouse.client.api.ClientException; | ||
| import com.clickhouse.client.api.DataTypeUtils; | ||
| import com.clickhouse.client.api.data_formats.StringValue; | ||
| import com.clickhouse.data.ClickHouseColumn; | ||
| import com.clickhouse.data.ClickHouseDataType; | ||
| import com.clickhouse.data.ClickHouseEnum; | ||
|
|
@@ -55,6 +56,8 @@ public class BinaryStreamReader { | |
|
|
||
| private final Class<?> arrayDefaultTypeHint; | ||
|
|
||
| private final boolean stringAsBinaryDefault; | ||
|
|
||
| private static final int SB_INIT_SIZE = 100; | ||
|
|
||
| private ClickHouseColumn lastDataColumn = null; | ||
|
|
@@ -69,7 +72,7 @@ public class BinaryStreamReader { | |
| * @param jsonAsString - use string to serialize/deserialize JSON columns | ||
| * @param typeHintMapping - what type use as hint if hint is not set or may not be known. | ||
| */ | ||
| BinaryStreamReader(InputStream input, TimeZone timeZone, Logger log, ByteBufferAllocator bufferAllocator, boolean jsonAsString, Map<ClickHouseDataType, Class<?>> typeHintMapping) { | ||
| public BinaryStreamReader(InputStream input, TimeZone timeZone, Logger log, ByteBufferAllocator bufferAllocator, boolean jsonAsString, Map<ClickHouseDataType, Class<?>> typeHintMapping) { | ||
| this.log = log == null ? NOPLogger.NOP_LOGGER : log; | ||
| this.timeZone = timeZone; | ||
| this.input = input; | ||
|
|
@@ -78,6 +81,20 @@ public class BinaryStreamReader { | |
|
|
||
| this.arrayDefaultTypeHint = typeHintMapping == null || | ||
| typeHintMapping.isEmpty()? NO_TYPE_HINT : typeHintMapping.get(ClickHouseDataType.Array); | ||
| this.stringAsBinaryDefault = typeHintMapping != null && | ||
| typeHintMapping.get(ClickHouseDataType.String) == StringValue.class; | ||
| } | ||
|
|
||
| private boolean readStringAsBinary(Class<?> typeHint) { | ||
| if (typeHint != null) { | ||
| if (typeHint == StringValue.class) { | ||
| return true; | ||
| } | ||
| if (typeHint == String.class) { | ||
| return false; | ||
| } | ||
| } | ||
| return stringAsBinaryDefault; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FixedString type hint ignoredMedium Severity
Reviewed by Cursor Bugbot for commit 3012f42. Configure here. |
||
| } | ||
|
|
||
| /** | ||
|
|
@@ -121,12 +138,18 @@ public <T> T readValue(ClickHouseColumn column, Class<?> typeHint) throws IOExce | |
| switch (dataType) { | ||
| // Primitives | ||
| case FixedString: { | ||
| if (readStringAsBinary(typeHint)) { | ||
| return (T) new StringValue(readStringBytes(input, precision)); | ||
| } | ||
| byte[] bytes = precision > STRING_BUFF.length ? | ||
| new byte[precision] : STRING_BUFF; | ||
| readNBytes(input, bytes, 0, precision); | ||
| return (T) new String(bytes, 0, precision, StandardCharsets.UTF_8); | ||
| } | ||
| case String: { | ||
| if (readStringAsBinary(typeHint)) { | ||
| return (T) readStringValue(); | ||
| } | ||
| return (T) readString(); | ||
| } | ||
| case Int8: | ||
|
|
@@ -1119,17 +1142,41 @@ public String readString() throws IOException { | |
| } | ||
|
|
||
| /** | ||
| * Reads a decimal value from input stream. | ||
| * Reads a string from the internal input stream preserving the raw bytes as a {@link StringValue}. | ||
| * Unlike {@link #readString()} this does not decode bytes into a {@link String} and never reuses the | ||
| * shared buffer, so the value is safe to keep after the next read. | ||
| * | ||
| * @return string value holding the raw bytes | ||
| * @throws IOException when IO error occurs | ||
| */ | ||
| public StringValue readStringValue() throws IOException { | ||
| return new StringValue(readStringBytes(input, readVarInt(input))); | ||
| } | ||
|
|
||
| /** | ||
| * Reads the raw bytes of a string from the input stream given its length. | ||
| * | ||
| * @param input - source of bytes | ||
| * @return String | ||
| * @param len - number of bytes to read | ||
| * @return byte[] containing the raw string bytes | ||
| * @throws IOException when IO error occurs | ||
| */ | ||
| public static String readString(InputStream input) throws IOException { | ||
| int len = readVarInt(input); | ||
| public static byte[] readStringBytes(InputStream input, int len) throws IOException { | ||
| if (len == 0) { | ||
| return ""; | ||
| return new byte[0]; | ||
| } | ||
| return new String(readNBytes(input, len), StandardCharsets.UTF_8); | ||
| return readNBytes(input, len); | ||
| } | ||
|
|
||
| /** | ||
| * Reads a string value from input stream. | ||
| * @param input - source of bytes | ||
| * @return String | ||
| * @throws IOException when IO error occurs | ||
| */ | ||
| public static String readString(InputStream input) throws IOException { | ||
| byte[] bytes = readStringBytes(input, readVarInt(input)); | ||
| return bytes.length == 0 ? "" : new String(bytes, StandardCharsets.UTF_8); | ||
| } | ||
|
|
||
| public static int readByteOrEOF(InputStream input) throws IOException { | ||
|
|
||


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
StringValue ignores buffer slice
Medium Severity
For values built from a sliced heap
ByteBuffer,toByteArray()anddecode()usebuffer.array()withoutarrayOffset,position, orremaining(), so they expose or decode the whole backing array instead of the documented remaining bytes.size()stays correct while bytes and strings disagree.Reviewed by Cursor Bugbot for commit 3012f42. Configure here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
cannot understand your explanation. There is no problem definition. is it a documentation issue ?