diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index b52f72d6..fa4666cc 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -53,7 +53,8 @@ jobs: libcurl4-openssl-dev \ pkg-config \ libsasl2-dev \ - protobuf-compiler + protobuf-compiler \ + libprotobuf-dev - name: Cache Cargo uses: Swatinem/rust-cache@v2 diff --git a/.github/workflows/verify-package.yml b/.github/workflows/verify-package.yml index 085ac61d..bd7dfd51 100644 --- a/.github/workflows/verify-package.yml +++ b/.github/workflows/verify-package.yml @@ -52,7 +52,8 @@ jobs: libcurl4-openssl-dev \ pkg-config \ libsasl2-dev \ - protobuf-compiler + protobuf-compiler \ + libprotobuf-dev - name: Cache Cargo uses: Swatinem/rust-cache@v2 diff --git a/Cargo.lock b/Cargo.lock index b8edca1b..18b01c0d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -132,6 +132,30 @@ version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +[[package]] +name = "apache-avro" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36fa98bc79671c7981272d91a8753a928ff6a1cd8e4f20a44c45bd5d313840bf" +dependencies = [ + "bigdecimal", + "bon", + "digest", + "log", + "miniz_oxide", + "num-bigint", + "quad-rand", + "rand 0.9.2", + "regex-lite", + "serde", + "serde_bytes", + "serde_json", + "strum 0.27.2", + "strum_macros 0.27.2", + "thiserror 2.0.18", + "uuid", +] + [[package]] name = "ar_archive_writer" version = "0.5.1" @@ -147,6 +171,15 @@ version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +[[package]] +name = "arc-swap" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207" +dependencies = [ + 
"rustversion", +] + [[package]] name = "arrayref" version = "0.3.9" @@ -165,19 +198,40 @@ version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3f15b4c6b148206ff3a2b35002e08929c2462467b62b9c02036d9c34f9ef994" dependencies = [ - "arrow-arith", + "arrow-arith 55.2.0", "arrow-array 55.2.0", "arrow-buffer 55.2.0", "arrow-cast 55.2.0", - "arrow-csv", + "arrow-csv 55.2.0", "arrow-data 55.2.0", "arrow-ipc 55.2.0", - "arrow-json", - "arrow-ord", - "arrow-row", + "arrow-json 55.2.0", + "arrow-ord 55.2.0", + "arrow-row 55.2.0", "arrow-schema 55.2.0", "arrow-select 55.2.0", - "arrow-string", + "arrow-string 55.2.0", +] + +[[package]] +name = "arrow" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4754a624e5ae42081f464514be454b39711daae0458906dacde5f4c632f33a8" +dependencies = [ + "arrow-arith 57.3.0", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-cast 57.3.0", + "arrow-csv 57.3.0", + "arrow-data 57.3.0", + "arrow-ipc 57.3.0", + "arrow-json 57.3.0", + "arrow-ord 57.3.0", + "arrow-row 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "arrow-string 57.3.0", ] [[package]] @@ -194,6 +248,20 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-arith" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "chrono", + "num-traits", +] + [[package]] name = "arrow-array" version = "52.2.0" @@ -227,6 +295,25 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-array" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef" +dependencies = [ + "ahash", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "chrono", 
+ "chrono-tz", + "half", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", +] + [[package]] name = "arrow-buffer" version = "52.2.0" @@ -249,6 +336,18 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-buffer" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2" +dependencies = [ + "bytes", + "half", + "num-bigint", + "num-traits", +] + [[package]] name = "arrow-cast" version = "52.2.0" @@ -290,6 +389,28 @@ dependencies = [ "ryu", ] +[[package]] +name = "arrow-cast" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-ord 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core 1.0.6", + "num-traits", + "ryu", +] + [[package]] name = "arrow-csv" version = "55.2.0" @@ -305,6 +426,21 @@ dependencies = [ "regex", ] +[[package]] +name = "arrow-csv" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da746f4180004e3ce7b83c977daf6394d768332349d3d913998b10a120b790a" +dependencies = [ + "arrow-array 57.3.0", + "arrow-cast 57.3.0", + "arrow-schema 57.3.0", + "chrono", + "csv", + "csv-core", + "regex", +] + [[package]] name = "arrow-data" version = "52.2.0" @@ -329,6 +465,19 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-data" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304" +dependencies = [ + "arrow-buffer 57.3.0", + "arrow-schema 57.3.0", + "half", + "num-integer", + "num-traits", +] + [[package]] name = "arrow-ipc" version = "52.2.0" @@ -354,7 +503,23 @@ dependencies = [ 
"arrow-data 55.2.0", "arrow-schema 55.2.0", "flatbuffers 25.12.19", - "lz4_flex", + "lz4_flex 0.11.6", +] + +[[package]] +name = "arrow-ipc" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "flatbuffers 25.12.19", + "lz4_flex 0.12.1", + "zstd", ] [[package]] @@ -379,6 +544,30 @@ dependencies = [ "simdutf8", ] +[[package]] +name = "arrow-json" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ff8357658bedc49792b13e2e862b80df908171275f8e6e075c460da5ee4bf86" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-cast 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "chrono", + "half", + "indexmap 2.13.1", + "itoa", + "lexical-core 1.0.6", + "memchr", + "num-traits", + "ryu", + "serde_core", + "serde_json", + "simdutf8", +] + [[package]] name = "arrow-ord" version = "55.2.0" @@ -392,6 +581,19 @@ dependencies = [ "arrow-select 55.2.0", ] +[[package]] +name = "arrow-ord" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", +] + [[package]] name = "arrow-row" version = "55.2.0" @@ -405,6 +607,19 @@ dependencies = [ "half", ] +[[package]] +name = "arrow-row" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18228633bad92bff92a95746bbeb16e5fc318e8382b75619dec26db79e4de4c0" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "half", +] + [[package]] name = "arrow-schema" version = 
"52.2.0" @@ -421,6 +636,17 @@ dependencies = [ "serde_json", ] +[[package]] +name = "arrow-schema" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68" +dependencies = [ + "bitflags 2.11.0", + "serde_core", + "serde_json", +] + [[package]] name = "arrow-select" version = "52.2.0" @@ -449,6 +675,20 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-select" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b" +dependencies = [ + "ahash", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "num-traits", +] + [[package]] name = "arrow-string" version = "55.2.0" @@ -466,6 +706,35 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "arrow-string" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "memchr", + "num-traits", + "regex", + "regex-syntax", +] + +[[package]] +name = "async-channel" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "924ed96dd52d1b75e9c1a3e6275715fd320f5f9439fb5a4a11fa51f4221158d2" +dependencies = [ + "concurrent-queue", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + [[package]] name = "async-compression" version = "0.4.19" @@ -483,6 +752,28 @@ dependencies = [ "zstd-safe", ] +[[package]] +name = "async-lock" +version = "3.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f7f2596bd5b78a9fec8088ccd89180d7f9f55b94b0576823bbbdc72ee8311" +dependencies = [ + "event-listener", + "event-listener-strategy", + 
"pin-project-lite", +] + +[[package]] +name = "async-recursion" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "async-stream" version = "0.3.6" @@ -502,7 +793,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -513,7 +804,16 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "async_cell" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "447ab28afbb345f5408b120702a44e5529ebf90b1796ec76e9528df8e288e6c2" +dependencies = [ + "loom", ] [[package]] @@ -538,93 +838,471 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] -name = "axum" -version = "0.7.9" +name = "aws-config" +version = "1.8.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +checksum = "8a8fc176d53d6fe85017f230405e3255cedb4a02221cb55ed6d76dccbbb099b2" dependencies = [ - "async-trait", - "axum-core", + "aws-credential-types", + "aws-runtime", + "aws-sdk-sso", + "aws-sdk-ssooidc", + "aws-sdk-sts", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", "bytes", - "futures-util", - "http", - "http-body", - "http-body-util", - "itoa", - "matchit", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "rustversion", - "serde", - "sync_wrapper", - "tower 0.5.3", - "tower-layer", - "tower-service", + "fastrand", + "hex", + 
"http 1.4.0", + "ring", + "time", + "tokio", + "tracing", + "url", + "zeroize", ] [[package]] -name = "axum-core" -version = "0.4.5" +name = "aws-credential-types" +version = "1.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +checksum = "6d203b0bf2626dcba8665f5cd0871d7c2c0930223d6b6be9097592fea21242d0" dependencies = [ - "async-trait", - "bytes", - "futures-util", - "http", - "http-body", - "http-body-util", - "mime", - "pin-project-lite", - "rustversion", - "sync_wrapper", - "tower-layer", - "tower-service", + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "zeroize", ] [[package]] -name = "base64" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" - -[[package]] -name = "bigdecimal" -version = "0.4.10" +name = "aws-lc-rs" +version = "1.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" +checksum = "0ec6fb3fe69024a75fa7e1bfb48aa6cf59706a101658ea01bfd33b2b248a038f" dependencies = [ - "autocfg", - "libm", - "num-bigint", - "num-integer", - "num-traits", + "aws-lc-sys", + "zeroize", ] [[package]] -name = "bincode" -version = "2.0.1" +name = "aws-lc-sys" +version = "0.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +checksum = "f50037ee5e1e41e7b8f9d161680a725bd1626cb6f8c7e901f91f942850852fe7" dependencies = [ - "bincode_derive", - "serde", - "unty", + "cc", + "cmake", + "dunce", + "fs_extra", ] [[package]] -name = "bincode_derive" -version = "2.0.1" +name = "aws-runtime" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" 
+checksum = "ede2ddc593e6c8acc6ce3358c28d6677a6dc49b65ba4b37a2befe14a11297e75" dependencies = [ - "virtue", + "aws-credential-types", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "bytes-utils", + "fastrand", + "http 1.4.0", + "http-body 1.0.1", + "percent-encoding", + "pin-project-lite", + "tracing", + "uuid", ] [[package]] -name = "bindgen" +name = "aws-sdk-sso" +version = "1.95.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00c5ff27c6ba2cbd95e6e26e2e736676fdf6bcf96495b187733f521cfe4ce448" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-ssooidc" +version = "1.97.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d186f1e5a3694a188e5a0640b3115ccc6e084d104e16fd6ba968dca072ffef8" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-sts" +version = "1.99.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9acba7c62f3d4e2408fa998a3a8caacd8b9a5b5549cf36e2372fbdae329d5449" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + 
"aws-types", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sigv4" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37411f8e0f4bea0c3ca0958ce7f18f6439db24d555dbd809787262cd00926aa9" +dependencies = [ + "aws-credential-types", + "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "form_urlencoded", + "hex", + "hmac", + "http 0.2.12", + "http 1.4.0", + "percent-encoding", + "sha2", + "time", + "tracing", +] + +[[package]] +name = "aws-smithy-async" +version = "1.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc50d0f63e714784b84223abd7abbc8577de8c35d699e0edd19f0a88a08ae13" +dependencies = [ + "futures-util", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "aws-smithy-http" +version = "0.63.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d619373d490ad70966994801bc126846afaa0d1ee920697a031f0cf63f2568e7" +dependencies = [ + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "bytes-utils", + "futures-core", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "percent-encoding", + "pin-project-lite", + "pin-utils", + "tracing", +] + +[[package]] +name = "aws-smithy-http-client" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00ccbb08c10f6bcf912f398188e42ee2eab5f1767ce215a02a73bc5df1bbdd95" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "h2", + "http 1.4.0", + "hyper", + "hyper-rustls", + "hyper-util", + "pin-project-lite", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower 0.5.3", + "tracing", +] + +[[package]] +name = "aws-smithy-json" +version = "0.62.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"27b3a779093e18cad88bbae08dc4261e1d95018c4c5b9356a52bcae7c0b6e9bb" +dependencies = [ + "aws-smithy-types", +] + +[[package]] +name = "aws-smithy-observability" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d3f39d5bb871aaf461d59144557f16d5927a5248a983a40654d9cf3b9ba183b" +dependencies = [ + "aws-smithy-runtime-api", +] + +[[package]] +name = "aws-smithy-query" +version = "0.60.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f76a580e3d8f8961e5d48763214025a2af65c2fa4cd1fb7f270a0e107a71b0" +dependencies = [ + "aws-smithy-types", + "urlencoding", +] + +[[package]] +name = "aws-smithy-runtime" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ccf7f6eba8b2dcf8ce9b74806c6c185659c311665c4bf8d6e71ebd454db6bf" +dependencies = [ + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-http-client", + "aws-smithy-observability", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "pin-project-lite", + "pin-utils", + "tokio", + "tracing", +] + +[[package]] +name = "aws-smithy-runtime-api" +version = "1.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4af6e5def28be846479bbeac55aa4603d6f7986fc5da4601ba324dd5d377516" +dependencies = [ + "aws-smithy-async", + "aws-smithy-types", + "bytes", + "http 0.2.12", + "http 1.4.0", + "pin-project-lite", + "tokio", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-types" +version = "1.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ca2734c16913a45343b37313605d84e7d8b34a4611598ce1d25b35860a2bed3" +dependencies = [ + "base64-simd", + "bytes", + "bytes-utils", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "itoa", + "num-integer", + "pin-project-lite", + 
"pin-utils", + "ryu", + "serde", + "time", +] + +[[package]] +name = "aws-smithy-xml" +version = "0.60.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b53543b4b86ed43f051644f704a98c7291b3618b67adf057ee77a366fa52fcaa" +dependencies = [ + "xmlparser", +] + +[[package]] +name = "aws-types" +version = "1.3.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0470cc047657c6e286346bdf10a8719d26efd6a91626992e0e64481e44323e96" +dependencies = [ + "aws-credential-types", + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "rustc_version", + "tracing", +] + +[[package]] +name = "axum" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +dependencies = [ + "async-trait", + "axum-core", + "bytes", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper", + "tower 0.5.3", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper", + "tower-layer", + "tower-service", +] + +[[package]] +name = "backon" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef" +dependencies = [ + "fastrand", + "gloo-timers", + "tokio", +] + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "base64-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" +dependencies = [ + "outref", + "vsimd", +] + +[[package]] +name = "bigdecimal" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", + "serde", +] + +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + +[[package]] +name = "bindgen" version = "0.65.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" @@ -641,7 +1319,7 @@ dependencies = [ "regex", "rustc-hash 1.1.0", "shlex", - "syn", + "syn 2.0.117", ] [[package]] @@ -659,7 +1337,7 @@ dependencies = [ "regex", "rustc-hash 2.1.2", "shlex", - "syn", + "syn 2.0.117", ] [[package]] @@ -683,6 +1361,27 @@ dependencies = [ "typenum", ] +[[package]] +name = "bitpacking" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96a7139abd3d9cebf8cd6f920a389cf3dc9576172e32f4563f188cae3c3eb019" +dependencies = [ + "crunchy", +] + +[[package]] +name = "bitvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + [[package]] name = "blake2" version = "0.10.6" @@ -715,6 +1414,31 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bon" +version = "3.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f47dbe92550676ee653353c310dfb9cf6ba17ee70396e1f7cf0a2020ad49b2fe" +dependencies = [ + "bon-macros", + "rustversion", +] + +[[package]] +name = "bon-macros" +version = "3.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c" +dependencies = [ + "darling", + "ident_case", + "prettyplease", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.117", +] + [[package]] name = "brotli" version = "8.0.2" @@ -745,6 +1469,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + [[package]] name = "byteorder" version = "1.5.0" @@ -757,6 +1487,16 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +[[package]] +name = "bytes-utils" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" +dependencies = [ + "bytes", + "either", +] + [[package]] name = "bzip2" version = "0.5.2" @@ -866,13 +1606,19 @@ dependencies = [ "shlex", ] +[[package]] +name = "census" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" + [[package]] name = "cexpr" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" dependencies = [ - "nom", + "nom 7.1.3", ] [[package]] @@ -887,6 +1633,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "chrono" version = "0.4.44" @@ -896,6 +1648,7 @@ dependencies = [ "iana-time-zone", "js-sys", "num-traits", + "serde", "wasm-bindgen", "windows-link", ] @@ -952,7 +1705,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1005,6 +1758,21 @@ dependencies = [ "unicode-width 0.2.2", ] +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + [[package]] name = "const-random" version = "0.1.18" @@ -1031,6 +1799,16 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" +[[package]] +name = "core-foundation" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -1205,6 +1983,15 @@ version = "0.128.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"903adeaf4938e60209a97b53a2e4326cd2d356aab9764a1934630204bae381c9" +[[package]] +name = "crc32c" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" +dependencies = [ + "rustc_version", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -1237,8 +2024,27 @@ dependencies = [ name = "crossbeam-epoch" version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-skiplist" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df29de440c58ca2cc6e587ec3d22347551a32435fbde9d2bff64e78a9ffa151b" dependencies = [ + "crossbeam-epoch", "crossbeam-utils", ] @@ -1323,6 +2129,40 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "darling" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" +dependencies = [ + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.117", +] + +[[package]] +name = "darling_macro" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.117", +] + [[package]] name = "dashmap" version = "6.1.0" @@ -1342,36 +2182,36 @@ name = "datafusion" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + "arrow 55.2.0", "arrow-ipc 55.2.0", "arrow-schema 55.2.0", "async-trait", "bytes", "bzip2", "chrono", - "datafusion-catalog", - "datafusion-catalog-listing", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-datasource-csv", - "datafusion-datasource-json", + "datafusion-catalog 48.0.1", + "datafusion-catalog-listing 48.0.1", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-datasource 48.0.1", + "datafusion-datasource-csv 48.0.1", + "datafusion-datasource-json 48.0.1", "datafusion-datasource-parquet", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-functions", - "datafusion-functions-aggregate", - "datafusion-functions-nested", - "datafusion-functions-table", - "datafusion-functions-window", - "datafusion-optimizer", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-optimizer", - "datafusion-physical-plan", - "datafusion-session", - "datafusion-sql", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-expr-common 48.0.1", + "datafusion-functions 48.0.1", + "datafusion-functions-aggregate 48.0.1", + "datafusion-functions-nested 48.0.1", + "datafusion-functions-table 48.0.1", + "datafusion-functions-window 48.0.1", + "datafusion-optimizer 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "datafusion-physical-optimizer 48.0.1", + "datafusion-physical-plan 48.0.1", + "datafusion-session 48.0.1", + "datafusion-sql 48.0.1", "flate2", "futures", "itertools 0.14.0", 
@@ -1381,7 +2221,7 @@ dependencies = [ "parquet", "rand 0.9.2", "regex", - "sqlparser", + "sqlparser 0.55.0", "tempfile", "tokio", "url", @@ -1390,23 +2230,97 @@ dependencies = [ "zstd", ] +[[package]] +name = "datafusion" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7541353e77dc7262b71ca27be07d8393661737e3a73b5d1b1c6f7d814c64fa2a" +dependencies = [ + "arrow 57.3.0", + "arrow-schema 57.3.0", + "async-trait", + "bytes", + "chrono", + "datafusion-catalog 52.5.0", + "datafusion-catalog-listing 52.5.0", + "datafusion-common 52.5.0", + "datafusion-common-runtime 52.5.0", + "datafusion-datasource 52.5.0", + "datafusion-datasource-arrow", + "datafusion-datasource-csv 52.5.0", + "datafusion-datasource-json 52.5.0", + "datafusion-execution 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-expr-common 52.5.0", + "datafusion-functions 52.5.0", + "datafusion-functions-aggregate 52.5.0", + "datafusion-functions-nested 52.5.0", + "datafusion-functions-table 52.5.0", + "datafusion-functions-window 52.5.0", + "datafusion-optimizer 52.5.0", + "datafusion-physical-expr 52.5.0", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common 52.5.0", + "datafusion-physical-optimizer 52.5.0", + "datafusion-physical-plan 52.5.0", + "datafusion-session 52.5.0", + "datafusion-sql 52.5.0", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "rand 0.9.2", + "regex", + "sqlparser 0.59.0", + "tempfile", + "tokio", + "url", + "uuid", +] + [[package]] name = "datafusion-catalog" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + "arrow 55.2.0", + "async-trait", + "dashmap", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-datasource 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", + 
"datafusion-physical-plan 48.0.1", + "datafusion-session 48.0.1", + "datafusion-sql 48.0.1", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-catalog" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9997731f90fa5398ef831ad0e69600f92c861b79c0d38bd1a29b6f0e3a0ce4c8" +dependencies = [ + "arrow 57.3.0", "async-trait", "dashmap", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-plan", - "datafusion-session", - "datafusion-sql", + "datafusion-common 52.5.0", + "datafusion-common-runtime 52.5.0", + "datafusion-datasource 52.5.0", + "datafusion-execution 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-physical-expr 52.5.0", + "datafusion-physical-plan 52.5.0", + "datafusion-session 52.5.0", "futures", "itertools 0.14.0", "log", @@ -1420,30 +2334,53 @@ name = "datafusion-catalog-listing" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + "arrow 55.2.0", "async-trait", - "datafusion-catalog", - "datafusion-common", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", + "datafusion-catalog 48.0.1", + "datafusion-common 48.0.1", + "datafusion-datasource 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "datafusion-physical-plan 48.0.1", + "datafusion-session 48.0.1", "futures", "log", "object_store", "tokio", ] +[[package]] +name = "datafusion-catalog-listing" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2b30a3dd50dec860c9559275c8d97d9de602e611237a6ecfbda0b3b63b872352" +dependencies = [ + "arrow 57.3.0", + "async-trait", + "datafusion-catalog 52.5.0", + "datafusion-common 52.5.0", + "datafusion-datasource 52.5.0", + "datafusion-execution 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-physical-expr 52.5.0", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common 52.5.0", + "datafusion-physical-plan 52.5.0", + "futures", + "itertools 0.14.0", + "log", + "object_store", +] + [[package]] name = "datafusion-common" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ "ahash", - "arrow", + "arrow 55.2.0", "arrow-ipc 55.2.0", "base64", "half", @@ -1455,7 +2392,29 @@ dependencies = [ "parquet", "paste", "recursive", - "sqlparser", + "sqlparser 0.55.0", + "tokio", + "web-time", +] + +[[package]] +name = "datafusion-common" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d551054acec0398ca604512310b77ce05c46f66e54b54d48200a686e385cca4e" +dependencies = [ + "ahash", + "arrow 57.3.0", + "arrow-ipc 57.3.0", + "chrono", + "half", + "hashbrown 0.16.1", + "indexmap 2.13.1", + "libc", + "log", + "object_store", + "paste", + "sqlparser 0.59.0", "tokio", "web-time", ] @@ -1470,25 +2429,36 @@ dependencies = [ "tokio", ] +[[package]] +name = "datafusion-common-runtime" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567d40e285f5b79f8737b576605721cd6c1133b5d2b00bdbd5d9838d90d0812f" +dependencies = [ + "futures", + "log", + "tokio", +] + [[package]] name = "datafusion-datasource" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + "arrow 55.2.0", "async-compression", "async-trait", "bytes", "bzip2", "chrono", - "datafusion-common", - 
"datafusion-common-runtime", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "datafusion-physical-plan 48.0.1", + "datafusion-session 48.0.1", "flate2", "futures", "glob", @@ -1505,24 +2475,100 @@ dependencies = [ "zstd", ] +[[package]] +name = "datafusion-datasource" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27d2668f51b3b30befae2207472569e37807fdedd1d14da58acc6f8ca6257eae" +dependencies = [ + "arrow 57.3.0", + "async-trait", + "bytes", + "chrono", + "datafusion-common 52.5.0", + "datafusion-common-runtime 52.5.0", + "datafusion-execution 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-physical-expr 52.5.0", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common 52.5.0", + "datafusion-physical-plan 52.5.0", + "datafusion-session 52.5.0", + "futures", + "glob", + "itertools 0.14.0", + "log", + "object_store", + "rand 0.9.2", + "tokio", + "url", +] + +[[package]] +name = "datafusion-datasource-arrow" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e02e1b3e3a8ec55f1f62de4252b0407c8567363d056078769a197e24fc834a0f" +dependencies = [ + "arrow 57.3.0", + "arrow-ipc 57.3.0", + "async-trait", + "bytes", + "datafusion-common 52.5.0", + "datafusion-common-runtime 52.5.0", + "datafusion-datasource 52.5.0", + "datafusion-execution 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-physical-expr-common 52.5.0", + "datafusion-physical-plan 52.5.0", + "datafusion-session 52.5.0", + "futures", + "itertools 0.14.0", + "object_store", + "tokio", +] + [[package]] name = "datafusion-datasource-csv" version = "48.0.1" source = 
"git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + "arrow 55.2.0", + "async-trait", + "bytes", + "datafusion-catalog 48.0.1", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-datasource 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "datafusion-physical-plan 48.0.1", + "datafusion-session 48.0.1", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b559d7bf87d4f900f847baba8509634f838d9718695389e903604cdcccdb01f3" +dependencies = [ + "arrow 57.3.0", "async-trait", "bytes", - "datafusion-catalog", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", + "datafusion-common 52.5.0", + "datafusion-common-runtime 52.5.0", + "datafusion-datasource 52.5.0", + "datafusion-execution 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-physical-expr-common 52.5.0", + "datafusion-physical-plan 52.5.0", + "datafusion-session 52.5.0", "futures", "object_store", "regex", @@ -1534,45 +2580,67 @@ name = "datafusion-datasource-json" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + "arrow 55.2.0", "async-trait", "bytes", - "datafusion-catalog", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", + 
"datafusion-catalog 48.0.1", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-datasource 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "datafusion-physical-plan 48.0.1", + "datafusion-session 48.0.1", "futures", "object_store", "serde_json", "tokio", ] +[[package]] +name = "datafusion-datasource-json" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "250e2d7591ba8b638f063854650faa40bca4e8bd4059b2ece8836f6388d02db4" +dependencies = [ + "arrow 57.3.0", + "async-trait", + "bytes", + "datafusion-common 52.5.0", + "datafusion-common-runtime 52.5.0", + "datafusion-datasource 52.5.0", + "datafusion-execution 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-physical-expr-common 52.5.0", + "datafusion-physical-plan 52.5.0", + "datafusion-session 52.5.0", + "futures", + "object_store", + "tokio", +] + [[package]] name = "datafusion-datasource-parquet" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + "arrow 55.2.0", "async-trait", "bytes", - "datafusion-catalog", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-optimizer", - "datafusion-physical-plan", - "datafusion-session", + "datafusion-catalog 48.0.1", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-datasource 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-functions-aggregate 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "datafusion-physical-optimizer 48.0.1", + "datafusion-physical-plan 48.0.1", + 
"datafusion-session 48.0.1", "futures", "itertools 0.14.0", "log", @@ -1588,15 +2656,42 @@ name = "datafusion-doc" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +[[package]] +name = "datafusion-doc" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9496cb0db222dbb9a3735760ceca7fc56f35e1d5502c38d0caa77a81e9c1f6a" + [[package]] name = "datafusion-execution" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + "arrow 55.2.0", + "dashmap", + "datafusion-common 48.0.1", + "datafusion-expr 48.0.1", + "futures", + "log", + "object_store", + "parking_lot", + "rand 0.9.2", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-execution" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc45d23c516ed8d3637751e44e09e21b45b3f58b473c802dddd1f1ad4fe435ff" +dependencies = [ + "arrow 57.3.0", + "async-trait", + "chrono", "dashmap", - "datafusion-common", - "datafusion-expr", + "datafusion-common 52.5.0", + "datafusion-expr 52.5.0", "futures", "log", "object_store", @@ -1611,19 +2706,41 @@ name = "datafusion-expr" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + "arrow 55.2.0", "chrono", - "datafusion-common", - "datafusion-doc", - "datafusion-expr-common", - "datafusion-functions-aggregate-common", - "datafusion-functions-window-common", - "datafusion-physical-expr-common", + "datafusion-common 48.0.1", + "datafusion-doc 48.0.1", + "datafusion-expr-common 48.0.1", + "datafusion-functions-aggregate-common 48.0.1", + "datafusion-functions-window-common 48.0.1", + "datafusion-physical-expr-common 48.0.1", "indexmap 2.13.1", "paste", "recursive", 
"serde_json", - "sqlparser", + "sqlparser 0.55.0", +] + +[[package]] +name = "datafusion-expr" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63dd30526d2db4fda6440806a41e4676334a94bc0596cc9cc2a0efed20ef2c44" +dependencies = [ + "arrow 57.3.0", + "async-trait", + "chrono", + "datafusion-common 52.5.0", + "datafusion-doc 52.5.0", + "datafusion-expr-common 52.5.0", + "datafusion-functions-aggregate-common 52.5.0", + "datafusion-functions-window-common 52.5.0", + "datafusion-physical-expr-common 52.5.0", + "indexmap 2.13.1", + "itertools 0.14.0", + "paste", + "serde_json", + "sqlparser 0.59.0", ] [[package]] @@ -1631,8 +2748,21 @@ name = "datafusion-expr-common" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", - "datafusion-common", + "arrow 55.2.0", + "datafusion-common 48.0.1", + "indexmap 2.13.1", + "itertools 0.14.0", + "paste", +] + +[[package]] +name = "datafusion-expr-common" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b486b5f6255d40976b88bb83813b0d035a8333e0ec39864824e78068cf42fa6" +dependencies = [ + "arrow 57.3.0", + "datafusion-common 52.5.0", "indexmap 2.13.1", "itertools 0.14.0", "paste", @@ -1643,22 +2773,53 @@ name = "datafusion-functions" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + "arrow 55.2.0", "arrow-buffer 55.2.0", "base64", "blake2", "blake3", "chrono", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-macros", + "datafusion-common 48.0.1", + "datafusion-doc 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-expr-common 48.0.1", + "datafusion-macros 48.0.1", + "hex", + "itertools 0.14.0", + 
"log", + "md-5", + "rand 0.9.2", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07356c94118d881130dd0ffbff127540407d969c8978736e324edcd6c41cd48f" +dependencies = [ + "arrow 57.3.0", + "arrow-buffer 57.3.0", + "base64", + "blake2", + "blake3", + "chrono", + "chrono-tz", + "datafusion-common 52.5.0", + "datafusion-doc 52.5.0", + "datafusion-execution 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-expr-common 52.5.0", + "datafusion-macros 52.5.0", "hex", "itertools 0.14.0", "log", "md-5", + "num-traits", "rand 0.9.2", "regex", "sha2", @@ -1672,15 +2833,36 @@ version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ "ahash", - "arrow", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-macros", - "datafusion-physical-expr", - "datafusion-physical-expr-common", + "arrow 55.2.0", + "datafusion-common 48.0.1", + "datafusion-doc 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-functions-aggregate-common 48.0.1", + "datafusion-macros 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "half", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b644f9cf696df9233ce6958b9807666d78563b56f923267474dd6c07795f1f8f" +dependencies = [ + "ahash", + "arrow 57.3.0", + "datafusion-common 52.5.0", + "datafusion-doc 52.5.0", + "datafusion-execution 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-functions-aggregate-common 52.5.0", + "datafusion-macros 52.5.0", + "datafusion-physical-expr 52.5.0", + "datafusion-physical-expr-common 52.5.0", 
"half", "log", "paste", @@ -1692,10 +2874,23 @@ version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ "ahash", - "arrow", - "datafusion-common", - "datafusion-expr-common", - "datafusion-physical-expr-common", + "arrow 55.2.0", + "datafusion-common 48.0.1", + "datafusion-expr-common 48.0.1", + "datafusion-physical-expr-common 48.0.1", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1de2deaaabe8923ce9ea9f29c47bbb4ee14f67ea2fe1ab5398d9bbebcf86e56" +dependencies = [ + "ahash", + "arrow 57.3.0", + "datafusion-common 52.5.0", + "datafusion-expr-common 52.5.0", + "datafusion-physical-expr-common 52.5.0", ] [[package]] @@ -1703,16 +2898,39 @@ name = "datafusion-functions-nested" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", - "arrow-ord", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions", - "datafusion-functions-aggregate", - "datafusion-macros", - "datafusion-physical-expr-common", + "arrow 55.2.0", + "arrow-ord 55.2.0", + "datafusion-common 48.0.1", + "datafusion-doc 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-functions 48.0.1", + "datafusion-functions-aggregate 48.0.1", + "datafusion-macros 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "itertools 0.14.0", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-nested" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "552f8d92e4331ee91d23c02d12bb6acf32cbfd5215117e01c0fb63cd4b15af1a" +dependencies = [ + "arrow 57.3.0", + "arrow-ord 57.3.0", + "datafusion-common 52.5.0", + "datafusion-doc 52.5.0", + 
"datafusion-execution 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-expr-common 52.5.0", + "datafusion-functions 52.5.0", + "datafusion-functions-aggregate 52.5.0", + "datafusion-functions-aggregate-common 52.5.0", + "datafusion-macros 52.5.0", + "datafusion-physical-expr-common 52.5.0", "itertools 0.14.0", "log", "paste", @@ -1723,12 +2941,28 @@ name = "datafusion-functions-table" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + "arrow 55.2.0", + "async-trait", + "datafusion-catalog 48.0.1", + "datafusion-common 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-plan 48.0.1", + "parking_lot", + "paste", +] + +[[package]] +name = "datafusion-functions-table" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "970fd0cdd3df8802b9a9975ff600998289ba9d46682a4f7285cba4820c9ada78" +dependencies = [ + "arrow 57.3.0", "async-trait", - "datafusion-catalog", - "datafusion-common", - "datafusion-expr", - "datafusion-physical-plan", + "datafusion-catalog 52.5.0", + "datafusion-common 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-physical-plan 52.5.0", "parking_lot", "paste", ] @@ -1738,14 +2972,32 @@ name = "datafusion-functions-window" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", - "datafusion-common", - "datafusion-doc", - "datafusion-expr", - "datafusion-functions-window-common", - "datafusion-macros", - "datafusion-physical-expr", - "datafusion-physical-expr-common", + "arrow 55.2.0", + "datafusion-common 48.0.1", + "datafusion-doc 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-functions-window-common 48.0.1", + "datafusion-macros 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "log", + "paste", +] + +[[package]] +name = 
"datafusion-functions-window" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b4c21a7c8a986a1866c0a87ab756d0bbf7b5f41f306009fa2d9af79c52ed31" +dependencies = [ + "arrow 57.3.0", + "datafusion-common 52.5.0", + "datafusion-doc 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-functions-window-common 52.5.0", + "datafusion-macros 52.5.0", + "datafusion-physical-expr 52.5.0", + "datafusion-physical-expr-common 52.5.0", "log", "paste", ] @@ -1755,8 +3007,18 @@ name = "datafusion-functions-window-common" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "datafusion-common", - "datafusion-physical-expr-common", + "datafusion-common 48.0.1", + "datafusion-physical-expr-common 48.0.1", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1210ad73b8b3211aeaf4a42bef9bd7a2b7fce3ec119a478831f18c6ff7f7b93" +dependencies = [ + "datafusion-common 52.5.0", + "datafusion-physical-expr-common 52.5.0", ] [[package]] @@ -1764,9 +3026,20 @@ name = "datafusion-macros" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "datafusion-expr", + "datafusion-expr 48.0.1", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "datafusion-macros" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aaa566a963013a38681ad82a727a654bc7feb19632426aea8c3412d415d200c5" +dependencies = [ + "datafusion-doc 52.5.0", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1774,11 +3047,11 @@ name = "datafusion-optimizer" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + 
"arrow 55.2.0", "chrono", - "datafusion-common", - "datafusion-expr", - "datafusion-physical-expr", + "datafusion-common 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", "indexmap 2.13.1", "itertools 0.14.0", "log", @@ -1787,18 +3060,37 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "datafusion-optimizer" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff9aa82b240252a88dee118372f9b9757c545ab9e53c0736bebab2e7da0ef1f2" +dependencies = [ + "arrow 57.3.0", + "chrono", + "datafusion-common 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-expr-common 52.5.0", + "datafusion-physical-expr 52.5.0", + "indexmap 2.13.1", + "itertools 0.14.0", + "log", + "regex", + "regex-syntax", +] + [[package]] name = "datafusion-physical-expr" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ "ahash", - "arrow", - "datafusion-common", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr-common", + "arrow 55.2.0", + "datafusion-common 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-expr-common 48.0.1", + "datafusion-functions-aggregate-common 48.0.1", + "datafusion-physical-expr-common 48.0.1", "half", "hashbrown 0.14.5", "indexmap 2.13.1", @@ -1808,55 +3100,128 @@ dependencies = [ "petgraph 0.8.3", ] +[[package]] +name = "datafusion-physical-expr" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d48022b8af9988c1d852644f9e8b5584c490659769a550c5e8d39457a1da0a5" +dependencies = [ + "ahash", + "arrow 57.3.0", + "datafusion-common 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-expr-common 52.5.0", + "datafusion-functions-aggregate-common 52.5.0", + "datafusion-physical-expr-common 52.5.0", + "half", + "hashbrown 0.16.1", + "indexmap 2.13.1", + "itertools 0.14.0", + "parking_lot", 
+ "paste", + "petgraph 0.8.3", + "tokio", +] + +[[package]] +name = "datafusion-physical-expr-adapter" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae7a8abc0b4fe624000972a9b145b30b7f1b680bffaa950ea53f78d9b21c27c3" +dependencies = [ + "arrow 57.3.0", + "datafusion-common 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-functions 52.5.0", + "datafusion-physical-expr 52.5.0", + "datafusion-physical-expr-common 52.5.0", + "itertools 0.14.0", +] + [[package]] name = "datafusion-physical-expr-common" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ "ahash", - "arrow", - "datafusion-common", - "datafusion-expr-common", + "arrow 55.2.0", + "datafusion-common 48.0.1", + "datafusion-expr-common 48.0.1", "hashbrown 0.14.5", "itertools 0.14.0", ] +[[package]] +name = "datafusion-physical-expr-common" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147253ca3e6b9d59c162de64c02800973018660e13340dd1886dd038d17ac429" +dependencies = [ + "ahash", + "arrow 57.3.0", + "chrono", + "datafusion-common 52.5.0", + "datafusion-expr-common 52.5.0", + "hashbrown 0.16.1", + "indexmap 2.13.1", + "itertools 0.14.0", + "parking_lot", +] + [[package]] name = "datafusion-physical-optimizer" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", - "datafusion-common", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", + "arrow 55.2.0", + "datafusion-common 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-expr-common 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + 
"datafusion-physical-plan 48.0.1", "itertools 0.14.0", "log", "recursive", ] +[[package]] +name = "datafusion-physical-optimizer" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "689156bb2282107b6239db8d7ef44b4dab10a9b33d3491a0c74acac5e4fedd72" +dependencies = [ + "arrow 57.3.0", + "datafusion-common 52.5.0", + "datafusion-execution 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-expr-common 52.5.0", + "datafusion-physical-expr 52.5.0", + "datafusion-physical-expr-common 52.5.0", + "datafusion-physical-plan 52.5.0", + "datafusion-pruning", + "itertools 0.14.0", +] + [[package]] name = "datafusion-physical-plan" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ "ahash", - "arrow", - "arrow-ord", + "arrow 55.2.0", + "arrow-ord 55.2.0", "arrow-schema 55.2.0", "async-trait", "chrono", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-window-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-functions-window-common 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", "futures", "half", "hashbrown 0.14.5", @@ -1868,19 +3233,50 @@ dependencies = [ "tokio", ] +[[package]] +name = "datafusion-physical-plan" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68253dc0ee5330aa558b2549c9b0da5af9fc17d753ae73022939014ad616fc28" +dependencies = [ + "ahash", + "arrow 57.3.0", + "arrow-ord 57.3.0", + "arrow-schema 57.3.0", + "async-trait", + "datafusion-common 52.5.0", + "datafusion-common-runtime 52.5.0", + "datafusion-execution 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-functions 52.5.0", + 
"datafusion-functions-aggregate-common 52.5.0", + "datafusion-functions-window-common 52.5.0", + "datafusion-physical-expr 52.5.0", + "datafusion-physical-expr-common 52.5.0", + "futures", + "half", + "hashbrown 0.16.1", + "indexmap 2.13.1", + "itertools 0.14.0", + "log", + "parking_lot", + "pin-project-lite", + "tokio", +] + [[package]] name = "datafusion-proto" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + "arrow 55.2.0", "chrono", - "datafusion", - "datafusion-common", - "datafusion-expr", + "datafusion 48.0.1", + "datafusion-common 48.0.1", + "datafusion-expr 48.0.1", "datafusion-proto-common", "object_store", - "prost", + "prost 0.13.5", ] [[package]] @@ -1888,9 +3284,26 @@ name = "datafusion-proto-common" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", - "datafusion-common", - "prost", + "arrow 55.2.0", + "datafusion-common 48.0.1", + "prost 0.13.5", +] + +[[package]] +name = "datafusion-pruning" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fcad240a54d0b1d3e8f668398900260a53122d522b2102ab57218590decacd6" +dependencies = [ + "arrow 57.3.0", + "datafusion-common 52.5.0", + "datafusion-datasource 52.5.0", + "datafusion-expr-common 52.5.0", + "datafusion-physical-expr 52.5.0", + "datafusion-physical-expr-common 52.5.0", + "datafusion-physical-plan 52.5.0", + "itertools 0.14.0", + "log", ] [[package]] @@ -1898,16 +3311,16 @@ name = "datafusion-session" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + "arrow 55.2.0", "async-trait", "dashmap", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-execution", - "datafusion-expr", - 
"datafusion-physical-expr", - "datafusion-physical-plan", - "datafusion-sql", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-plan 48.0.1", + "datafusion-sql 48.0.1", "futures", "itertools 0.14.0", "log", @@ -1916,20 +3329,51 @@ dependencies = [ "tokio", ] +[[package]] +name = "datafusion-session" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f58e83a68bb67007a8fcbf005c44cefe441270c7ee7f6dee10c0e0109b556f6d" +dependencies = [ + "async-trait", + "datafusion-common 52.5.0", + "datafusion-execution 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-physical-plan 52.5.0", + "parking_lot", +] + [[package]] name = "datafusion-sql" version = "48.0.1" source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "arrow", + "arrow 55.2.0", "bigdecimal", - "datafusion-common", - "datafusion-expr", + "datafusion-common 48.0.1", + "datafusion-expr 48.0.1", "indexmap 2.13.1", "log", "recursive", "regex", - "sqlparser", + "sqlparser 0.55.0", +] + +[[package]] +name = "datafusion-sql" +version = "52.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be53e9eb55db0fbb8980bb6d87f2435b0524acf4c718ed54a57cabbb299b2ab3" +dependencies = [ + "arrow 57.3.0", + "bigdecimal", + "chrono", + "datafusion-common 52.5.0", + "datafusion-expr 52.5.0", + "indexmap 2.13.1", + "log", + "regex", + "sqlparser 0.59.0", ] [[package]] @@ -1941,6 +3385,26 @@ dependencies = [ "uuid", ] +[[package]] +name = "deepsize" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cdb987ec36f6bf7bfbea3f928b75590b736fc42af8e54d97592481351b2b96c" +dependencies = [ + "deepsize_derive", +] + +[[package]] +name = "deepsize_derive" +version = "0.1.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "990101d41f3bc8c1a45641024377ee284ecc338e5ecf3ea0f0e236d897c72796" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "deranged" version = "0.5.8" @@ -1948,6 +3412,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ "powerfmt", + "serde_core", ] [[package]] @@ -1957,6 +3422,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", + "const-oid", "crypto-common", "subtle", ] @@ -1971,6 +3437,27 @@ dependencies = [ "dirs-sys-next", ] +[[package]] +name = "dirs" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" +dependencies = [ + "libc", + "option-ext", + "redox_users 0.5.2", + "windows-sys 0.61.2", +] + [[package]] name = "dirs-sys-next" version = "0.1.2" @@ -1978,7 +3465,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" dependencies = [ "libc", - "redox_users", + "redox_users 0.4.6", "winapi", ] @@ -1990,7 +3477,16 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "dlv-list" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f" +dependencies = [ + "const-random", ] 
[[package]] @@ -2002,6 +3498,12 @@ dependencies = [ "litrs", ] +[[package]] +name = "downcast-rs" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "117240f60069e65410b3ae1bb213295bd828f707b5bec6596a1afc8793ce0cbc" + [[package]] name = "duct" version = "0.13.7" @@ -2014,6 +3516,12 @@ dependencies = [ "shared_child", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "either" version = "1.15.0" @@ -2082,12 +3590,51 @@ version = "3.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dea2df4cf52843e0452895c455a1a2cfbb842a1e7329671acf418fdc53ed4c59" +[[package]] +name = "ethnum" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca81e6b4777c89fd810c25a4be2b1bd93ea034fbe58e6a75216a34c6b82c539b" + +[[package]] +name = "event-listener" +version = "5.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13b66accf52311f30a0db42147dadea9850cb48cd070028831ae5f5d4b856ab" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" +dependencies = [ + "event-listener", + "pin-project-lite", +] + [[package]] name = "fallible-iterator" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" +[[package]] +name = "fast-float2" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" + +[[package]] +name = "fastdivide" +version = 
"0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afc2bd4d5a73106dd53d10d73d3401c2f32730ba2c0b93ddb888a8983680471" + [[package]] name = "fastrand" version = "2.4.0" @@ -2166,6 +3713,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -2186,39 +3739,80 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "fs4" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8" +dependencies = [ + "rustix 0.38.44", + "windows-sys 0.52.0", +] + +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + +[[package]] +name = "fsst" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2195cc7f87e84bd695586137de99605e7e9579b26ec5e01b82960ddb4d0922f2" +dependencies = [ + "arrow-array 57.3.0", + "rand 0.9.2", +] + +[[package]] +name = "fst" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a" +dependencies = [ + "utf8-ranges", +] + [[package]] name = "function-stream" version = "0.6.0" dependencies = [ "ahash", "anyhow", - "arrow", + "apache-avro", + "arrow 55.2.0", "arrow-array 55.2.0", + "arrow-array 57.3.0", "arrow-ipc 55.2.0", - "arrow-json", + "arrow-ipc 57.3.0", + "arrow-json 55.2.0", "arrow-schema 55.2.0", "async-trait", "base64", "bincode", + "bytes", "chrono", 
"crossbeam-channel", - "datafusion", - "datafusion-common", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", + "datafusion 48.0.1", + "datafusion-common 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", "datafusion-proto", "futures", "governor", "itertools 0.14.0", + "lance", "log", "lru", "num_cpus", + "object_store", "parking_lot", "parquet", "petgraph 0.7.1", "proctitle", - "prost", + "prost 0.13.5", "protocol", "rand 0.8.5", "rdkafka", @@ -2227,8 +3821,8 @@ dependencies = [ "serde_json", "serde_json_path", "serde_yaml", - "sqlparser", - "strum", + "sqlparser 0.55.0", + "strum 0.26.3", "tempfile", "thiserror 2.0.18", "tokio", @@ -2259,6 +3853,12 @@ dependencies = [ "tonic", ] +[[package]] +name = "funty" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" + [[package]] name = "futures" version = "0.3.32" @@ -2315,7 +3915,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2367,6 +3967,21 @@ dependencies = [ "serde_json", ] +[[package]] +name = "generator" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f04ae4152da20c76fe800fa48659201d5cf627c5149ca0b707b69d7eef6cf9" +dependencies = [ + "cc", + "cfg-if", + "libc", + "log", + "rustversion", + "windows-link", + "windows-result", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -2384,8 +3999,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", ] [[package]] @@ -2432,6 +4049,18 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "gloo-timers" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb143cf96099802033e0d4f4963b19fd2e0b728bcf076cd9cf7f6634f092994" +dependencies = [ + "futures-channel", + "futures-core", + "js-sys", + "wasm-bindgen", +] + [[package]] name = "governor" version = "0.8.1" @@ -2466,7 +4095,7 @@ dependencies = [ "fnv", "futures-core", "futures-sink", - "http", + "http 1.4.0", "indexmap 2.13.1", "slab", "tokio", @@ -2510,7 +4139,7 @@ checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.1.5", "serde", ] @@ -2519,6 +4148,11 @@ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] [[package]] name = "heck" @@ -2538,6 +4172,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + [[package]] name = "home" version = "0.5.12" @@ -2547,6 +4190,23 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "htmlescape" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163" + +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", 
+ "itoa", +] + [[package]] name = "http" version = "1.4.0" @@ -2557,6 +4217,17 @@ dependencies = [ "itoa", ] +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http 0.2.12", + "pin-project-lite", +] + [[package]] name = "http-body" version = "1.0.1" @@ -2564,7 +4235,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http", + "http 1.4.0", ] [[package]] @@ -2575,8 +4246,8 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ "bytes", "futures-core", - "http", - "http-body", + "http 1.4.0", + "http-body 1.0.1", "pin-project-lite", ] @@ -2609,15 +4280,33 @@ dependencies = [ "futures-channel", "futures-core", "h2", - "http", - "http-body", + "http 1.4.0", + "http-body 1.0.1", "httparse", "httpdate", "itoa", "pin-project-lite", "smallvec", "tokio", - "want", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +dependencies = [ + "http 1.4.0", + "hyper", + "hyper-util", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", + "webpki-roots", ] [[package]] @@ -2639,13 +4328,16 @@ version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ + "base64", "bytes", "futures-channel", "futures-util", - "http", - "http-body", + "http 1.4.0", + "http-body 1.0.1", "hyper", + "ipnet", "libc", + "percent-encoding", "pin-project-lite", "socket2 0.6.3", "tokio", @@ -2653,6 +4345,15 @@ dependencies = [ "tracing", ] 
+[[package]] +name = "hyperloglogplus" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "621debdf94dcac33e50475fdd76d34d5ea9c0362a834b9db08c3024696c1fbe3" +dependencies = [ + "serde", +] + [[package]] name = "iana-time-zone" version = "0.1.65" @@ -2765,6 +4466,12 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.1.0" @@ -2777,169 +4484,698 @@ dependencies = [ ] [[package]] -name = "idna_adapter" -version = "1.2.1" +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "im-rc" +version = "15.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af1955a75fa080c677d3972822ec4bad316169ab1cfc6c257a942c2265dbe5fe" +dependencies = [ + "bitmaps", + "rand_core 0.6.4", + "rand_xoshiro 0.6.0", + "sized-chunks", + "typenum", + "version_check", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + +[[package]] +name = "indexmap" +version = "2.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff" +dependencies = [ + "equivalent", + "hashbrown 0.16.1", + "serde", + "serde_core", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "inventory" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4f0c30c76f2f4ccee3fe55a2435f691ca00c0e4bd87abe4f4a851b1d4dac39b" +dependencies = [ + "rustversion", +] + +[[package]] +name = "io-extras" +version = "0.18.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2285ddfe3054097ef4b2fe909ef8c3bcd1ea52a8f0d274416caebeef39f04a65" +dependencies = [ + "io-lifetimes", + "windows-sys 0.59.0", +] + +[[package]] +name = "io-lifetimes" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06432fb54d3be7964ecd3649233cddf80db2832f47fec34c01f65b3d9d774983" + +[[package]] +name = "ipnet" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" + +[[package]] +name = "iri-string" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25e659a4bb38e810ebc252e53b5814ff908a8c58c2a9ce2fae1bbec24cbf4e20" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "is-terminal" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + 
+[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "ittapi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b996fe614c41395cdaedf3cf408a9534851090959d90d54a535f675550b64b1" +dependencies = [ + "anyhow", + "ittapi-sys", + "log", +] + +[[package]] +name = "ittapi-sys" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f5385394064fa2c886205dba02598013ce83d3e92d33dbdc0c52fe0e7bf4fc" +dependencies = [ + "cc", +] + +[[package]] +name = "jiff" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359" +dependencies = [ + "jiff-static", + "jiff-tzdb-platform", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", + "windows-sys 0.61.2", +] + +[[package]] +name = "jiff-static" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "jiff-tzdb" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c900ef84826f1338a557697dc8fc601df9ca9af4ac137c7fb61d4c6f2dfd3076" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" +dependencies = [ + "jiff-tzdb", +] + +[[package]] +name = 
"jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.94" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9" +dependencies = [ + "cfg-if", + "futures-util", + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "jsonb" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb98fb29636087c40ad0d1274d9a30c0c1e83e03ae93f6e7e89247b37fcc6953" +dependencies = [ + "byteorder", + "ethnum", + "fast-float2", + "itoa", + "jiff", + "nom 8.0.0", + "num-traits", + "ordered-float 5.3.0", + "rand 0.9.2", + "serde", + "serde_json", + "zmij", +] + +[[package]] +name = "lance" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efe6c3ddd79cdfd2b7e1c23cafae52806906bc40fbd97de9e8cf2f8c7a75fc04" +dependencies = [ + "arrow 57.3.0", + "arrow-arith 57.3.0", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-ipc 57.3.0", + "arrow-ord 57.3.0", + "arrow-row 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "async-recursion", + "async-trait", + "async_cell", + "aws-credential-types", + "byteorder", + "bytes", + "chrono", + "crossbeam-skiplist", + "dashmap", + "datafusion 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-functions 52.5.0", + "datafusion-physical-expr 52.5.0", + "datafusion-physical-plan 52.5.0", + "deepsize", + "either", + "futures", + "half", + "humantime", + "itertools 0.13.0", + "lance-arrow", + "lance-core", + "lance-datafusion", + "lance-encoding", + "lance-file", + "lance-index", + "lance-io", + "lance-linalg", + "lance-namespace", + "lance-table", + "log", + "moka", + "object_store", + "permutation", + "pin-project", + "prost 0.14.3", + "prost-types 
0.14.3", + "rand 0.9.2", + "roaring", + "semver", + "serde", + "serde_json", + "snafu", + "tantivy", + "tokio", + "tokio-stream", + "tokio-util", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "lance-arrow" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +checksum = "5d9f5d95bdda2a2b790f1fb8028b5b6dcf661abeb3133a8bca0f3d24b054af87" dependencies = [ - "icu_normalizer", - "icu_properties", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-cast 57.3.0", + "arrow-data 57.3.0", + "arrow-ord 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "bytes", + "futures", + "getrandom 0.2.17", + "half", + "jsonb", + "num-traits", + "rand 0.9.2", ] [[package]] -name = "im-rc" -version = "15.1.0" +name = "lance-bitpacking" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af1955a75fa080c677d3972822ec4bad316169ab1cfc6c257a942c2265dbe5fe" +checksum = "f827d6ab9f8f337a9509d5ad66a12f3314db8713868260521c344ef6135eb4e4" dependencies = [ - "bitmaps", - "rand_core 0.6.4", - "rand_xoshiro", - "sized-chunks", - "typenum", - "version_check", + "arrayref", + "paste", + "seq-macro", ] [[package]] -name = "indexmap" -version = "1.9.3" +name = "lance-core" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +checksum = "0f1e25df6a79bf72ee6bcde0851f19b1cd36c5848c1b7db83340882d3c9fdecb" dependencies = [ - "autocfg", - "hashbrown 0.12.3", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-schema 57.3.0", + "async-trait", + "byteorder", + "bytes", + "chrono", + "datafusion-common 52.5.0", + "datafusion-sql 52.5.0", + "deepsize", + "futures", + "itertools 0.13.0", + "lance-arrow", + "libc", + "log", + "mock_instant", + "moka", + "num_cpus", + "object_store", + "pin-project", + "prost 0.14.3", + 
"rand 0.9.2", + "roaring", + "serde_json", + "snafu", + "tempfile", + "tokio", + "tokio-stream", + "tokio-util", + "tracing", + "url", ] [[package]] -name = "indexmap" -version = "2.13.1" +name = "lance-datafusion" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff" +checksum = "93146de8ae720cb90edef81c2f2d0a1b065fc2f23ecff2419546f389b0fa70a4" dependencies = [ - "equivalent", - "hashbrown 0.16.1", - "serde", - "serde_core", + "arrow 57.3.0", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-ord 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "async-trait", + "chrono", + "datafusion 52.5.0", + "datafusion-common 52.5.0", + "datafusion-functions 52.5.0", + "datafusion-physical-expr 52.5.0", + "futures", + "jsonb", + "lance-arrow", + "lance-core", + "lance-datagen", + "log", + "pin-project", + "prost 0.14.3", + "prost-build 0.14.3", + "snafu", + "tokio", + "tracing", ] [[package]] -name = "integer-encoding" -version = "3.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" - -[[package]] -name = "inventory" -version = "0.3.24" +name = "lance-datagen" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4f0c30c76f2f4ccee3fe55a2435f691ca00c0e4bd87abe4f4a851b1d4dac39b" +checksum = "ccec8ce4d8e0a87a99c431dab2364398029f2ffb649c1a693c60c79e05ed30dd" dependencies = [ - "rustversion", + "arrow 57.3.0", + "arrow-array 57.3.0", + "arrow-cast 57.3.0", + "arrow-schema 57.3.0", + "chrono", + "futures", + "half", + "hex", + "rand 0.9.2", + "rand_distr 0.5.1", + "rand_xoshiro 0.7.0", + "random_word", ] [[package]] -name = "io-extras" -version = "0.18.4" +name = "lance-encoding" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2285ddfe3054097ef4b2fe909ef8c3bcd1ea52a8f0d274416caebeef39f04a65" +checksum = "5c1aec0bbbac6bce829bc10f1ba066258126100596c375fb71908ecf11c2c2a5" dependencies = [ - "io-lifetimes", - "windows-sys 0.59.0", + "arrow-arith 57.3.0", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-cast 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "bytemuck", + "byteorder", + "bytes", + "fsst", + "futures", + "hex", + "hyperloglogplus", + "itertools 0.13.0", + "lance-arrow", + "lance-bitpacking", + "lance-core", + "log", + "lz4", + "num-traits", + "prost 0.14.3", + "prost-build 0.14.3", + "prost-types 0.14.3", + "rand 0.9.2", + "snafu", + "strum 0.26.3", + "tokio", + "tracing", + "xxhash-rust", + "zstd", ] [[package]] -name = "io-lifetimes" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06432fb54d3be7964ecd3649233cddf80db2832f47fec34c01f65b3d9d774983" - -[[package]] -name = "ipnet" -version = "2.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" - -[[package]] -name = "is-terminal" -version = "0.4.17" +name = "lance-file" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +checksum = "14a8c548804f5b17486dc2d3282356ed1957095a852780283bc401fdd69e9075" dependencies = [ - "hermit-abi", - "libc", - "windows-sys 0.61.2", + "arrow-arith 57.3.0", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "async-recursion", + "async-trait", + "byteorder", + "bytes", + "datafusion-common 52.5.0", + "deepsize", + "futures", + "lance-arrow", + "lance-core", + "lance-encoding", + "lance-io", + "log", + "num-traits", + "object_store", + "prost 0.14.3", + "prost-build 0.14.3", + "prost-types 0.14.3", + "snafu", + "tokio", + 
"tracing", ] [[package]] -name = "is_terminal_polyfill" -version = "1.70.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" - -[[package]] -name = "itertools" -version = "0.13.0" +name = "lance-index" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +checksum = "2da212f0090ea59f79ac3686660f596520c167fe1cb5f408900cf71d215f0e03" dependencies = [ - "either", + "arrow 57.3.0", + "arrow-arith 57.3.0", + "arrow-array 57.3.0", + "arrow-ord 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "async-channel", + "async-recursion", + "async-trait", + "bitpacking", + "bitvec", + "bytes", + "chrono", + "crossbeam-queue", + "datafusion 52.5.0", + "datafusion-common 52.5.0", + "datafusion-expr 52.5.0", + "datafusion-physical-expr 52.5.0", + "datafusion-sql 52.5.0", + "deepsize", + "dirs", + "fst", + "futures", + "half", + "itertools 0.13.0", + "jsonb", + "lance-arrow", + "lance-core", + "lance-datafusion", + "lance-datagen", + "lance-encoding", + "lance-file", + "lance-io", + "lance-linalg", + "lance-table", + "libm", + "log", + "ndarray", + "num-traits", + "object_store", + "prost 0.14.3", + "prost-build 0.14.3", + "prost-types 0.14.3", + "rand 0.9.2", + "rand_distr 0.5.1", + "rangemap", + "rayon", + "roaring", + "serde", + "serde_json", + "smallvec", + "snafu", + "tantivy", + "tempfile", + "tokio", + "tracing", + "twox-hash", + "uuid", ] [[package]] -name = "itertools" -version = "0.14.0" +name = "lance-io" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +checksum = "41d958eb4b56f03bbe0f5f85eb2b4e9657882812297b6f711f201ffc995f259f" dependencies = [ - "either", + "arrow 57.3.0", + "arrow-arith 57.3.0", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", 
+ "arrow-cast 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "async-recursion", + "async-trait", + "aws-config", + "aws-credential-types", + "byteorder", + "bytes", + "chrono", + "deepsize", + "futures", + "http 1.4.0", + "lance-arrow", + "lance-core", + "lance-namespace", + "log", + "object_store", + "object_store_opendal", + "opendal", + "path_abs", + "pin-project", + "prost 0.14.3", + "rand 0.9.2", + "serde", + "snafu", + "tempfile", + "tokio", + "tracing", + "url", ] [[package]] -name = "itoa" -version = "1.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" - -[[package]] -name = "ittapi" -version = "0.4.0" +name = "lance-linalg" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b996fe614c41395cdaedf3cf408a9534851090959d90d54a535f675550b64b1" +checksum = "0285b70da35def7ed95e150fae1d5308089554e1290470403ed3c50cb235bc5e" dependencies = [ - "anyhow", - "ittapi-sys", - "log", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-schema 57.3.0", + "cc", + "deepsize", + "half", + "lance-arrow", + "lance-core", + "num-traits", + "rand 0.9.2", ] [[package]] -name = "ittapi-sys" -version = "0.4.0" +name = "lance-namespace" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52f5385394064fa2c886205dba02598013ce83d3e92d33dbdc0c52fe0e7bf4fc" +checksum = "5f78e2a828b654e062a495462c6e3eb4fcf0e7e907d761b8f217fc09ccd3ceac" dependencies = [ - "cc", + "arrow 57.3.0", + "async-trait", + "bytes", + "lance-core", + "lance-namespace-reqwest-client", + "serde", + "snafu", ] [[package]] -name = "jobserver" -version = "0.1.34" +name = "lance-namespace-reqwest-client" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +checksum = 
"ee2e48de899e2931afb67fcddd0a08e439bf5d8b6ea2a2ed9cb8f4df669bd5cc" dependencies = [ - "getrandom 0.3.4", - "libc", + "reqwest", + "serde", + "serde_json", + "serde_repr", + "url", ] [[package]] -name = "js-sys" -version = "0.3.94" +name = "lance-table" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9" +checksum = "3df9c4adca3eb2074b3850432a9fb34248a3d90c3d6427d158b13ff9355664ee" dependencies = [ - "cfg-if", - "futures-util", - "once_cell", - "wasm-bindgen", + "arrow 57.3.0", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-ipc 57.3.0", + "arrow-schema 57.3.0", + "async-trait", + "byteorder", + "bytes", + "chrono", + "deepsize", + "futures", + "lance-arrow", + "lance-core", + "lance-file", + "lance-io", + "log", + "object_store", + "prost 0.14.3", + "prost-build 0.14.3", + "prost-types 0.14.3", + "rand 0.9.2", + "rangemap", + "roaring", + "semver", + "serde", + "serde_json", + "snafu", + "tokio", + "tracing", + "url", + "uuid", ] [[package]] @@ -2966,6 +5202,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" +[[package]] +name = "levenshtein_automata" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" + [[package]] name = "lexical-core" version = "0.8.5" @@ -3185,6 +5427,19 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "loom" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca" +dependencies = [ + "cfg-if", + "generator", + "scoped-tls", + "tracing", + "tracing-subscriber", 
+] + [[package]] name = "lru" version = "0.12.5" @@ -3194,6 +5449,21 @@ dependencies = [ "hashbrown 0.15.5", ] +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + +[[package]] +name = "lz4" +version = "1.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a20b523e860d03443e98350ceaac5e71c6ba89aea7d960769ec3ce37f4de5af4" +dependencies = [ + "lz4-sys", +] + [[package]] name = "lz4-sys" version = "1.11.1+lz4-1.10.0" @@ -3206,9 +5476,18 @@ dependencies = [ [[package]] name = "lz4_flex" -version = "0.11.6" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "lz4_flex" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" +checksum = "98c23545df7ecf1b16c303910a69b079e8e251d60f7dd2cc9b4177f2afaf1746" dependencies = [ "twox-hash", ] @@ -3248,6 +5527,19 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "num_cpus", + "once_cell", + "rawpointer", + "thread-tree", +] + [[package]] name = "maybe-owned" version = "0.3.4" @@ -3264,6 +5556,15 @@ dependencies = [ "digest", ] +[[package]] +name = "measure_time" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51c55d61e72fc3ab704396c5fa16f4c184db37978ae4e94ca8959693a235fc0e" +dependencies = 
[ + "log", +] + [[package]] name = "memchr" version = "2.8.0" @@ -3279,12 +5580,31 @@ dependencies = [ "rustix 1.1.4", ] +[[package]] +name = "memmap2" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3" +dependencies = [ + "libc", +] + [[package]] name = "mime" version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "mime_guess" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" +dependencies = [ + "mime", + "unicase", +] + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -3312,12 +5632,59 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "mock_instant" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce6dd36094cac388f119d2e9dc82dc730ef91c32a6222170d630e5414b956e6" + +[[package]] +name = "moka" +version = "0.12.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "957228ad12042ee839f93c8f257b62b4c0ab5eaae1d4fa60de53b27c9d7c5046" +dependencies = [ + "async-lock", + "crossbeam-channel", + "crossbeam-epoch", + "crossbeam-utils", + "equivalent", + "event-listener", + "futures-util", + "parking_lot", + "portable-atomic", + "smallvec", + "tagptr", + "uuid", +] + [[package]] name = "multimap" version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +[[package]] +name = "murmurhash32" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" + +[[package]] +name = "ndarray" +version = "0.16.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] + [[package]] name = "nibble_vec" version = "0.1.0" @@ -3335,7 +5702,7 @@ checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4" dependencies = [ "bitflags 2.11.0", "cfg-if", - "cfg_aliases", + "cfg_aliases 0.1.1", "libc", ] @@ -3355,6 +5722,15 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + [[package]] name = "nonzero_ext" version = "0.3.0" @@ -3392,6 +5768,7 @@ checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", + "serde", ] [[package]] @@ -3479,7 +5856,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -3501,14 +5878,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" dependencies = [ "async-trait", + "base64", "bytes", "chrono", + "form_urlencoded", "futures", - "http", + "http 1.4.0", + "http-body-util", "humantime", + "hyper", "itertools 0.14.0", + "md-5", "parking_lot", "percent-encoding", + "quick-xml 0.38.4", + "rand 0.9.2", + "reqwest", + "ring", + "serde", + "serde_json", + "serde_urlencoded", "thiserror 2.0.18", "tokio", "tracing", @@ -3518,6 +5907,22 @@ dependencies = [ "web-time", ] +[[package]] +name = "object_store_opendal" +version = "0.55.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "113ab0769e972eee585e57407b98de08bda5354fa28e8ba4d89038d6cb6a8991" +dependencies = [ + 
"async-trait", + "bytes", + "chrono", + "futures", + "object_store", + "opendal", + "pin-project", + "tokio", +] + [[package]] name = "once_cell" version = "1.21.4" @@ -3530,6 +5935,47 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "oneshot" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107" + +[[package]] +name = "opendal" +version = "0.55.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d075ab8a203a6ab4bc1bce0a4b9fe486a72bf8b939037f4b78d95386384bc80a" +dependencies = [ + "anyhow", + "backon", + "base64", + "bytes", + "crc32c", + "futures", + "getrandom 0.2.17", + "http 1.4.0", + "http-body 1.0.1", + "jiff", + "log", + "md-5", + "percent-encoding", + "quick-xml 0.38.4", + "reqsign", + "reqwest", + "serde", + "serde_json", + "tokio", + "url", + "uuid", +] + +[[package]] +name = "openssl-probe" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" + [[package]] name = "openssl-sys" version = "0.9.112" @@ -3542,6 +5988,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + [[package]] name = "ordered-float" version = "2.10.1" @@ -3551,6 +6003,25 @@ dependencies = [ "num-traits", ] +[[package]] +name = "ordered-float" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7d950ca161dc355eaf28f82b11345ed76c6e1f6eb1f4f4479e0323b9e2fbd0e" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ordered-multimap" +version = "0.7.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79" +dependencies = [ + "dlv-list", + "hashbrown 0.14.5", +] + [[package]] name = "os_pipe" version = "1.2.3" @@ -3561,6 +6032,27 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "outref" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" + +[[package]] +name = "ownedbytes" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fbd56f7631767e61784dc43f8580f403f4475bd4aaa4da003e6295e1bab4a7e" +dependencies = [ + "stable_deref_trait", +] + +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + [[package]] name = "parking_lot" version = "0.12.5" @@ -3606,7 +6098,7 @@ dependencies = [ "futures", "half", "hashbrown 0.15.5", - "lz4_flex", + "lz4_flex 0.11.6", "num", "num-bigint", "object_store", @@ -3626,6 +6118,18 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "path_abs" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05ef02f6342ac01d8a93b65f96db53fe68a92a15f41144f97fb00a9e669633c3" +dependencies = [ + "serde", + "serde_derive", + "std_prelude", + "stfu8", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -3638,6 +6142,12 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" +[[package]] +name = "permutation" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"df202b0b0f5b8e389955afd5f27b007b00fb948162953f1db9c70d2c7e3157d7" + [[package]] name = "petgraph" version = "0.6.5" @@ -3705,7 +6215,7 @@ checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -3714,6 +6224,12 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "pkg-config" version = "0.3.32" @@ -3726,6 +6242,15 @@ version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" +[[package]] +name = "portable-atomic-util" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618" +dependencies = [ + "portable-atomic", +] + [[package]] name = "postcard" version = "1.1.3" @@ -3769,7 +6294,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn", + "syn 2.0.117", ] [[package]] @@ -3808,7 +6333,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" dependencies = [ "bytes", - "prost-derive", + "prost-derive 0.13.5", +] + +[[package]] +name = "prost" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +dependencies = [ + "bytes", + "prost-derive 0.14.3", ] [[package]] @@ -3824,10 +6359,29 @@ dependencies = [ 
"once_cell", "petgraph 0.7.1", "prettyplease", - "prost", - "prost-types", + "prost 0.13.5", + "prost-types 0.13.5", + "regex", + "syn 2.0.117", + "tempfile", +] + +[[package]] +name = "prost-build" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" +dependencies = [ + "heck", + "itertools 0.14.0", + "log", + "multimap", + "petgraph 0.8.3", + "prettyplease", + "prost 0.14.3", + "prost-types 0.14.3", "regex", - "syn", + "syn 2.0.117", "tempfile", ] @@ -3841,7 +6395,20 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "prost-derive" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +dependencies = [ + "anyhow", + "itertools 0.14.0", + "proc-macro2", + "quote", + "syn 2.0.117", ] [[package]] @@ -3850,7 +6417,16 @@ version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" dependencies = [ - "prost", + "prost 0.13.5", +] + +[[package]] +name = "prost-types" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +dependencies = [ + "prost 0.14.3", ] [[package]] @@ -3859,7 +6435,7 @@ version = "0.1.0" dependencies = [ "env_logger", "log", - "prost", + "prost 0.13.5", "serde", "tonic", "tonic-build", @@ -3888,29 +6464,110 @@ dependencies = [ ] [[package]] -name = "pulley-macros" -version = "41.0.4" +name = "pulley-macros" +version = "41.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56000349b6896e3d44286eb9c330891237f40b27fd43c1ccc84547d0b463cb40" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = 
"quad-rand" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" + +[[package]] +name = "quanta" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi", + "web-sys", + "winapi", +] + +[[package]] +name = "quick-xml" +version = "0.37.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quick-xml" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases 0.2.1", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash 2.1.2", + "rustls", + "socket2 0.6.3", + "thiserror 2.0.18", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56000349b6896e3d44286eb9c330891237f40b27fd43c1ccc84547d0b463cb40" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" dependencies = [ - "proc-macro2", - "quote", - "syn", + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand 0.9.2", + "ring", + "rustc-hash 2.1.2", + "rustls", + "rustls-pki-types", + "slab", + "thiserror 2.0.18", + "tinyvec", + "tracing", + "web-time", ] [[package]] -name = "quanta" -version = "0.12.6" +name = 
"quinn-udp" +version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" dependencies = [ - "crossbeam-utils", + "cfg_aliases 0.2.1", "libc", "once_cell", - "raw-cpuid", - "wasi", - "web-sys", - "winapi", + "socket2 0.6.3", + "tracing", + "windows-sys 0.60.2", ] [[package]] @@ -3934,6 +6591,12 @@ version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" +[[package]] +name = "radium" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" + [[package]] name = "radix_trie" version = "0.2.1" @@ -4003,6 +6666,26 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + +[[package]] +name = "rand_distr" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" +dependencies = [ + "num-traits", + "rand 0.9.2", +] + [[package]] name = "rand_xoshiro" version = "0.6.0" @@ -4012,6 +6695,34 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "rand_xoshiro" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41" +dependencies = [ + "rand_core 0.9.5", +] + +[[package]] +name = "random_word" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81" +dependencies = [ + "ahash", + "brotli", + "paste", + "rand 0.9.2", + "unicase", +] + +[[package]] +name = "rangemap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68" + [[package]] name = "raw-cpuid" version = "11.6.0" @@ -4021,6 +6732,12 @@ dependencies = [ "bitflags 2.11.0", ] +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + [[package]] name = "rayon" version = "1.11.0" @@ -4092,7 +6809,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -4115,6 +6832,17 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "redox_users" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" +dependencies = [ + "getrandom 0.2.17", + "libredox", + "thiserror 2.0.18", +] + [[package]] name = "regalloc2" version = "0.13.5" @@ -4153,12 +6881,117 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "regex-lite" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" + [[package]] name = "regex-syntax" version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +[[package]] +name = "reqsign" +version = "0.16.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43451dbf3590a7590684c25fb8d12ecdcc90ed3ac123433e500447c7d77ed701" +dependencies = 
[ + "anyhow", + "async-trait", + "base64", + "chrono", + "form_urlencoded", + "getrandom 0.2.17", + "hex", + "hmac", + "home", + "http 1.4.0", + "log", + "percent-encoding", + "quick-xml 0.37.5", + "rand 0.8.5", + "reqwest", + "rust-ini", + "serde", + "serde_json", + "sha1", + "sha2", + "tokio", +] + +[[package]] +name = "reqwest" +version = "0.12.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "js-sys", + "log", + "mime", + "mime_guess", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls", + "tokio-util", + "tower 0.5.3", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "webpki-roots", +] + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "roaring" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ba9ce64a8f45d7fc86358410bb1a82e8c987504c0d4900e9141d69a9f26c885" +dependencies = [ + "bytemuck", + "byteorder", +] + [[package]] name = "rocksdb" version = "0.21.0" @@ -4169,6 +7002,26 @@ dependencies = [ "librocksdb-sys", ] +[[package]] +name = "rust-ini" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "796e8d2b6696392a43bea58116b667fb4c29727dc5abd27d6acf338bb4f688c7" 
+dependencies = [ + "cfg-if", + "ordered-multimap", +] + +[[package]] +name = "rust-stemmers" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" +dependencies = [ + "serde", + "serde_derive", +] + [[package]] name = "rustc-demangle" version = "0.1.27" @@ -4232,6 +7085,55 @@ dependencies = [ "rustix 1.1.4", ] +[[package]] +name = "rustls" +version = "0.23.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" +dependencies = [ + "aws-lc-rs", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-native-certs" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +dependencies = [ + "web-time", + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20a6af516fea4b20eccceaf166e8aa666ac996208e8a644ce3ef5aa783bc7cd4" +dependencies = [ + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.22" @@ -4287,12 +7189,50 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "schannel" +version = "0.1.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "scoped-tls" +version = 
"1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" + [[package]] name = "scopeguard" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "security-framework" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" +dependencies = [ + "bitflags 2.11.0", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "semver" version = "1.0.28" @@ -4319,6 +7259,16 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde_bytes" +version = "0.11.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5d440709e79d88e51ac01c4b72fc6cb7314017bb7da9eeff678aa94c10e3ea8" +dependencies = [ + "serde", + "serde_core", +] + [[package]] name = "serde_core" version = "1.0.228" @@ -4336,7 +7286,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -4359,7 +7309,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b992cea3194eea663ba99a042d61cea4bd1872da37021af56f6a37e0359b9d33" dependencies = [ "inventory", - "nom", + "nom 7.1.3", "regex", "serde", "serde_json", @@ -4399,7 +7349,18 @@ checksum = "aafbefbe175fa9bf03ca83ef89beecff7d2a95aaacd5732325b90ac8c3bd7b90" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "serde_repr" 
+version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", ] [[package]] @@ -4411,6 +7372,18 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + [[package]] name = "serde_yaml" version = "0.9.34+deprecated" @@ -4424,6 +7397,17 @@ dependencies = [ "unsafe-libyaml", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures 0.2.17", + "digest", +] + [[package]] name = "sha2" version = "0.10.9" @@ -4520,6 +7504,15 @@ dependencies = [ "typenum", ] +[[package]] +name = "sketches-ddsketch" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b" +dependencies = [ + "serde", +] + [[package]] name = "slab" version = "0.4.12" @@ -4535,6 +7528,27 @@ dependencies = [ "serde", ] +[[package]] +name = "snafu" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1d4bced6a69f90b2056c03dcff2c4737f98d6fb9e0853493996e1d253ca29c6" +dependencies = [ + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54254b8531cafa275c5e096f62d48c81435d1015405a91198ddb11e967301d40" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "snap" version = "1.1.1" @@ -4577,7 +7591,28 @@ source = 
"git+https://github.com/FunctionStream/sqlparser-rs?branch=0.58.0%2Ffs# dependencies = [ "log", "recursive", - "sqlparser_derive", + "sqlparser_derive 0.3.0 (git+https://github.com/FunctionStream/sqlparser-rs?branch=0.58.0%2Ffs)", +] + +[[package]] +name = "sqlparser" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4591acadbcf52f0af60eafbb2c003232b2b4cd8de5f0e9437cb8b1b59046cc0f" +dependencies = [ + "log", + "sqlparser_derive 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "sqlparser_derive" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", ] [[package]] @@ -4587,7 +7622,7 @@ source = "git+https://github.com/FunctionStream/sqlparser-rs?branch=0.58.0%2Ffs# dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -4615,6 +7650,18 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "std_prelude" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8207e78455ffdf55661170876f88daf85356e4edd54e0a3dbc79586ca1e50cbe" + +[[package]] +name = "stfu8" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" + [[package]] name = "strsim" version = "0.11.1" @@ -4627,9 +7674,15 @@ version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" dependencies = [ - "strum_macros", + "strum_macros 0.26.4", ] +[[package]] +name = "strum" +version = "0.27.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" + [[package]] name = "strum_macros" version = "0.26.4" @@ -4640,7 +7693,19 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "strum_macros" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", ] [[package]] @@ -4649,6 +7714,17 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.117" @@ -4665,6 +7741,9 @@ name = "sync_wrapper" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] [[package]] name = "synstructure" @@ -4674,7 +7753,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -4693,6 +7772,164 @@ dependencies = [ "winx", ] +[[package]] +name = "tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + +[[package]] +name = "tantivy" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64a966cb0e76e311f09cf18507c9af192f15d34886ee43d7ba7c7e3803660c43" +dependencies = [ 
+ "aho-corasick", + "arc-swap", + "base64", + "bitpacking", + "bon", + "byteorder", + "census", + "crc32fast", + "crossbeam-channel", + "downcast-rs", + "fastdivide", + "fnv", + "fs4", + "htmlescape", + "hyperloglogplus", + "itertools 0.14.0", + "levenshtein_automata", + "log", + "lru", + "lz4_flex 0.11.6", + "measure_time", + "memmap2", + "once_cell", + "oneshot", + "rayon", + "regex", + "rust-stemmers", + "rustc-hash 2.1.2", + "serde", + "serde_json", + "sketches-ddsketch", + "smallvec", + "tantivy-bitpacker", + "tantivy-columnar", + "tantivy-common", + "tantivy-fst", + "tantivy-query-grammar", + "tantivy-stacker", + "tantivy-tokenizer-api", + "tempfile", + "thiserror 2.0.18", + "time", + "uuid", + "winapi", +] + +[[package]] +name = "tantivy-bitpacker" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1adc286a39e089ae9938935cd488d7d34f14502544a36607effd2239ff0e2494" +dependencies = [ + "bitpacking", +] + +[[package]] +name = "tantivy-columnar" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6300428e0c104c4f7db6f95b466a6f5c1b9aece094ec57cdd365337908dc7344" +dependencies = [ + "downcast-rs", + "fastdivide", + "itertools 0.14.0", + "serde", + "tantivy-bitpacker", + "tantivy-common", + "tantivy-sstable", + "tantivy-stacker", +] + +[[package]] +name = "tantivy-common" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b6ea6090ce03dc72c27d0619e77185d26cc3b20775966c346c6d4f7e99d7f" +dependencies = [ + "async-trait", + "byteorder", + "ownedbytes", + "serde", + "time", +] + +[[package]] +name = "tantivy-fst" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18" +dependencies = [ + "byteorder", + "regex-syntax", + "utf8-ranges", +] + +[[package]] +name = "tantivy-query-grammar" +version = "0.24.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e810cdeeebca57fc3f7bfec5f85fdbea9031b2ac9b990eb5ff49b371d52bbe6a" +dependencies = [ + "nom 7.1.3", + "serde", + "serde_json", +] + +[[package]] +name = "tantivy-sstable" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "709f22c08a4c90e1b36711c1c6cad5ae21b20b093e535b69b18783dd2cb99416" +dependencies = [ + "futures-util", + "itertools 0.14.0", + "tantivy-bitpacker", + "tantivy-common", + "tantivy-fst", + "zstd", +] + +[[package]] +name = "tantivy-stacker" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bcdebb267671311d1e8891fd9d1301803fdb8ad21ba22e0a30d0cab49ba59c1" +dependencies = [ + "murmurhash32", + "rand_distr 0.4.3", + "tantivy-common", +] + +[[package]] +name = "tantivy-tokenizer-api" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfa942fcee81e213e09715bbce8734ae2180070b97b33839a795ba1de201547d" +dependencies = [ + "serde", +] + +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + [[package]] name = "target-lexicon" version = "0.13.5" @@ -4747,7 +7984,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -4758,7 +7995,16 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "thread-tree" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffbd370cb847953a25954d9f63e14824a36113f8c72eecf6eccef5dc4b45d630" +dependencies = [ + "crossbeam-channel", ] [[package]] @@ -4778,7 +8024,7 @@ checksum = 
"7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" dependencies = [ "byteorder", "integer-encoding", - "ordered-float", + "ordered-float 2.10.1", ] [[package]] @@ -4831,6 +8077,21 @@ dependencies = [ "zerovec", ] +[[package]] +name = "tinyvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "tokio" version = "1.51.0" @@ -4855,7 +8116,17 @@ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls", + "tokio", ] [[package]] @@ -4954,15 +8225,15 @@ dependencies = [ "base64", "bytes", "h2", - "http", - "http-body", + "http 1.4.0", + "http-body 1.0.1", "http-body-util", "hyper", "hyper-timeout", "hyper-util", "percent-encoding", "pin-project", - "prost", + "prost 0.13.5", "socket2 0.5.10", "tokio", "tokio-stream", @@ -4980,10 +8251,10 @@ checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11" dependencies = [ "prettyplease", "proc-macro2", - "prost-build", - "prost-types", + "prost-build 0.13.5", + "prost-types 0.13.5", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -5016,6 +8287,30 @@ dependencies = [ "futures-util", "pin-project-lite", "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-http" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" +dependencies = [ + "async-compression", + "bitflags 2.11.0", + "bytes", + "futures-core", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "iri-string", + "pin-project-lite", + "tokio", + "tokio-util", + "tower 0.5.3", "tower-layer", "tower-service", ] @@ -5063,7 +8358,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -5116,6 +8411,9 @@ name = "twox-hash" version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" +dependencies = [ + "rand 0.9.2", +] [[package]] name = "typenum" @@ -5165,6 +8463,12 @@ version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + [[package]] name = "unty" version = "0.0.4" @@ -5183,6 +8487,18 @@ dependencies = [ "serde", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "utf8-ranges" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -5203,6 +8519,7 @@ checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9" dependencies = [ "getrandom 0.4.2", "js-sys", + "serde_core", "wasm-bindgen", ] @@ -5230,6 +8547,12 @@ version = "0.0.18" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + [[package]] name = "walkdir" version = "2.5.0" @@ -5315,7 +8638,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn", + "syn 2.0.117", "wasm-bindgen-shared", ] @@ -5391,6 +8714,19 @@ dependencies = [ "wasmparser 0.244.0", ] +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wasmparser" version = "0.243.0" @@ -5551,7 +8887,7 @@ dependencies = [ "anyhow", "proc-macro2", "quote", - "syn", + "syn 2.0.117", "wasmtime-internal-component-util", "wasmtime-internal-wit-bindgen", "wit-parser 0.243.0", @@ -5665,7 +9001,7 @@ checksum = "70f8b9796a3f0451a7b702508b303d654de640271ac80287176de222f187a237" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -5793,6 +9129,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki-roots" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "wiggle" version = "41.0.4" @@ -5817,7 +9162,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.117", "witx", ] @@ -5829,7 +9174,7 @@ checksum = "fea2aea744eded58ae092bf57110c27517dab7d5a300513ff13897325c5c5021" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", "wiggle-generate", ] @@ -5905,7 +9250,7 @@ checksum = 
"053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -5916,7 +9261,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -6163,7 +9508,7 @@ dependencies = [ "heck", "indexmap 2.13.1", "prettyplease", - "syn", + "syn 2.0.117", "wasm-metadata", "wit-bindgen-core", "wit-component", @@ -6179,7 +9524,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "syn", + "syn 2.0.117", "wit-bindgen-core", "wit-bindgen-rust", ] @@ -6257,6 +9602,21 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" +[[package]] +name = "wyz" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" +dependencies = [ + "tap", +] + +[[package]] +name = "xmlparser" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" + [[package]] name = "xxhash-rust" version = "0.8.15" @@ -6291,7 +9651,7 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", "synstructure", ] @@ -6312,7 +9672,7 @@ checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -6332,10 +9692,16 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", "synstructure", ] +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + [[package]] name = "zerotrie" version = "0.2.4" @@ -6366,7 +9732,7 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 531601d0..eebf2a6c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,6 +52,8 @@ arrow-array = "55" arrow-ipc = "55" arrow-schema = { version = "55", features = ["serde"] } parquet = "55" +object_store = { version = "0.12.5", features = ["aws"] } +bytes = "1" futures = "0.3" serde_json_path = "0.7" xxhash-rust = { version = "0.8", features = ["xxh3"] } @@ -63,6 +65,7 @@ itertools = "0.14" strum = { version = "0.26", features = ["derive"] } arrow-json = {version = '55.2.0'} +apache-avro = "0.21" datafusion = {git = 'https://github.com/FunctionStream/datafusion', branch = '48.0.1/fs'} datafusion-common = {git = 'https://github.com/FunctionStream/datafusion', branch = '48.0.1/fs'} datafusion-execution = {git = 'https://github.com/FunctionStream/datafusion', branch = '48.0.1/fs'} @@ -74,6 +77,9 @@ sqlparser = { git = "https://github.com/FunctionStream/sqlparser-rs", branch = " ahash = "0.8" governor = "0.8.0" +lance = { version = "4.0.0", default-features = false, features = ["aws"] } +arrow-array-lance = { package = "arrow-array", version = "57.3.0" } +arrow-ipc-lance = { package = "arrow-ipc", version = "57.3.0" } [features] default = ["incremental-cache", "python"] diff --git a/Makefile b/Makefile index 87a2339a..78138dae 100644 --- a/Makefile +++ b/Makefile @@ -9,8 +9,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- - APP_NAME := function-stream VERSION := $(shell grep '^version' Cargo.toml | head -1 | awk -F '"' '{print $$2}') DATE := $(shell date -u +"%Y-%m-%dT%H:%M:%SZ") @@ -29,30 +27,22 @@ endif OS := $(shell uname -s | tr '[:upper:]' '[:lower:]') OS_NAME := $(shell uname -s) -# 2. Configure RUSTFLAGS and target triple per platform +# 2. Configure target triple per platform DIST_ROOT := dist ifeq ($(OS_NAME), Linux) TRIPLE := $(ARCH)-unknown-linux-gnu STATIC_FLAGS := else ifeq ($(OS_NAME), Darwin) - # macOS: strip symbols but keep dynamic linking (Apple system restriction) TRIPLE := $(ARCH)-apple-darwin STATIC_FLAGS := else ifneq (,$(findstring MINGW,$(OS_NAME))$(findstring MSYS,$(OS_NAME))) - # Windows (Git Bash / MSYS2): static-link MSVC runtime TRIPLE := $(ARCH)-pc-windows-msvc STATIC_FLAGS := -C target-feature=+crt-static else - # Fallback TRIPLE := $(ARCH)-unknown-linux-gnu STATIC_FLAGS := endif -# 3. Aggressive optimization flags -# opt-level=z : size-oriented, minimize binary footprint -# strip=symbols: remove debug symbol table at link time -# Note: panic=abort is intentionally omitted to preserve stack unwinding -# for better fault tolerance in the streaming runtime OPTIMIZE_FLAGS := -C opt-level=z -C strip=symbols $(STATIC_FLAGS) TARGET_DIR := target/$(TRIPLE)/release @@ -110,7 +100,7 @@ help: (printf "$(C_Y)[!] Auto-installing target toolchain for $(OS_NAME): $(TRIPLE)$(C_0)\n" && \ rustup target add $(TRIPLE)) -# 5. Build targets (depend on .ensure-target for automatic toolchain setup) +# 5. 
Build targets build: .check-env .ensure-target .build-wasm $(call log,BUILD,Rust Full [$(OS_NAME) / $(TRIPLE)]) @RUSTFLAGS="$(OPTIMIZE_FLAGS)" \ @@ -128,24 +118,24 @@ build: .check-env .ensure-target .build-wasm build-lite: .check-env .ensure-target $(call log,BUILD,Rust Lite [$(OS_NAME) / $(TRIPLE)]) - @RUSTFLAGS="$(OPTIMIZE_FLAGS)" \ + @RUSTFLAGS="$(INDUSTRIAL_RUSTFLAGS)" \ cargo build --release \ - --target $(TRIPLE) \ - --no-default-features \ - --features incremental-cache \ - --quiet + --target $(TRIPLE) \ + --no-default-features \ + --features incremental-cache \ + --quiet $(call log,BUILD,CLI for dist) - @RUSTFLAGS="$(OPTIMIZE_FLAGS)" \ + @RUSTFLAGS="$(INDUSTRIAL_RUSTFLAGS)" \ cargo build --release \ - --target $(TRIPLE) \ - -p function-stream-cli \ - --quiet + --target $(TRIPLE) \ + -p function-stream-cli \ + --quiet $(call success,Target: $(TARGET_DIR)/$(APP_NAME) $(TARGET_DIR)/cli) .build-wasm: $(call log,WASM,Building Python Runtime using $(PYTHON_EXEC)) @cd $(PYTHON_ROOT)/functionstream-runtime && \ - PYTHONPATH=../functionstream-api:../functionstream-api-advanced ../../$(PYTHON_EXEC) build.py > /dev/null + PYTHONPATH=../functionstream-api:../functionstream-api-advanced ../../$(PYTHON_EXEC) build.py > /dev/null @[ -f "$(WASM_SOURCE)" ] || (printf "$(C_R)[X] WASM Build Failed$(C_0)\n" && exit 1) dist: build @@ -223,9 +213,9 @@ docker: docker-run: $(call log,DOCKER,Starting Container) @docker run --rm -it \ - -p 8080:8080 \ - -v $(CURDIR)/logs:/app/logs \ - $(IMAGE_NAME) + -p 8080:8080 \ + -v $(CURDIR)/logs:/app/logs \ + $(IMAGE_NAME) docker-push: $(call log,DOCKER,Pushing $(IMAGE_NAME)) diff --git a/README-zh.md b/README-zh.md index 05fd5fc2..5d38c020 100644 --- a/README-zh.md +++ b/README-zh.md @@ -204,6 +204,7 @@ function-stream-/ | 文档 | 描述 | |------------------------------------------------------------------------|--------------------------| | [Streaming SQL 使用指南](docs/streaming-sql-guide-zh.md) | 声明式 SQL 实时流处理指南 | +| [Streaming SQL Source/Sink 
文档](docs/streaming-sql/README-zh.md) | Source/Sink 能力与参数速查 | | [连接器、格式与类型参考](docs/connectors-and-formats-zh.md) | 支持的 Source/Sink、格式与数据类型 | | [服务端配置与运维指南](docs/server-configuration-zh.md) | 服务端配置与运维操作 | | [Function 任务配置规范](docs/function-configuration-zh.md) | 任务定义规范 | diff --git a/docs/streaming-sql-guide-zh.md b/docs/streaming-sql-guide-zh.md index 5721971c..216bf557 100644 --- a/docs/streaming-sql-guide-zh.md +++ b/docs/streaming-sql-guide-zh.md @@ -90,6 +90,8 @@ flowchart LR | **水位线 (Watermark)** | `AS - INTERVAL ...` | 对**迟到、乱序**数据的容忍度;时间推进由水位线驱动,**过度迟到**的事件会被安全丢弃。 | > **完整参考**:支持的连接器、数据格式和 SQL 数据类型,请参阅 [连接器、格式与类型参考](connectors-and-formats-zh.md)。 +> +> **Source / Sink 专项说明**:请参阅 [Streaming SQL Connector 文档](streaming-sql/README-zh.md)(含 Kafka Source 与 filesystem/s3/delta/iceberg/lanceDB Sink)。 --- diff --git a/docs/streaming-sql/README-zh.md b/docs/streaming-sql/README-zh.md new file mode 100644 index 00000000..563588f1 --- /dev/null +++ b/docs/streaming-sql/README-zh.md @@ -0,0 +1,13 @@ +# Streaming SQL Connector 文档 + +本目录提供 Streaming SQL 的 Source / Sink 专项文档,推荐配合 `CREATE STREAMING TABLE ... AS SELECT ...` 使用。 + +## 目录 + +- [Source 文档](Source/README-zh.md) +- [Sink 文档](Sink/README-zh.md) + +## 使用建议 + +1. 先用 `CREATE TABLE ... WITH (...)` 注册 Source(当前仅 Kafka)。 +2. 再用 `CREATE STREAMING TABLE ... WITH (...) AS SELECT ...` 创建持续运行的 Pipeline 并写入 Sink。 diff --git a/docs/streaming-sql/README.md b/docs/streaming-sql/README.md new file mode 100644 index 00000000..8c3777cf --- /dev/null +++ b/docs/streaming-sql/README.md @@ -0,0 +1,14 @@ +# Streaming SQL Connector Docs + +This directory contains Source/Sink focused docs for Streaming SQL, intended to be used with `CREATE STREAMING TABLE ... AS SELECT ...`. + +## Index + +- [Source Docs](Source/README.md) +- [Sink Docs](Sink/README.md) + +## Recommended workflow + +1. Register sources using `CREATE TABLE ... WITH (...)` (currently Kafka source). +2. Build a continuous pipeline using `CREATE STREAMING TABLE ... WITH (...) 
AS SELECT ...` and write to sinks. + diff --git a/docs/streaming-sql/Sink/README-zh.md b/docs/streaming-sql/Sink/README-zh.md new file mode 100644 index 00000000..56e9af97 --- /dev/null +++ b/docs/streaming-sql/Sink/README-zh.md @@ -0,0 +1,31 @@ +# Streaming SQL Sink 文档 + +本目录聚焦 Streaming SQL 的下游写出能力(Sink)。 + +## 支持矩阵 + +| Connector | 支持格式 | + +|---|---| +| `kafka` | `json` / `raw_string` / `raw_bytes`(沿用 Kafka Sink 编码能力) | +| `filesystem` | `csv` / `parquet` / `json`(JSONL) / `avro` / `orc` | +| `s3` | `csv` / `parquet` | +| `delta` | `csv` / `parquet` / `json`(JSONL) / `avro` / `orc` | +| `iceberg` | `csv` / `parquet` | +| `lanceDB` | `lance` | + +## 文档列表 + +- [Kafka Sink](kafka-sink-zh.md) +- [Filesystem Sink](filesystem-sink-zh.md) +- [S3 Sink](s3-sink-zh.md) +- [Delta Sink](delta-sink-zh.md) +- [Iceberg Sink](iceberg-sink-zh.md) +- [LanceDB Sink](lancedb-sink-zh.md) + +## 通用约定 + +- 在 `CREATE STREAMING TABLE ... WITH (...) AS SELECT ...` 中通过 `WITH` 指定 `connector` 与 `format`。 +- Sink 场景建议显式指定 `type='sink'`。 +- 仅 `lanceDB` connector 允许 `format='lance'`;其余 Sink connector 不支持 `lance`。 +- `format='json'` 的文件类 Sink 输出为 JSON Lines(NDJSON,`.jsonl`)。 diff --git a/docs/streaming-sql/Sink/README.md b/docs/streaming-sql/Sink/README.md new file mode 100644 index 00000000..63d73458 --- /dev/null +++ b/docs/streaming-sql/Sink/README.md @@ -0,0 +1,31 @@ +# Streaming SQL Sink Docs + +This directory documents sink connectors for Streaming SQL. 
+ +## Support matrix + +| Connector | Supported formats | +|---|---| +| `kafka` | `json` / `raw_string` / `raw_bytes` | +| `filesystem` | `csv` / `parquet` / `json`(JSONL) / `avro` / `orc` | +| `s3` | `csv` / `parquet` | +| `delta` | `csv` / `parquet` / `json`(JSONL) / `avro` / `orc` | +| `iceberg` | `csv` / `parquet` | +| `lanceDB` | `lance` | + +## Documents + +- [Kafka Sink](kafka-sink.md) +- [Filesystem Sink](filesystem-sink.md) +- [S3 Sink](s3-sink.md) +- [Delta Sink](delta-sink.md) +- [Iceberg Sink](iceberg-sink.md) +- [LanceDB Sink](lancedb-sink.md) + +## Notes + +- Configure sink connectors via `WITH (...)` in `CREATE STREAMING TABLE ... AS SELECT ...`. +- Use `type='sink'` explicitly for sink tables. +- Only `lanceDB` accepts `format='lance'`. +- For file-like sinks, `format='json'` is written as JSON Lines (NDJSON, `.jsonl`). + diff --git a/docs/streaming-sql/Sink/delta-sink-zh.md b/docs/streaming-sql/Sink/delta-sink-zh.md new file mode 100644 index 00000000..4e973b2c --- /dev/null +++ b/docs/streaming-sql/Sink/delta-sink-zh.md @@ -0,0 +1,33 @@ +# Delta Sink + +`delta` connector 对应 Delta 数据湖写出通道。 + +## 支持格式 + +- `csv` +- `parquet` +- `json`(写出为 JSON Lines / NDJSON,文件后缀 `.jsonl`) +- `avro` +- `orc` + +## 常用 WITH 参数 + +- `connector='delta'` +- `type='sink'` +- `format='csv'|'parquet'|'json'|'avro'|'orc'` +- `path='/data/delta/orders'`(本地)或对象存储前缀 +- 可选 S3 参数:`s3.bucket` / `s3.region` / `s3.endpoint` / AKSK +- `parquet.compression`(仅 `parquet`) + +## 示例(CREATE STREAMING TABLE) + +```sql +CREATE STREAMING TABLE st_delta_parquet +WITH ( + connector='delta', + type='sink', + format='parquet', + path='/tmp/delta_orders' +) AS +SELECT * FROM src_kafka_orders; +``` diff --git a/docs/streaming-sql/Sink/delta-sink.md b/docs/streaming-sql/Sink/delta-sink.md new file mode 100644 index 00000000..3d3bbd6c --- /dev/null +++ b/docs/streaming-sql/Sink/delta-sink.md @@ -0,0 +1,21 @@ +# Delta Sink + +`delta` sink is the Delta Lake write path. 
+ +## Supported formats + +- `csv` +- `parquet` +- `json` (written as JSON Lines / NDJSON, `.jsonl`) +- `avro` +- `orc` + +## Common `WITH` options + +- `connector='delta'` +- `type='sink'` +- `format='csv'|'parquet'|'json'|'avro'|'orc'` +- `path='/data/delta/orders'` (local) or object-store prefix +- Optional S3 options: `s3.bucket` / `s3.region` / `s3.endpoint` / access keys +- `parquet.compression` (only for `parquet`) + diff --git a/docs/streaming-sql/Sink/filesystem-sink-zh.md b/docs/streaming-sql/Sink/filesystem-sink-zh.md new file mode 100644 index 00000000..e60f4bf2 --- /dev/null +++ b/docs/streaming-sql/Sink/filesystem-sink-zh.md @@ -0,0 +1,33 @@ +# Filesystem Sink + +`filesystem` 用于将流数据落到本地文件系统目录。 + +## 支持格式 + +- `csv` +- `parquet` +- `json`(写出为 JSON Lines / NDJSON,文件后缀 `.jsonl`) +- `avro` +- `orc` + +## 常用 WITH 参数 + +- `connector='filesystem'` +- `type='sink'` +- `format='csv'|'parquet'|'json'|'avro'|'orc'` +- `path='/path/to/output'`(或 `sink.path`) +- `parquet.compression`(仅 `parquet` 生效) + +## 示例(CREATE STREAMING TABLE) + +```sql +CREATE STREAMING TABLE st_fs_parquet +WITH ( + connector='filesystem', + type='sink', + format='parquet', + path='/tmp/fs_orders', + 'parquet.compression'='zstd' +) AS +SELECT * FROM src_kafka_orders; +``` diff --git a/docs/streaming-sql/Sink/filesystem-sink.md b/docs/streaming-sql/Sink/filesystem-sink.md new file mode 100644 index 00000000..3eb825c0 --- /dev/null +++ b/docs/streaming-sql/Sink/filesystem-sink.md @@ -0,0 +1,20 @@ +# Filesystem Sink + +`filesystem` sink writes streaming data to local files. 
+ +## Supported formats + +- `csv` +- `parquet` +- `json` (written as JSON Lines / NDJSON, `.jsonl`) +- `avro` +- `orc` + +## Common `WITH` options + +- `connector='filesystem'` +- `type='sink'` +- `format='csv'|'parquet'|'json'|'avro'|'orc'` +- `path='/path/to/output'` (or `sink.path`) +- `parquet.compression` (only for `parquet`) + diff --git a/docs/streaming-sql/Sink/iceberg-sink-zh.md b/docs/streaming-sql/Sink/iceberg-sink-zh.md new file mode 100644 index 00000000..09667059 --- /dev/null +++ b/docs/streaming-sql/Sink/iceberg-sink-zh.md @@ -0,0 +1,31 @@ +# Iceberg Sink + +`iceberg` connector 对应 Iceberg 数据湖写出通道。 + +## 支持格式 + +- `csv` +- `parquet` + +## 常用 WITH 参数 + +- `connector='iceberg'` +- `type='sink'` +- `format='csv'|'parquet'` +- `path='/data/iceberg/orders'` 或对象存储前缀 +- 可选 S3 参数:`s3.bucket` / `s3.region` / `s3.endpoint` / AKSK +- `parquet.compression`(仅 `parquet`) + +## 示例(CREATE STREAMING TABLE) + +```sql +CREATE STREAMING TABLE st_iceberg_parquet +WITH ( + connector='iceberg', + type='sink', + format='parquet', + path='/tmp/iceberg_orders', + 'parquet.compression'='zstd' +) AS +SELECT * FROM src_kafka_orders; +``` diff --git a/docs/streaming-sql/Sink/iceberg-sink.md b/docs/streaming-sql/Sink/iceberg-sink.md new file mode 100644 index 00000000..0cafe549 --- /dev/null +++ b/docs/streaming-sql/Sink/iceberg-sink.md @@ -0,0 +1,18 @@ +# Iceberg Sink + +`iceberg` sink is the Iceberg Lakehouse write path. 
+ +## Supported formats + +- `csv` +- `parquet` + +## Common `WITH` options + +- `connector='iceberg'` +- `type='sink'` +- `format='csv'|'parquet'` +- `path='/data/iceberg/orders'` or object-store prefix +- Optional S3 options: `s3.bucket` / `s3.region` / `s3.endpoint` / access keys +- `parquet.compression` (only for `parquet`) + diff --git a/docs/streaming-sql/Sink/kafka-sink-zh.md b/docs/streaming-sql/Sink/kafka-sink-zh.md new file mode 100644 index 00000000..e6d3ecbd --- /dev/null +++ b/docs/streaming-sql/Sink/kafka-sink-zh.md @@ -0,0 +1,32 @@ +# Kafka Sink + +`kafka` sink 用于将流数据写入 Kafka topic。 + +## 支持格式 + +- `json` +- `raw_string` +- `raw_bytes` + +## 常用 WITH 参数 + +- `connector='kafka'` +- `type='sink'` +- `topic='topic_name'` +- `bootstrap.servers='host:9092'` +- `format='json'|'raw_string'|'raw_bytes'` +- `sink.commit.mode='at-least-once'|'exactly-once'`(可选) + +## 示例(CREATE STREAMING TABLE) + +```sql +CREATE STREAMING TABLE st_kafka_json +WITH ( + connector='kafka', + type='sink', + topic='topic_out', + 'bootstrap.servers'='127.0.0.1:9092', + format='json' +) AS +SELECT * FROM src_kafka_orders; +``` diff --git a/docs/streaming-sql/Sink/kafka-sink.md b/docs/streaming-sql/Sink/kafka-sink.md new file mode 100644 index 00000000..43fbdb72 --- /dev/null +++ b/docs/streaming-sql/Sink/kafka-sink.md @@ -0,0 +1,33 @@ +# Kafka Sink + +`kafka` sink writes streaming output records to Kafka topics. 
+ +## Supported formats + +- `json` +- `raw_string` +- `raw_bytes` + +## Common `WITH` options + +- `connector='kafka'` +- `type='sink'` +- `topic='topic_name'` +- `bootstrap.servers='host:9092'` +- `format='json'|'raw_string'|'raw_bytes'` +- `sink.commit.mode='at-least-once'|'exactly-once'` (optional) + +## Example + +```sql +CREATE STREAMING TABLE st_kafka_json +WITH ( + connector='kafka', + type='sink', + topic='topic_out', + 'bootstrap.servers'='127.0.0.1:9092', + format='json' +) AS +SELECT * FROM src_kafka_orders; +``` + diff --git a/docs/streaming-sql/Sink/lancedb-sink-zh.md b/docs/streaming-sql/Sink/lancedb-sink-zh.md new file mode 100644 index 00000000..f2dc4d0d --- /dev/null +++ b/docs/streaming-sql/Sink/lancedb-sink-zh.md @@ -0,0 +1,28 @@ +# LanceDB Sink + +`lanceDB` 是 Lance 数据集专用写出 connector。 + +## 支持格式 + +- 仅支持 `lance` + +## 常用 WITH 参数 + +- `connector='lanceDB'` +- `type='sink'` +- `format='lance'` +- `path='/data/lance/orders'`(本地目录)或对象存储前缀 +- 可选 S3 参数:`s3.bucket` / `s3.region` / `s3.endpoint` / AKSK + +## 示例(CREATE STREAMING TABLE) + +```sql +CREATE STREAMING TABLE st_lancedb +WITH ( + connector='lanceDB', + type='sink', + format='lance', + path='/tmp/lance_orders' +) AS +SELECT * FROM src_kafka_orders; +``` diff --git a/docs/streaming-sql/Sink/lancedb-sink.md b/docs/streaming-sql/Sink/lancedb-sink.md new file mode 100644 index 00000000..d1e7fa74 --- /dev/null +++ b/docs/streaming-sql/Sink/lancedb-sink.md @@ -0,0 +1,16 @@ +# LanceDB Sink + +`lanceDB` is a dedicated sink connector for Lance datasets. 
+ +## Supported formats + +- `lance` only + +## Common `WITH` options + +- `connector='lanceDB'` +- `type='sink'` +- `format='lance'` +- `path='/data/lance/orders'` (local directory) or object-store prefix +- Optional S3 options: `s3.bucket` / `s3.region` / `s3.endpoint` / access keys + diff --git a/docs/streaming-sql/Sink/s3-sink-zh.md b/docs/streaming-sql/Sink/s3-sink-zh.md new file mode 100644 index 00000000..34bdfc2f --- /dev/null +++ b/docs/streaming-sql/Sink/s3-sink-zh.md @@ -0,0 +1,35 @@ +# S3 Sink + +`s3` 用于将流数据写入对象存储(AWS S3 / S3 兼容存储)。 + +## 支持格式 + +- `csv` +- `parquet` + +## 常用 WITH 参数 + +- `connector='s3'` +- `type='sink'` +- `format='csv'|'parquet'` +- `path='prefix/path'` +- `s3.bucket='your-bucket'` +- `s3.region='us-east-1'` +- `s3.endpoint='http://minio:9000'`(可选,S3 兼容) +- `s3.access_key_id` / `s3.secret_access_key` / `s3.session_token`(可选) +- `parquet.compression`(仅 `parquet`) + +## 示例(CREATE STREAMING TABLE) + +```sql +CREATE STREAMING TABLE st_s3_csv +WITH ( + connector='s3', + type='sink', + format='csv', + path='streaming/orders', + 's3.bucket'='fs-dev', + 's3.region'='us-east-1' +) AS +SELECT * FROM src_kafka_orders; +``` diff --git a/docs/streaming-sql/Sink/s3-sink.md b/docs/streaming-sql/Sink/s3-sink.md new file mode 100644 index 00000000..c4b8f062 --- /dev/null +++ b/docs/streaming-sql/Sink/s3-sink.md @@ -0,0 +1,21 @@ +# S3 Sink + +`s3` sink writes streaming data to object storage (AWS S3 or S3-compatible services). 
+ +## Supported formats + +- `csv` +- `parquet` + +## Common `WITH` options + +- `connector='s3'` +- `type='sink'` +- `format='csv'|'parquet'` +- `path='prefix/path'` +- `s3.bucket='your-bucket'` +- `s3.region='us-east-1'` +- `s3.endpoint='http://minio:9000'` (optional, for S3-compatible storage) +- `s3.access_key_id` / `s3.secret_access_key` / `s3.session_token` (optional) +- `parquet.compression` (only for `parquet`) + diff --git a/docs/streaming-sql/Source/README-zh.md b/docs/streaming-sql/Source/README-zh.md new file mode 100644 index 00000000..e38a763f --- /dev/null +++ b/docs/streaming-sql/Source/README-zh.md @@ -0,0 +1,7 @@ +# Streaming SQL Source 文档 + +当前 Streaming SQL Source 仅支持 Kafka。通常先创建 Source 表,再被 `CREATE STREAMING TABLE ... AS SELECT ...` 引用。 + +## 文档列表 + +- [Kafka Source](kafka-source-zh.md) diff --git a/docs/streaming-sql/Source/README.md b/docs/streaming-sql/Source/README.md new file mode 100644 index 00000000..a84d3437 --- /dev/null +++ b/docs/streaming-sql/Source/README.md @@ -0,0 +1,6 @@ +# Streaming SQL Source Docs + +## Supported sources + +- [Kafka Source](kafka-source.md) + diff --git a/docs/streaming-sql/Source/kafka-source-zh.md b/docs/streaming-sql/Source/kafka-source-zh.md new file mode 100644 index 00000000..e3da7597 --- /dev/null +++ b/docs/streaming-sql/Source/kafka-source-zh.md @@ -0,0 +1,36 @@ +# Kafka Source + +`kafka` source 用于从 Kafka topic 持续消费数据。 + +## 支持格式 + +- `json` +- `raw_string` +- `raw_bytes` + +## 常用 WITH 参数 + +- `connector='kafka'` +- `type='source'`(默认) +- `topic='topic_in'` +- `bootstrap.servers='host:9092'` +- `group.id='consumer_group'` +- `format='json'|'raw_string'|'raw_bytes'` +- `scan.startup.mode='earliest'|'latest'|'group-offsets'`(可选) + +## 示例 + +```sql +CREATE TABLE src_kafka_json ( + user_id BIGINT, + event STRING, + ts TIMESTAMP +) WITH ( + connector='kafka', + type='source', + topic='topic_in', + 'bootstrap.servers'='127.0.0.1:9092', + 'group.id'='fs_demo', + format='json' +); +``` diff --git 
a/docs/streaming-sql/Source/kafka-source.md b/docs/streaming-sql/Source/kafka-source.md new file mode 100644 index 00000000..505a6f3e --- /dev/null +++ b/docs/streaming-sql/Source/kafka-source.md @@ -0,0 +1,19 @@ +# Kafka Source + +`kafka` source is used to ingest streaming records from Kafka topics. + +## Supported formats + +- `json` +- `raw_string` +- `raw_bytes` + +## Common `WITH` options + +- `connector='kafka'` +- `topic='topic_name'` +- `bootstrap.servers='host:9092'` +- `format='json'|'raw_string'|'raw_bytes'` +- `scan.startup.mode='earliest'|'latest'` (optional) +- `group.id='consumer_group'` (optional) + diff --git a/protocol/proto/function_stream_graph.proto b/protocol/proto/function_stream_graph.proto index 48b68a2f..18903e4f 100644 --- a/protocol/proto/function_stream_graph.proto +++ b/protocol/proto/function_stream_graph.proto @@ -70,7 +70,11 @@ message ConnectorOp { oneof config { KafkaSourceConfig kafka_source = 6; KafkaSinkConfig kafka_sink = 7; - GenericConnectorConfig generic = 8; + FilesystemSinkConfig filesystem_sink = 9; + DeltaSinkConfig delta_sink = 10; + IcebergSinkConfig iceberg_sink = 11; + S3SinkConfig s3_sink = 12; + LanceDbSinkConfig lancedb_sink = 13; } } @@ -103,9 +107,55 @@ message KafkaSinkConfig { optional string value_subject = 9; } -// Fallback for non-Kafka connectors that are not yet strongly typed. 
-message GenericConnectorConfig { - map properties = 1; +message FilesystemSinkConfig { + string path = 1; + SinkFormatProto format = 2; + optional ParquetCompressionProto parquet_compression = 3; + map extra_properties = 100; + map runtime_properties = 101; +} + +message DeltaSinkConfig { + string path = 1; + SinkFormatProto format = 2; + optional ParquetCompressionProto parquet_compression = 3; + map extra_properties = 100; + map runtime_properties = 101; +} + +message IcebergSinkConfig { + string path = 1; + SinkFormatProto format = 2; + optional ParquetCompressionProto parquet_compression = 3; + map extra_properties = 100; + map runtime_properties = 101; +} + +message S3SinkConfig { + string path = 1; + SinkFormatProto format = 2; + string bucket = 3; + string region = 4; + optional string endpoint = 5; + optional string access_key_id = 6; + optional string secret_access_key = 7; + optional string session_token = 8; + optional ParquetCompressionProto parquet_compression = 9; + map extra_properties = 100; + map runtime_properties = 101; +} + +message LanceDbSinkConfig { + string path = 1; + SinkFormatProto format = 2; + optional string s3_bucket = 3; + optional string s3_region = 4; + optional string s3_endpoint = 5; + optional string s3_access_key_id = 6; + optional string s3_secret_access_key = 7; + optional string s3_session_token = 8; + map extra_properties = 100; + map runtime_properties = 101; } // ─────────────────────── Kafka Auth ─────────────────────── @@ -189,6 +239,26 @@ enum KafkaSinkCommitMode { KAFKA_SINK_EXACTLY_ONCE = 1; } +enum SinkFormatProto { + SINK_FORMAT_UNSPECIFIED = 0; + SINK_FORMAT_CSV = 1; + SINK_FORMAT_JSONL = 2; + SINK_FORMAT_AVRO = 3; + SINK_FORMAT_PARQUET = 4; + SINK_FORMAT_ORC = 5; + SINK_FORMAT_LANCE = 6; +} + +enum ParquetCompressionProto { + PARQUET_COMPRESSION_UNSPECIFIED = 0; + PARQUET_COMPRESSION_UNCOMPRESSED = 1; + PARQUET_COMPRESSION_SNAPPY = 2; + PARQUET_COMPRESSION_GZIP = 3; + PARQUET_COMPRESSION_ZSTD = 4; + 
PARQUET_COMPRESSION_LZ4 = 5; + PARQUET_COMPRESSION_LZ4_RAW = 6; +} + message ValuePlanOperator { string name = 1; bytes physical_plan = 2; diff --git a/protocol/proto/storage.proto b/protocol/proto/storage.proto index fd021727..719d0f63 100644 --- a/protocol/proto/storage.proto +++ b/protocol/proto/storage.proto @@ -58,9 +58,12 @@ message KafkaSourceSubtaskCheckpoint { repeated KafkaPartitionOffset partitions = 4; } -// Generic source checkpoint payload envelope (enum-like via oneof). -message SourceCheckpointPayload { - oneof checkpoint { +// Aggregated, epoch-aligned per-source checkpoint info persisted by the coordinator +// (one entry per source subtask). The envelope decouples the catalog/meta-store from +// any specific source implementation so new source types can be added by registering a +// new `CheckpointAggregator` in the runtime without touching storage schema layouts. +message SourceCheckpointInfo { + oneof info { KafkaSourceSubtaskCheckpoint kafka = 1; } } @@ -81,10 +84,10 @@ message StreamingTableDefinition { // Updated by JobManager after all operators ACK. Used for crash recovery. uint64 latest_checkpoint_epoch = 6; - // Kafka source per-subtask offsets at the same committed epoch as `latest_checkpoint_epoch`. - // Populated by the runtime coordinator from source checkpoint ACKs. Optional `.bin` files under - // the job state dir may exist only for local recovery materialization from this field. - repeated KafkaSourceSubtaskCheckpoint kafka_source_checkpoints = 7; + // Source-agnostic per-subtask checkpoint entries aligned with `latest_checkpoint_epoch`. + // Populated by the runtime CheckpointCoordinator via the configured aggregator registry; + // storage no longer knows the internal layout of any particular source type. 
+ repeated SourceCheckpointInfo source_checkpoints = 7; } // ============================================================================= diff --git a/src/config/global_config.rs b/src/config/global_config.rs index dcfbcf5c..3c6eab67 100644 --- a/src/config/global_config.rs +++ b/src/config/global_config.rs @@ -25,6 +25,8 @@ pub const DEFAULT_STREAMING_RUNTIME_MEMORY_BYTES: u64 = 10 * 1024 * 1024; /// Default for [`StreamingConfig::operator_state_store_memory_bytes`] when unset. **5 MiB** per stateful operator cap. pub const DEFAULT_OPERATOR_STATE_STORE_MEMORY_BYTES: u64 = 5 * 1024 * 1024; +/// Default sink in-memory buffer reservation. **64 MiB**. +pub const DEFAULT_SINK_BUFFER_MEMORY_BYTES: u64 = 64 * 1024 * 1024; #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct StreamingConfig { diff --git a/src/coordinator/dataset/show_catalog_tables_result.rs b/src/coordinator/dataset/show_catalog_tables_result.rs index 9811ff82..87c5c3ee 100644 --- a/src/coordinator/dataset/show_catalog_tables_result.rs +++ b/src/coordinator/dataset/show_catalog_tables_result.rs @@ -17,7 +17,8 @@ use arrow_schema::{DataType, Field, Schema}; use datafusion::arrow::datatypes::Schema as DfSchema; use super::DataSet; -use crate::sql::schema::table::Table as CatalogTable; +use crate::sql::schema::catalog::ExternalTable; +use crate::sql::schema::table::CatalogEntity; use crate::sql::schema::{catalog_table_row_detail, schema_columns_one_line}; #[derive(Clone, Debug)] @@ -30,7 +31,7 @@ pub struct ShowCatalogTablesResult { } impl ShowCatalogTablesResult { - pub fn from_tables(tables: &[Arc]) -> Self { + pub fn from_tables(tables: &[Arc]) -> Self { let mut names = Vec::with_capacity(tables.len()); let mut kinds = Vec::with_capacity(tables.len()); let mut column_counts = Vec::with_capacity(tables.len()); @@ -39,18 +40,19 @@ impl ShowCatalogTablesResult { for t in tables { let schema = match t.as_ref() { - CatalogTable::ConnectorTable(source) | CatalogTable::LookupTable(source) => { - 
source.produce_physical_schema() - } - CatalogTable::TableFromQuery { .. } => DfSchema::new(t.get_fields()), + CatalogEntity::ExternalConnector(b) => b.as_ref().produce_physical_schema(), + CatalogEntity::ComputedTable { .. } => DfSchema::new(t.get_fields()), }; let ncols = schema.fields().len() as i32; names.push(t.name().to_string()); kinds.push( match t.as_ref() { - CatalogTable::ConnectorTable(_) => "SOURCE", - CatalogTable::LookupTable(_) => "LOOKUP", - CatalogTable::TableFromQuery { .. } => "QUERY", + CatalogEntity::ExternalConnector(b) => match b.as_ref() { + ExternalTable::Source(_) => "SOURCE", + ExternalTable::Sink(_) => "SINK", + ExternalTable::Lookup(_) => "LOOKUP", + }, + CatalogEntity::ComputedTable { .. } => "QUERY", } .to_string(), ); diff --git a/src/coordinator/execution/executor.rs b/src/coordinator/execution/executor.rs index 6fb03134..22c5c1b7 100644 --- a/src/coordinator/execution/executor.rs +++ b/src/coordinator/execution/executor.rs @@ -21,9 +21,9 @@ use crate::coordinator::dataset::{ ShowFunctionsResult, ShowStreamingTablesResult, empty_record_batch, }; use crate::coordinator::plan::{ - CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, CreateTablePlanBody, - DropFunctionPlan, DropStreamingTablePlan, DropTablePlan, LookupTablePlan, PlanNode, - PlanVisitor, PlanVisitorContext, PlanVisitorResult, ShowCatalogTablesPlan, + CompileErrorPlan, CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, + CreateTablePlanBody, DropFunctionPlan, DropStreamingTablePlan, DropTablePlan, LookupTablePlan, + PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult, ShowCatalogTablesPlan, ShowCreateStreamingTablePlan, ShowCreateTablePlan, ShowFunctionsPlan, ShowStreamingTablesPlan, StartFunctionPlan, StopFunctionPlan, StreamingTable, StreamingTableConnectorPlan, }; @@ -34,8 +34,9 @@ use crate::coordinator::streaming_table_options::{ use crate::runtime::streaming::job::JobManager; use 
crate::runtime::streaming::protocol::control::StopMode; use crate::runtime::wasm::taskexecutor::TaskManager; +use crate::sql::schema::catalog::ExternalTable; use crate::sql::schema::show_create_catalog_table; -use crate::sql::schema::table::Table as CatalogTable; +use crate::sql::schema::table::CatalogEntity; use crate::storage::stream_catalog::CatalogManager; #[derive(Error, Debug)] @@ -89,6 +90,14 @@ impl Executor { } impl PlanVisitor for Executor { + fn visit_compile_error_plan( + &self, + plan: &CompileErrorPlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + PlanVisitorResult::Execute(Err(ExecuteError::Validation(plan.message.clone()))) + } + fn visit_create_function( &self, plan: &CreateFunctionPlan, @@ -275,13 +284,17 @@ impl PlanVisitor for Executor { ) -> PlanVisitorResult { let execute = || -> Result { let (table_name, if_not_exists, catalog_table) = match &plan.body { - CreateTablePlanBody::ConnectorSource { - source_table, + CreateTablePlanBody::External { + table, if_not_exists, } => { - let table_name = source_table.name().to_string(); - let table_instance = - CatalogTable::ConnectorTable(source_table.as_ref().clone()); + if matches!(table.as_ref(), ExternalTable::Sink(_)) { + return Err(ExecuteError::Internal( + "`CREATE TABLE` cannot produce a Sink; use `CREATE STREAMING TABLE ... 
AS SELECT`".into(), + )); + } + let table_name = table.name().to_string(); + let table_instance = CatalogEntity::external(table.as_ref().clone()); (table_name, *if_not_exists, table_instance) } CreateTablePlanBody::DataFusion(_) => { @@ -302,13 +315,12 @@ impl PlanVisitor for Executor { .add_catalog_table(catalog_table) .map_err(|e| { ExecuteError::Internal(format!( - "Failed to register connector source table '{}': {}", - table_name, e + "Failed to register external table '{table_name}': {e}" )) })?; Ok(ExecuteResult::ok(format!( - "Created connector source table '{table_name}'" + "Created external table '{table_name}'" ))) }; @@ -356,6 +368,7 @@ impl PlanVisitor for Executor { fs_program, custom_interval, None, + vec![], )) }) .map_err(|e| ExecuteError::Internal(format!("Failed to submit streaming job: {e}")))?; diff --git a/src/coordinator/plan/ast_utils.rs b/src/coordinator/plan/ast_utils.rs new file mode 100644 index 00000000..d2432888 --- /dev/null +++ b/src/coordinator/plan/ast_utils.rs @@ -0,0 +1,91 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Pure AST inspection helpers. +//! +//! This module is deliberately stateless: it only knows how to pull data out +//! of `sqlparser` AST nodes. Nothing here touches `StreamSchemaProvider`, +//! connectors, Builders, or logical plans. Put any piece of code that would +//! be "fine" to unit test on a naked AST here — and only here. 
+ +use datafusion::common::{Result, plan_err}; +use datafusion::sql::sqlparser::ast::{Expr as SqlExpr, SqlOption, TableConstraint}; + +use crate::sql::common::with_option_keys as opt; + +/// Namespace for AST extraction helpers. +pub struct AstUtils; + +impl AstUtils { + /// Extract the single PRIMARY KEY column list from the constraint + /// clauses of a `CREATE TABLE`. Rejects multiple PRIMARY KEY declarations. + pub fn parse_primary_keys(constraints: &[TableConstraint]) -> Result> { + let mut keys = None; + for constraint in constraints { + if let TableConstraint::PrimaryKey { columns, .. } = constraint { + if keys.is_some() { + return plan_err!( + "Constraint Violation: Multiple PRIMARY KEY constraints are forbidden" + ); + } + keys = Some(columns.iter().map(|ident| ident.value.clone()).collect()); + } + } + Ok(keys.unwrap_or_default()) + } + + /// Extract the (at most one) `WATERMARK FOR col [AS expr]` clause from + /// the constraint list. The resulting tuple is `(column_name, opt_expr)` + /// so the caller can decide whether it is legal in its context. + pub fn parse_watermark_strategy( + constraints: &[TableConstraint], + ) -> Result)>> { + let mut strategy = None; + for constraint in constraints { + if let TableConstraint::Watermark { + column_name, + watermark_expr, + } = constraint + { + if strategy.is_some() { + return plan_err!( + "Constraint Violation: Only a single WATERMARK FOR clause is permitted" + ); + } + strategy = Some((column_name.value.clone(), watermark_expr.clone())); + } + } + Ok(strategy) + } + + /// True iff the WITH clause declares a `connector=` property. Used by + /// the router to decide whether to hand off to the external-table + /// compiler. + pub fn contains_connector_property(options: &[SqlOption]) -> bool { + options.iter().any(|o| match o { + SqlOption::KeyValue { key, .. 
} => key.value.eq_ignore_ascii_case(opt::CONNECTOR), + _ => false, + }) + } + + /// Peek at the declared `type` in the WITH clause without consuming it, + /// returning its lowercased value. Used by the router to split the + /// source- and sink-compile paths before either Builder runs. + pub fn peek_table_role(with_options: &[SqlOption]) -> Option { + with_options.iter().find_map(|o| match o { + SqlOption::KeyValue { key, value } if key.value.eq_ignore_ascii_case(opt::TYPE) => { + Some(value.to_string().trim_matches('\'').to_ascii_lowercase()) + } + _ => None, + }) + } +} diff --git a/src/sql/schema/table_execution_unit.rs b/src/coordinator/plan/compile_error_plan.rs similarity index 54% rename from src/sql/schema/table_execution_unit.rs rename to src/coordinator/plan/compile_error_plan.rs index c23dda7a..52b6e481 100644 --- a/src/sql/schema/table_execution_unit.rs +++ b/src/coordinator/plan/compile_error_plan.rs @@ -10,24 +10,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use super::temporal_pipeline_config::TemporalPipelineConfig; +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; -#[derive(Debug, Clone)] -pub struct EngineDescriptor { - pub engine_type: String, - pub raw_payload: String, +#[derive(Debug)] +pub struct CompileErrorPlan { + pub message: String, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum SyncMode { - AppendOnly, - Incremental, +impl CompileErrorPlan { + pub fn new(message: String) -> Self { + Self { message } + } } -#[derive(Debug, Clone)] -pub struct TableExecutionUnit { - pub label: String, - pub engine_meta: EngineDescriptor, - pub sync_mode: SyncMode, - pub temporal_offset: TemporalPipelineConfig, +impl PlanNode for CompileErrorPlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_compile_error_plan(self, context) + } } diff --git a/src/coordinator/plan/create_table_plan.rs b/src/coordinator/plan/create_table_plan.rs index 11ae14a4..7bb47a53 100644 --- a/src/coordinator/plan/create_table_plan.rs +++ b/src/coordinator/plan/create_table_plan.rs @@ -12,16 +12,16 @@ use datafusion::logical_expr::LogicalPlan; -use crate::sql::schema::SourceTable; +use crate::sql::schema::ExternalTable; use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; -/// Payload for [`CreateTablePlan`]: either a DataFusion DDL plan or a connector `CREATE TABLE` (no `AS SELECT`). +/// DataFusion DDL, or `CREATE TABLE` with connector → [`ExternalTable`]. #[derive(Debug, Clone)] pub enum CreateTablePlanBody { DataFusion(Box), - ConnectorSource { - source_table: Box, + External { + table: Box, if_not_exists: bool, }, } @@ -38,10 +38,11 @@ impl CreateTablePlan { } } - pub fn connector_source(source_table: SourceTable, if_not_exists: bool) -> Self { + /// [`ExternalTable`] from the DDL compiler. 
+ pub fn external_table(table: ExternalTable, if_not_exists: bool) -> Self { Self { - body: CreateTablePlanBody::ConnectorSource { - source_table: Box::new(source_table), + body: CreateTablePlanBody::External { + table: Box::new(table), if_not_exists, }, } diff --git a/src/coordinator/plan/ddl_compiler.rs b/src/coordinator/plan/ddl_compiler.rs new file mode 100644 index 00000000..5c5af39e --- /dev/null +++ b/src/coordinator/plan/ddl_compiler.rs @@ -0,0 +1,397 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Compiles `CREATE TABLE ... WITH (...)` into Source or Lookup tables. 
+ +use std::time::Duration; + +use datafusion::arrow::datatypes::{DataType, Field, Schema}; +use datafusion::common::{DFSchema, Result, plan_datafusion_err, plan_err}; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::Expr; +use datafusion::sql::TableReference; +use datafusion::sql::planner::{PlannerContext, SqlToRel}; +use datafusion::sql::sqlparser::ast; +use datafusion::sql::sqlparser::ast::CreateTable as SqlCreateTable; +use datafusion_expr::ExprSchemable; +use tracing::warn; + +use super::ast_utils::AstUtils; +use crate::coordinator::tool::ConnectorOptions; +use crate::sql::analysis::StreamSchemaProvider; +use crate::sql::common::constants::{connection_table_role, connector_type, sql_field}; +use crate::sql::common::with_option_keys as opt; +use crate::sql::common::{BadData, Format, Framing, JsonCompression, JsonFormat}; +use crate::sql::connector::registry::REGISTRY; +use crate::sql::schema::ColumnDescriptor; +use crate::sql::schema::catalog::{ExternalTable, LookupTable, SourceTable}; +use crate::sql::schema::data_encoding_format::DataEncodingFormat; +use crate::sql::schema::table_role::{apply_adapter_specific_rules, validate_adapter_availability}; +use crate::sql::schema::temporal_pipeline_config::TemporalPipelineConfig; + +pub struct DdlCompiler<'a> { + schema_provider: &'a StreamSchemaProvider, +} + +impl<'a> DdlCompiler<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { schema_provider } + } + + pub fn compile( + &self, + stmt: &SqlCreateTable, + declared_role: Option<&str>, + ) -> Result { + Self::assert_ddl_flags(stmt)?; + + match declared_role { + Some(connection_table_role::LOOKUP) => self.compile_lookup(stmt), + Some(connection_table_role::SOURCE) | None => self.compile_source(stmt), + Some(connection_table_role::SINK) => plan_err!( + "`CREATE TABLE ... WITH (type='sink')` is not supported; use `CREATE STREAMING TABLE ... 
AS SELECT`" + ), + Some(other) => { + plan_err!("Invalid connection type '{other}' — expected 'source' or 'lookup'") + } + } + } + + fn compile_source(&self, stmt: &SqlCreateTable) -> Result { + let target_name = stmt.name.to_string(); + let description = stmt + .comment + .clone() + .map(|c| c.to_string()) + .unwrap_or_default(); + + let mut columns = self.extract_columns(stmt)?; + + let mut options = ConnectorOptions::new(&stmt.with_options, &None)?; + let adapter_type = Self::extract_adapter(&mut options)?; + Self::assert_connector_match(&mut options, &adapter_type)?; + Self::absorb_type_option(&mut options, connection_table_role::SOURCE)?; + + validate_adapter_availability(&adapter_type)?; + + let pk_constraints = AstUtils::parse_primary_keys(&stmt.constraints)?; + let catalog_with_options = options.snapshot_for_catalog(); + + let format = Format::from_opts(&mut options)?; + Self::assert_format_compatibility(&format, &adapter_type)?; + let _framing = Framing::from_opts(&mut options)?; + let bad_data = BadData::from_opts(&mut options)?; + + let encoding = DataEncodingFormat::from_format(format.as_ref()); + + columns = apply_adapter_specific_rules(&adapter_type, columns); + columns = encoding.apply_envelope(columns)?; + + if encoding.supports_delta_updates() && pk_constraints.is_empty() { + return plan_err!("CDC source requires at least one PRIMARY KEY field"); + } + + let watermark = AstUtils::parse_watermark_strategy(&stmt.constraints)?; + let mut temporal_config = resolve_source_watermark( + &target_name, + &mut columns, + watermark, + &mut options, + self.schema_provider, + )?; + + let idle_from_micros = options + .pull_opt_i64(opt::IDLE_MICROS)? 
+ .filter(|t| *t > 0) + .map(|t| Duration::from_micros(t as u64)); + let idle_from_duration = options.pull_opt_duration(opt::IDLE_TIME)?; + temporal_config.liveness_timeout = idle_from_micros.or(idle_from_duration); + + let provider = REGISTRY.get_source(&adapter_type)?; + let connector_config = provider.build_source_config(&mut options, &format, bad_data)?; + + Self::assert_options_fully_consumed( + &options, + connection_table_role::SOURCE, + &adapter_type, + )?; + + Ok(ExternalTable::Source(SourceTable { + table_identifier: target_name, + adapter_type, + schema_specs: columns, + connector_config, + temporal_config, + key_constraints: pk_constraints, + payload_format: Some(encoding), + connection_format: format, + description, + catalog_with_options, + registry_id: None, + inferred_fields: None, + })) + } + + fn compile_lookup(&self, stmt: &SqlCreateTable) -> Result { + let target_name = stmt.name.to_string(); + + if AstUtils::parse_watermark_strategy(&stmt.constraints)?.is_some() { + return plan_err!( + "Syntax Error: WATERMARK FOR cannot be defined on a Lookup table (`{}`)", + target_name + ); + } + + let description = stmt + .comment + .clone() + .map(|c| c.to_string()) + .unwrap_or_default(); + + let columns = self.extract_columns(stmt)?; + + let mut options = ConnectorOptions::new(&stmt.with_options, &None)?; + let adapter_type = Self::extract_adapter(&mut options)?; + Self::assert_connector_match(&mut options, &adapter_type)?; + Self::absorb_type_option(&mut options, connection_table_role::LOOKUP)?; + + validate_adapter_availability(&adapter_type)?; + + let pk_constraints = AstUtils::parse_primary_keys(&stmt.constraints)?; + let catalog_with_options = options.snapshot_for_catalog(); + + let connection_format = Format::from_opts(&mut options)?; + Self::assert_format_compatibility(&connection_format, &adapter_type)?; + let bad_data = BadData::from_opts(&mut options)?; + + let lookup_cache_max_bytes = options.pull_opt_u64(opt::LOOKUP_CACHE_MAX_BYTES)?; + let 
lookup_cache_ttl = options.pull_opt_duration(opt::LOOKUP_CACHE_TTL)?; + + let provider = REGISTRY.get_source(&adapter_type)?; + let connector_config = + provider.build_source_config(&mut options, &connection_format, bad_data)?; + + Self::assert_options_fully_consumed( + &options, + connection_table_role::LOOKUP, + &adapter_type, + )?; + + Ok(ExternalTable::Lookup(LookupTable { + table_identifier: target_name, + adapter_type, + schema_specs: columns, + connector_config, + key_constraints: pk_constraints, + lookup_cache_max_bytes, + lookup_cache_ttl, + connection_format, + description, + catalog_with_options, + registry_id: None, + inferred_fields: None, + })) + } + + fn extract_adapter(options: &mut ConnectorOptions) -> Result { + options.pull_opt_str(opt::CONNECTOR)?.ok_or_else(|| { + plan_datafusion_err!( + "Configuration Error: Missing required property '{}' in WITH clause", + opt::CONNECTOR + ) + }) + } + + fn extract_columns(&self, stmt: &SqlCreateTable) -> Result> { + let schema_compiler = datafusion::sql::planner::SqlToRel::new(self.schema_provider); + let arrow_schema = schema_compiler.build_schema(stmt.columns.clone())?; + Ok(arrow_schema + .fields() + .iter() + .map(|f| ColumnDescriptor::from((**f).clone())) + .collect()) + } + + fn assert_ddl_flags(stmt: &SqlCreateTable) -> Result<()> { + if stmt.query.is_some() { + return plan_err!( + "Syntax Error: CREATE TABLE ... AS SELECT combined with WITH ('connector'=...) is invalid. Use CREATE STREAMING TABLE instead." + ); + } + if stmt.or_replace { + return plan_err!( + "Syntax Error: OR REPLACE is not supported for external connector tables." + ); + } + if stmt.temporary { + return plan_err!( + "Syntax Error: TEMPORARY is not supported for external connector tables." + ); + } + if stmt.external { + return plan_err!( + "Syntax Error: EXTERNAL keyword is redundant and unsupported for connector configurations." 
+ ); + } + Ok(()) + } + + fn assert_connector_match(options: &mut ConnectorOptions, connector_name: &str) -> Result<()> { + if let Some(c) = options.pull_opt_str(opt::CONNECTOR)? + && c != connector_name + { + return plan_err!( + "WITH option `connector` is '{c}' but table uses connector '{connector_name}'" + ); + } + Ok(()) + } + + fn absorb_type_option(options: &mut ConnectorOptions, expected_role: &str) -> Result<()> { + let Some(raw) = options.pull_opt_str(opt::TYPE)? else { + return Ok(()); + }; + if !raw.eq_ignore_ascii_case(expected_role) { + return plan_err!( + "Role mismatch: WITH option 'type' = '{raw}' is incompatible with the compiled role '{expected_role}'" + ); + } + Ok(()) + } + + fn assert_format_compatibility(format: &Option, adapter_type: &str) -> Result<()> { + if let Some(Format::Json(JsonFormat { compression, .. })) = format + && !matches!(compression, JsonCompression::Uncompressed) + && adapter_type != connector_type::FILESYSTEM + { + return plan_err!("'json.compression' is only supported for the filesystem connector"); + } + Ok(()) + } + + fn assert_options_fully_consumed( + options: &ConnectorOptions, + role: &str, + adapter_type: &str, + ) -> Result<()> { + if !options.is_empty() { + let unknown_keys: Vec = options.keys().cloned().collect(); + return plan_err!( + "Unknown options for {role} connector '{adapter_type}': {unknown_keys:?}" + ); + } + Ok(()) + } +} + +fn resolve_source_watermark( + table_identifier: &str, + columns: &mut Vec, + watermark: Option<(String, Option)>, + options: &mut ConnectorOptions, + schema_provider: &StreamSchemaProvider, +) -> Result { + let mut config = TemporalPipelineConfig::default(); + + if let Some(event_time_field) = options.pull_opt_field(opt::EVENT_TIME_FIELD)? { + warn!("`event_time_field` WITH option is deprecated; use WATERMARK FOR syntax"); + config.event_column = Some(event_time_field); + } + if let Some(watermark_field) = options.pull_opt_field(opt::WATERMARK_FIELD)? 
{ + warn!("`watermark_field` WITH option is deprecated; use WATERMARK FOR syntax"); + config.watermark_strategy_column = Some(watermark_field); + } + + let Some((time_field, watermark_expr)) = watermark else { + return Ok(config); + }; + + let declared_field = columns + .iter() + .find(|c| c.arrow_field().name().as_str() == time_field.as_str()) + .ok_or_else(|| { + plan_datafusion_err!( + "WATERMARK FOR field `{}` does not exist in table", + time_field + ) + })?; + + if !matches!( + declared_field.arrow_field().data_type(), + DataType::Timestamp(_, None) + ) { + return plan_err!( + "WATERMARK FOR field `{time_field}` has type {}, but expected TIMESTAMP", + declared_field.arrow_field().data_type() + ); + } + + for col in columns.iter_mut() { + if col.arrow_field().name().as_str() == time_field.as_str() { + col.set_nullable(false); + break; + } + } + + config.event_column = Some(time_field.clone()); + + match watermark_expr { + Some(expr) => { + let table_ref = TableReference::bare(table_identifier.to_string()); + let physical_schema = Schema::new( + columns + .iter() + .filter(|c| !c.is_computed()) + .map(|c| c.arrow_field().clone()) + .collect::>(), + ); + let df_schema = DFSchema::try_from_qualified_schema(table_ref, &physical_schema)?; + + let logical_expr = + plan_generating_expr(&expr, &df_schema, schema_provider).map_err(|e| { + DataFusionError::Plan(format!("could not plan watermark expression: {e}")) + })?; + + let (data_type, _nullable) = logical_expr.data_type_and_nullable(&df_schema)?; + if !matches!(data_type, DataType::Timestamp(_, _)) { + return plan_err!( + "the type of the WATERMARK FOR expression must be TIMESTAMP, but was {data_type}" + ); + } + + columns.push(ColumnDescriptor::new_computed( + Field::new( + sql_field::COMPUTED_WATERMARK, + logical_expr.get_type(&df_schema)?, + false, + ), + logical_expr, + )); + config.watermark_strategy_column = Some(sql_field::COMPUTED_WATERMARK.to_string()); + } + None => { + config.watermark_strategy_column = 
Some(time_field); + } + } + + Ok(config) +} + +fn plan_generating_expr( + ast_expr: &ast::Expr, + df_schema: &DFSchema, + schema_provider: &StreamSchemaProvider, +) -> Result { + let planner = SqlToRel::new(schema_provider); + let mut ctx = PlannerContext::new(); + planner.sql_to_expr(ast_expr.clone(), df_schema, &mut ctx) +} diff --git a/src/coordinator/plan/logical_plan_visitor.rs b/src/coordinator/plan/logical_plan_visitor.rs index d49d0314..cb22d24e 100644 --- a/src/coordinator/plan/logical_plan_visitor.rs +++ b/src/coordinator/plan/logical_plan_visitor.rs @@ -10,26 +10,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; - -use datafusion::common::{Result, plan_datafusion_err, plan_err}; -use datafusion::execution::SessionStateBuilder; -use datafusion::sql::sqlparser::ast::{ - CreateTable as SqlCreateTable, Expr as SqlExpr, ObjectType, SqlOption, - Statement as DFStatement, TableConstraint, -}; -use datafusion_common::TableReference; -use datafusion_execution::config::SessionConfig; -use datafusion_expr::{Expr, Extension, LogicalPlan, col}; +use datafusion::sql::sqlparser::ast::{ObjectType, Statement as DFStatement}; use sqlparser::ast::Statement; use tracing::debug; use crate::coordinator::analyze::analysis::Analysis; use crate::coordinator::plan::{ - CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, DropFunctionPlan, - DropStreamingTablePlan, DropTablePlan, PlanNode, ShowCatalogTablesPlan, + CompileErrorPlan, CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, + DropFunctionPlan, DropStreamingTablePlan, DropTablePlan, PlanNode, ShowCatalogTablesPlan, ShowCreateStreamingTablePlan, ShowCreateTablePlan, ShowFunctionsPlan, ShowStreamingTablesPlan, - StartFunctionPlan, StopFunctionPlan, StreamingTable, + StartFunctionPlan, StopFunctionPlan, }; use crate::coordinator::statement::{ CreateFunction, CreatePythonFunction, CreateTable, DropFunction, 
DropStreamingTableStatement, @@ -37,17 +27,11 @@ use crate::coordinator::statement::{ ShowFunctions, ShowStreamingTables, StartFunction, StatementVisitor, StatementVisitorContext, StatementVisitorResult, StopFunction, StreamingTableStatement, }; -use crate::coordinator::tool::ConnectorOptions; -use crate::sql::analysis::{StreamSchemaProvider, maybe_add_key_extension_to_sink, rewrite_sinks}; -use crate::sql::common::with_option_keys as opt; -use crate::sql::functions::{is_json_union, serialize_outgoing_json}; -use crate::sql::logical_node::logical::{LogicalProgram, ProgramConfig}; -use crate::sql::logical_node::sink::StreamEgressNode; -use crate::sql::logical_planner::optimizers::{ChainingOptimizer, produce_optimized_plan}; -use crate::sql::logical_planner::planner::PlanToGraphVisitor; -use crate::sql::rewrite_plan; -use crate::sql::schema::source_table::SourceTable; -use crate::sql::schema::{ColumnDescriptor, ConnectionType, Table}; +use crate::sql::analysis::StreamSchemaProvider; + +use super::ast_utils::AstUtils; +use super::ddl_compiler::DdlCompiler; +use super::streaming_compiler::StreamingCompiler; #[derive(Clone)] pub struct LogicalPlanVisitor { @@ -65,280 +49,11 @@ impl LogicalPlanVisitor { match stmt.accept(self, &context) { StatementVisitorResult::Plan(plan) => plan, - _ => panic!("Fatal: LogicalPlanVisitor must yield a PlanNode variant"), - } - } - - pub fn build_streaming_table( - schema_provider: &StreamSchemaProvider, - stmt: &StreamingTableStatement, - ) -> Result { - Self::new(schema_provider.clone()).compile_streaming_sink(stmt) - } - - fn compile_streaming_sink(&self, stmt: &StreamingTableStatement) -> Result { - let DFStatement::CreateStreamingTable { - name, - with_options, - comment, - query, - } = &stmt.statement - else { - return plan_err!("Statement mismatch: Expected CREATE STREAMING TABLE AST node"); - }; - - let sink_table_name = name.to_string(); - debug!( - "Initiating streaming sink compilation for identifier: {}", - sink_table_name - 
); - - let mut sink_properties = ConnectorOptions::new(with_options, &None)?; - let connector_type = sink_properties - .pull_opt_str(opt::CONNECTOR)? - .ok_or_else(|| { - plan_datafusion_err!( - "Validation Error: Streaming table '{}' requires the '{}' property", - sink_table_name, - opt::CONNECTOR - ) - })?; - - let partition_keys = Self::extract_partitioning_keys(&mut sink_properties)?; - - let sink_description = comment - .as_deref() - .map(str::trim) - .filter(|s| !s.is_empty()) - .map(str::to_string) - .unwrap_or_else(|| format!("sink `{}` ({connector_type})", sink_table_name)); - - let mut query_logical_plan = rewrite_plan( - produce_optimized_plan(&Statement::Query(query.clone()), &self.schema_provider)?, - &self.schema_provider, - )?; - - if query_logical_plan - .schema() - .fields() - .iter() - .any(|f| is_json_union(f.data_type())) - { - query_logical_plan = - serialize_outgoing_json(&self.schema_provider, Arc::new(query_logical_plan)); - } - - let output_schema_fields = query_logical_plan - .schema() - .fields() - .iter() - .map(|f| ColumnDescriptor::from((**f).clone())) - .collect::>(); - - let mut sink_definition = SourceTable::from_options( - &sink_table_name, - &connector_type, - false, - output_schema_fields, - vec![], - None, - &mut sink_properties, - None, - &self.schema_provider, - Some(ConnectionType::Sink), - sink_description, - )?; - sink_definition.partition_exprs = Arc::new(partition_keys); - - let output_schema = query_logical_plan.schema().clone(); - let sink_plan_node = StreamEgressNode::try_new( - TableReference::bare(sink_table_name.clone()), - Table::ConnectorTable(sink_definition.clone()), - output_schema, - query_logical_plan, - )?; - - let mut rewritten_plans = rewrite_sinks(vec![maybe_add_key_extension_to_sink( - LogicalPlan::Extension(Extension { - node: Arc::new(sink_plan_node), - }), - )?])?; - - let final_logical_plan = rewritten_plans.remove(0); - - let validated_program = self.validate_graph_topology(&final_logical_plan)?; - 
- let streaming_with_options: Option> = - if with_options.is_empty() { - None - } else { - let map: std::collections::HashMap = with_options - .iter() - .filter_map(|opt| match opt { - SqlOption::KeyValue { key, value } => Some(( - key.value.clone(), - value.to_string().trim_matches('\'').to_string(), - )), - _ => None, - }) - .collect(); - if map.is_empty() { None } else { Some(map) } - }; - - Ok(StreamingTable { - name: sink_table_name, - comment: comment.clone(), - program: validated_program, - with_options: streaming_with_options, - }) - } - - fn validate_graph_topology(&self, logical_plan: &LogicalPlan) -> Result { - let mut session_config = SessionConfig::new(); - let opts = session_config.options_mut(); - opts.optimizer.enable_round_robin_repartition = false; - opts.optimizer.repartition_aggregations = false; - opts.optimizer.repartition_windows = false; - opts.optimizer.repartition_sorts = false; - opts.optimizer.repartition_joins = false; - opts.execution.target_partitions = 1; - - let session_state = SessionStateBuilder::new() - .with_config(session_config) - .with_default_features() - .with_physical_optimizer_rules(vec![]) - .build(); - - let mut graph_compiler = PlanToGraphVisitor::new(&self.schema_provider, &session_state); - graph_compiler.add_plan(logical_plan.clone())?; - - let mut executable_program = - LogicalProgram::new(graph_compiler.into_graph(), ProgramConfig::default()); - executable_program.optimize(&ChainingOptimizer {}); - - Ok(executable_program) - } - - fn extract_partitioning_keys(options: &mut ConnectorOptions) -> Result>> { - options - .pull_opt_str(opt::PARTITION_BY)? - .map(|raw_cols| raw_cols.split(',').map(|c| col(c.trim())).collect()) - .map(Ok) - .transpose() - } - - fn contains_connector_property(options: &[SqlOption]) -> bool { - options.iter().any(|opt| match opt { - SqlOption::KeyValue { key, .. 
} => key.value.eq_ignore_ascii_case(opt::CONNECTOR), - _ => false, - }) - } - - fn parse_primary_keys(constraints: &[TableConstraint]) -> Result> { - let mut keys = None; - for constraint in constraints { - if let TableConstraint::PrimaryKey { columns, .. } = constraint { - if keys.is_some() { - return plan_err!( - "Constraint Violation: Multiple PRIMARY KEY constraints are forbidden" - ); - } - keys = Some(columns.iter().map(|ident| ident.value.clone()).collect()); - } - } - Ok(keys.unwrap_or_default()) - } - - fn parse_watermark_strategy( - constraints: &[TableConstraint], - ) -> Result)>> { - let mut strategy = None; - for constraint in constraints { - if let TableConstraint::Watermark { - column_name, - watermark_expr, - } = constraint - { - if strategy.is_some() { - return plan_err!( - "Constraint Violation: Only a single WATERMARK FOR clause is permitted" - ); - } - strategy = Some((column_name.value.clone(), watermark_expr.clone())); - } + _ => Box::new(CompileErrorPlan::new( + "LogicalPlanVisitor did not yield a PlanNode variant for the given statement" + .to_string(), + )), } - Ok(strategy) - } - - fn compile_connector_source_plan(&self, stmt: &SqlCreateTable) -> Result { - if stmt.query.is_some() { - return plan_err!( - "Syntax Error: CREATE TABLE ... AS SELECT combined with WITH ('connector'=...) is invalid. Use CREATE STREAMING TABLE instead." - ); - } - if stmt.or_replace { - return plan_err!( - "Syntax Error: OR REPLACE is not supported for external connector tables." - ); - } - if stmt.temporary { - return plan_err!( - "Syntax Error: TEMPORARY is not supported for external connector tables." - ); - } - if stmt.external { - return plan_err!( - "Syntax Error: EXTERNAL keyword is redundant and unsupported for connector configurations." 
- ); - } - - let target_name = stmt.name.to_string(); - let table_description = stmt - .comment - .clone() - .map(|c| c.to_string()) - .unwrap_or_default(); - - let schema_compiler = datafusion::sql::planner::SqlToRel::new(&self.schema_provider); - let arrow_schema = schema_compiler.build_schema(stmt.columns.clone())?; - - let schema_descriptors = arrow_schema - .fields() - .iter() - .map(|f| ColumnDescriptor::from((**f).clone())) - .collect::>(); - - let mut connector_options = ConnectorOptions::new(&stmt.with_options, &None)?; - let adapter_type = connector_options - .pull_opt_str(opt::CONNECTOR)? - .ok_or_else(|| { - plan_datafusion_err!( - "Configuration Error: Missing required property '{}' in WITH clause", - opt::CONNECTOR - ) - })?; - - let pk_constraints = Self::parse_primary_keys(&stmt.constraints)?; - let watermark_strategy = Self::parse_watermark_strategy(&stmt.constraints)?; - - let source_definition = SourceTable::from_options( - &target_name, - &adapter_type, - false, - schema_descriptors, - pk_constraints, - watermark_strategy, - &mut connector_options, - None, - &self.schema_provider, - Some(ConnectionType::Source), - table_description, - )?; - - Ok(CreateTablePlan::connector_source( - source_definition, - stmt.if_not_exists, - )) } } @@ -422,16 +137,18 @@ impl StatementVisitor for LogicalPlanVisitor { ) -> StatementVisitorResult { if let Statement::CreateTable(ast_node) = &stmt.statement && ast_node.query.is_none() - && Self::contains_connector_property(&ast_node.with_options) + && AstUtils::contains_connector_property(&ast_node.with_options) { - let execution_plan = - self.compile_connector_source_plan(ast_node) - .unwrap_or_else(|err| { - panic!( - "Fatal Compiler Error: Connector source resolution failed - {err:#}" - ); - }); - return StatementVisitorResult::Plan(Box::new(execution_plan)); + let declared_role = AstUtils::peek_table_role(&ast_node.with_options); + let compiler = DdlCompiler::new(&self.schema_provider); + return match 
compiler.compile(ast_node, declared_role.as_deref()) { + Ok(external_table) => StatementVisitorResult::Plan(Box::new( + CreateTablePlan::external_table(external_table, ast_node.if_not_exists), + )), + Err(err) => StatementVisitorResult::Plan(Box::new(CompileErrorPlan::new(format!( + "Ingest table resolution failed - {err:#}" + )))), + }; } let schema_compiler = datafusion::sql::planner::SqlToRel::new(&self.schema_provider); @@ -443,7 +160,9 @@ impl StatementVisitor for LogicalPlanVisitor { ); StatementVisitorResult::Plan(Box::new(CreateTablePlan::new(logical_plan))) } - Err(err) => panic!("Fatal Compiler Error: Logical plan translation failed - {err}"), + Err(err) => StatementVisitorResult::Plan(Box::new(CompileErrorPlan::new(format!( + "Logical plan translation failed - {err}" + )))), } } @@ -452,10 +171,13 @@ impl StatementVisitor for LogicalPlanVisitor { stmt: &StreamingTableStatement, _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { - let execution_plan = self.compile_streaming_sink(stmt).unwrap_or_else(|err| { - panic!("Fatal Compiler Error: Streaming sink compilation aborted - {err}"); - }); - StatementVisitorResult::Plan(Box::new(execution_plan)) + let compiler = StreamingCompiler::new(&self.schema_provider); + match compiler.compile(stmt) { + Ok(execution_plan) => StatementVisitorResult::Plan(Box::new(execution_plan)), + Err(err) => StatementVisitorResult::Plan(Box::new(CompileErrorPlan::new(format!( + "Streaming sink compilation aborted - {err}" + )))), + } } fn visit_drop_table_statement( @@ -470,16 +192,20 @@ impl StatementVisitor for LogicalPlanVisitor { .. 
} = &stmt.statement else { - panic!("Fatal Compiler Error: AST mismatch on DropTableStatement"); + return StatementVisitorResult::Plan(Box::new(CompileErrorPlan::new( + "AST mismatch: expected DROP statement for DropTableStatement".to_string(), + ))); }; if *object_type != ObjectType::Table { - panic!("Fatal Compiler Error: Drop target must be of type TABLE"); + return StatementVisitorResult::Plan(Box::new(CompileErrorPlan::new(format!( + "Drop target must be of type TABLE, got {object_type:?}" + )))); } if names.len() != 1 { - panic!( - "Fatal Compiler Error: Bulk drop operations are not supported. Specify exactly one table." - ); + return StatementVisitorResult::Plan(Box::new(CompileErrorPlan::new( + "Bulk drop operations are not supported. Specify exactly one table.".to_string(), + ))); } StatementVisitorResult::Plan(Box::new(DropTablePlan::new( diff --git a/src/coordinator/plan/lookup_table_plan.rs b/src/coordinator/plan/lookup_table_plan.rs index 65103b61..9d1fea93 100644 --- a/src/coordinator/plan/lookup_table_plan.rs +++ b/src/coordinator/plan/lookup_table_plan.rs @@ -10,14 +10,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::sql::schema::source_table::SourceTable; +use crate::sql::schema::LookupTable; use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; -/// Plan node that exposes a lookup table config as a logical plan input. #[derive(Debug)] pub struct LookupTablePlan { - pub table: SourceTable, + pub table: LookupTable, } impl PlanNode for LookupTablePlan { diff --git a/src/coordinator/plan/mod.rs b/src/coordinator/plan/mod.rs index 8166d444..95978467 100644 --- a/src/coordinator/plan/mod.rs +++ b/src/coordinator/plan/mod.rs @@ -10,9 +10,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+mod ast_utils; +mod compile_error_plan; mod create_function_plan; mod create_python_function_plan; mod create_table_plan; +mod ddl_compiler; mod drop_function_plan; mod drop_streaming_table_plan; mod drop_table_plan; @@ -26,10 +29,12 @@ mod show_functions_plan; mod show_streaming_tables_plan; mod start_function_plan; mod stop_function_plan; +mod streaming_compiler; mod streaming_table_connector_plan; mod streaming_table_plan; mod visitor; +pub use compile_error_plan::CompileErrorPlan; pub use create_function_plan::CreateFunctionPlan; pub use create_python_function_plan::CreatePythonFunctionPlan; pub use create_table_plan::{CreateTablePlan, CreateTablePlanBody}; diff --git a/src/coordinator/plan/streaming_compiler.rs b/src/coordinator/plan/streaming_compiler.rs new file mode 100644 index 00000000..dbdd4dec --- /dev/null +++ b/src/coordinator/plan/streaming_compiler.rs @@ -0,0 +1,234 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! `CREATE STREAMING TABLE ... AS SELECT` → [`SinkTable`], egress, [`LogicalProgram`]. 
+ +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::common::{Result, plan_datafusion_err, plan_err}; +use datafusion::execution::SessionStateBuilder; +use datafusion::sql::sqlparser::ast::{SqlOption, Statement as DFStatement}; +use datafusion_common::TableReference; +use datafusion_execution::config::SessionConfig; +use datafusion_expr::{Expr, Extension, LogicalPlan, col}; +use sqlparser::ast::Statement; +use tracing::debug; + +use super::StreamingTable; +use crate::coordinator::statement::StreamingTableStatement; +use crate::coordinator::tool::ConnectorOptions; +use crate::sql::analysis::{StreamSchemaProvider, maybe_add_key_extension_to_sink, rewrite_sinks}; +use crate::sql::common::constants::connector_type; +use crate::sql::common::with_option_keys as opt; +use crate::sql::common::{Format, JsonCompression, JsonFormat}; +use crate::sql::connector::registry::REGISTRY; +use crate::sql::connector::sink::runtime_config::SinkRuntimeConfig; +use crate::sql::functions::{is_json_union, serialize_outgoing_json}; +use crate::sql::logical_node::logical::{LogicalProgram, ProgramConfig}; +use crate::sql::logical_node::sink::StreamEgressNode; +use crate::sql::logical_planner::optimizers::{ChainingOptimizer, produce_optimized_plan}; +use crate::sql::logical_planner::planner::PlanToGraphVisitor; +use crate::sql::rewrite_plan; +use crate::sql::schema::ColumnDescriptor; +use crate::sql::schema::catalog::{ExternalTable, SinkTable}; +use crate::sql::schema::table::CatalogEntity; +use crate::sql::schema::table_role::validate_adapter_availability; + +pub struct StreamingCompiler<'a> { + schema_provider: &'a StreamSchemaProvider, +} + +impl<'a> StreamingCompiler<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { schema_provider } + } + + pub fn compile(&self, stmt: &StreamingTableStatement) -> Result { + let DFStatement::CreateStreamingTable { + name, + with_options, + comment, + query, + } = &stmt.statement + else { + return 
plan_err!("Statement mismatch: Expected CREATE STREAMING TABLE AST node"); + }; + + let sink_table_name = name.to_string(); + debug!( + "Initiating streaming sink compilation for identifier: {}", + sink_table_name + ); + + let mut sink_properties = ConnectorOptions::new(with_options, &None)?; + let adapter_type = sink_properties + .pull_opt_str(opt::CONNECTOR)? + .ok_or_else(|| { + plan_datafusion_err!( + "Validation Error: Streaming table '{}' requires the '{}' property", + sink_table_name, + opt::CONNECTOR + ) + })?; + validate_adapter_availability(&adapter_type)?; + + let partition_keys = Self::extract_partitioning_keys(&mut sink_properties)?; + let catalog_with_options = sink_properties.snapshot_for_catalog(); + + let connection_format = Format::from_opts(&mut sink_properties)?; + Self::assert_format_compatibility(&connection_format, &adapter_type)?; + + let sink_description = comment + .as_deref() + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(str::to_string) + .unwrap_or_else(|| format!("sink `{}` ({adapter_type})", sink_table_name)); + + let mut query_logical_plan = rewrite_plan( + produce_optimized_plan(&Statement::Query(query.clone()), self.schema_provider)?, + self.schema_provider, + )?; + + if query_logical_plan + .schema() + .fields() + .iter() + .any(|f| is_json_union(f.data_type())) + { + query_logical_plan = + serialize_outgoing_json(self.schema_provider, Arc::new(query_logical_plan)); + } + + let output_schema_fields: Vec = query_logical_plan + .schema() + .fields() + .iter() + .map(|f| ColumnDescriptor::from((**f).clone())) + .collect(); + + let runtime_props = + SinkRuntimeConfig::extract_from_options(&mut sink_properties)?.to_runtime_properties(); + let provider = REGISTRY.get_sink(&adapter_type)?; + let connector_config = + provider.build_sink_config(&mut sink_properties, &connection_format, &runtime_props)?; + + if !sink_properties.is_empty() { + let unknown_keys: Vec = sink_properties.keys().cloned().collect(); + return plan_err!( + 
"Unknown options for streaming sink connector '{adapter_type}': {unknown_keys:?}" + ); + } + + let sink_table = SinkTable { + table_identifier: sink_table_name.clone(), + adapter_type, + schema_specs: output_schema_fields, + connector_config, + partition_exprs: Arc::new(partition_keys), + key_constraints: Vec::new(), + connection_format, + description: sink_description, + catalog_with_options, + }; + + let output_schema = query_logical_plan.schema().clone(); + let sink_plan_node = StreamEgressNode::try_new( + TableReference::bare(sink_table_name.clone()), + CatalogEntity::external(ExternalTable::Sink(sink_table)), + output_schema, + query_logical_plan, + )?; + + let mut rewritten_plans = rewrite_sinks(vec![maybe_add_key_extension_to_sink( + LogicalPlan::Extension(Extension { + node: Arc::new(sink_plan_node), + }), + )?])?; + + let final_logical_plan = rewritten_plans.remove(0); + let validated_program = self.validate_graph_topology(&final_logical_plan)?; + + Ok(StreamingTable { + name: sink_table_name, + comment: comment.clone(), + program: validated_program, + with_options: Self::echo_with_options(with_options), + }) + } + + /// Compile the final logical plan into the executable streaming program, + /// disabling DataFusion's batch-oriented repartition heuristics. 
+ fn validate_graph_topology(&self, logical_plan: &LogicalPlan) -> Result { + let mut session_config = SessionConfig::new(); + let opts = session_config.options_mut(); + opts.optimizer.enable_round_robin_repartition = false; + opts.optimizer.repartition_aggregations = false; + opts.optimizer.repartition_windows = false; + opts.optimizer.repartition_sorts = false; + opts.optimizer.repartition_joins = false; + opts.execution.target_partitions = 1; + + let session_state = SessionStateBuilder::new() + .with_config(session_config) + .with_default_features() + .with_physical_optimizer_rules(vec![]) + .build(); + + let mut graph_compiler = PlanToGraphVisitor::new(self.schema_provider, &session_state); + graph_compiler.add_plan(logical_plan.clone())?; + + let mut executable_program = + LogicalProgram::new(graph_compiler.into_graph(), ProgramConfig::default()); + executable_program.optimize(&ChainingOptimizer {}); + + Ok(executable_program) + } + + fn extract_partitioning_keys(options: &mut ConnectorOptions) -> Result>> { + options + .pull_opt_str(opt::PARTITION_BY)? + .map(|raw_cols| raw_cols.split(',').map(|c| col(c.trim())).collect()) + .map(Ok) + .transpose() + } + + fn assert_format_compatibility(format: &Option, adapter_type: &str) -> Result<()> { + if let Some(Format::Json(JsonFormat { compression, .. })) = format + && !matches!(compression, JsonCompression::Uncompressed) + && adapter_type != connector_type::FILESYSTEM + { + return plan_err!("'json.compression' is only supported for the filesystem connector"); + } + Ok(()) + } + + /// Snapshot the original WITH options so the streaming-table catalog + /// can later reproduce the exact DDL for `SHOW CREATE STREAMING TABLE`. 
+ fn echo_with_options(with_options: &[SqlOption]) -> Option> { + if with_options.is_empty() { + return None; + } + let map: HashMap = with_options + .iter() + .filter_map(|o| match o { + SqlOption::KeyValue { key, value } => Some(( + key.value.clone(), + value.to_string().trim_matches('\'').to_string(), + )), + _ => None, + }) + .collect(); + if map.is_empty() { None } else { Some(map) } + } +} diff --git a/src/coordinator/plan/streaming_table_connector_plan.rs b/src/coordinator/plan/streaming_table_connector_plan.rs index 214e2e15..105f9bd7 100644 --- a/src/coordinator/plan/streaming_table_connector_plan.rs +++ b/src/coordinator/plan/streaming_table_connector_plan.rs @@ -10,14 +10,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::sql::schema::source_table::SourceTable; +use crate::sql::schema::SinkTable; use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; -/// Plan node that exposes a connector table config as a logical plan input. #[derive(Debug)] pub struct StreamingTableConnectorPlan { - pub table: SourceTable, + pub table: SinkTable, } impl PlanNode for StreamingTableConnectorPlan { diff --git a/src/coordinator/plan/visitor.rs b/src/coordinator/plan/visitor.rs index a94d761f..5c566b74 100644 --- a/src/coordinator/plan/visitor.rs +++ b/src/coordinator/plan/visitor.rs @@ -11,10 +11,11 @@ // limitations under the License. 
use super::{ - CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, DropFunctionPlan, - DropStreamingTablePlan, DropTablePlan, LookupTablePlan, ShowCatalogTablesPlan, - ShowCreateStreamingTablePlan, ShowCreateTablePlan, ShowFunctionsPlan, ShowStreamingTablesPlan, - StartFunctionPlan, StopFunctionPlan, StreamingTable, StreamingTableConnectorPlan, + CompileErrorPlan, CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, + DropFunctionPlan, DropStreamingTablePlan, DropTablePlan, LookupTablePlan, + ShowCatalogTablesPlan, ShowCreateStreamingTablePlan, ShowCreateTablePlan, ShowFunctionsPlan, + ShowStreamingTablesPlan, StartFunctionPlan, StopFunctionPlan, StreamingTable, + StreamingTableConnectorPlan, }; /// Context passed to PlanVisitor methods @@ -51,6 +52,12 @@ pub enum PlanVisitorResult { } pub trait PlanVisitor { + fn visit_compile_error_plan( + &self, + plan: &CompileErrorPlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + fn visit_create_function( &self, plan: &CreateFunctionPlan, diff --git a/src/coordinator/streaming_table_options.rs b/src/coordinator/streaming_table_options.rs index 51e020b0..4d9db7b8 100644 --- a/src/coordinator/streaming_table_options.rs +++ b/src/coordinator/streaming_table_options.rs @@ -12,6 +12,8 @@ use std::collections::HashMap; +use crate::sql::common::with_option_keys as opt; + fn parse_positive_u64(raw: &str) -> Option { let t = raw.trim().trim_matches('\''); t.parse::().ok().filter(|&v| v > 0) @@ -23,12 +25,12 @@ fn parse_positive_u32(raw: &str) -> Option { } pub fn parse_checkpoint_interval_ms(opts: Option<&HashMap>) -> Option { - opts.and_then(|m| m.get("checkpoint.interval")) + opts.and_then(|m| m.get(opt::CHECKPOINT_INTERVAL_MS)) .and_then(|s| parse_positive_u64(s)) } pub fn parse_pipeline_parallelism(opts: Option<&HashMap>) -> Option { - opts.and_then(|m| m.get("parallelism")) + opts.and_then(|m| m.get(opt::PIPELINE_PARALLELISM)) .and_then(|s| parse_positive_u32(s)) } @@ -39,8 +41,8 @@ mod tests { 
#[test] fn parses_checkpoint_and_parallelism() { let mut m = HashMap::new(); - m.insert("checkpoint.interval".to_string(), "30000".to_string()); - m.insert("parallelism".to_string(), "2".to_string()); + m.insert(opt::CHECKPOINT_INTERVAL_MS.to_string(), "30000".to_string()); + m.insert(opt::PIPELINE_PARALLELISM.to_string(), "2".to_string()); assert_eq!(parse_checkpoint_interval_ms(Some(&m)), Some(30_000)); assert_eq!(parse_pipeline_parallelism(Some(&m)), Some(2)); } diff --git a/src/runtime/streaming/api/context.rs b/src/runtime/streaming/api/context.rs index 8b778502..f2557e7a 100644 --- a/src/runtime/streaming/api/context.rs +++ b/src/runtime/streaming/api/context.rs @@ -16,7 +16,7 @@ use std::time::{Duration, SystemTime}; use anyhow::{Context, Result, anyhow}; use arrow_array::RecordBatch; -use protocol::storage::SourceCheckpointPayload; +use protocol::storage::SourceCheckpointInfo; use tokio::sync::mpsc; use crate::runtime::memory::{MemoryBlock, MemoryPool, get_array_memory_size}; @@ -78,7 +78,7 @@ pub struct TaskContext { /// Last globally-committed safe epoch for crash recovery. safe_epoch: u64, - /// When set, pipelines report checkpoint completion (and optional Kafka offsets) to the job coordinator. + /// When set, pipelines report checkpoint completion to the job coordinator. checkpoint_ack_tx: Option>, } @@ -128,17 +128,13 @@ impl TaskContext { } /// Notify the job checkpoint coordinator that this pipeline has finished the barrier for `epoch`. 
- pub async fn send_checkpoint_ack( - &self, - epoch: u64, - source_payloads: Vec, - ) { + pub async fn send_checkpoint_ack(&self, epoch: u64, source_infos: Vec) { if let Some(tx) = &self.checkpoint_ack_tx { let _ = tx .send(JobMasterEvent::CheckpointAck { pipeline_id: self.pipeline_id, epoch, - source_payloads, + source_infos, }) .await; } diff --git a/src/runtime/streaming/api/operator.rs b/src/runtime/streaming/api/operator.rs index 8eb9e8c4..fc75e475 100644 --- a/src/runtime/streaming/api/operator.rs +++ b/src/runtime/streaming/api/operator.rs @@ -66,7 +66,7 @@ pub trait Operator: Send + 'static { /// `commit_transaction` on the producer stashed during [`Self::snapshot_state`]. async fn commit_checkpoint( &mut self, - epoch: u32, + epoch: u64, _ctx: &mut TaskContext, ) -> anyhow::Result<()> { let _ = epoch; @@ -76,7 +76,7 @@ pub trait Operator: Send + 'static { /// Global checkpoint **rollback** when phase 2 must not commit (e.g. catalog persist failed). /// /// Default is no-op. Transactional Kafka sink overrides with `abort_transaction` on the stashed producer. 
- async fn abort_checkpoint(&mut self, epoch: u32, _ctx: &mut TaskContext) -> anyhow::Result<()> { + async fn abort_checkpoint(&mut self, epoch: u64, _ctx: &mut TaskContext) -> anyhow::Result<()> { let _ = epoch; Ok(()) } diff --git a/src/runtime/streaming/api/source.rs b/src/runtime/streaming/api/source.rs index 26851eb2..9c531f2c 100644 --- a/src/runtime/streaming/api/source.rs +++ b/src/runtime/streaming/api/source.rs @@ -15,7 +15,7 @@ use crate::sql::common::{CheckpointBarrier, Watermark}; use arrow_array::RecordBatch; use async_trait::async_trait; use protocol::storage::{ - KafkaSourceSubtaskCheckpoint, SourceCheckpointPayload, source_checkpoint_payload, + KafkaSourceSubtaskCheckpoint, SourceCheckpointInfo, source_checkpoint_info, }; #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] @@ -34,17 +34,19 @@ pub enum SourceEvent { EndOfStream, } -/// Optional metadata returned when a source completes a checkpoint barrier snapshot. +/// Checkpoint metadata produced by a source subtask during a barrier snapshot. +/// Sources fill this directly with [`SourceCheckpointInfo`] — the coordinator collects +/// and persists these entries without any further translation step. #[derive(Debug, Default, Clone)] pub struct SourceCheckpointReport { - pub payloads: Vec, + pub infos: Vec, } impl SourceCheckpointReport { pub fn from_kafka_checkpoint(kafka: KafkaSourceSubtaskCheckpoint) -> Self { Self { - payloads: vec![SourceCheckpointPayload { - checkpoint: Some(source_checkpoint_payload::Checkpoint::Kafka(kafka)), + infos: vec![SourceCheckpointInfo { + info: Some(source_checkpoint_info::Info::Kafka(kafka)), }], } } @@ -54,6 +56,11 @@ impl SourceCheckpointReport { pub trait SourceOperator: Send + 'static { fn name(&self) -> &str; + /// Inject persisted checkpoint records before the source is started. + /// Called by the engine after the operator is constructed and before [`Self::on_start`]. + /// Default implementation is a no-op; sources with stateful recovery override this. 
+ fn set_recovery_checkpoint(&mut self, _infos: Vec) {} + async fn on_start(&mut self, _ctx: &mut TaskContext) -> anyhow::Result<()> { Ok(()) } @@ -74,7 +81,7 @@ pub trait SourceOperator: Send + 'static { /// Kafka source keeps the default: offsets are reported at the barrier in [`Self::snapshot_state`]. async fn commit_checkpoint( &mut self, - epoch: u32, + epoch: u64, _ctx: &mut TaskContext, ) -> anyhow::Result<()> { let _ = epoch; @@ -82,7 +89,7 @@ pub trait SourceOperator: Send + 'static { } /// Same rollback hook as [`super::operator::Operator::abort_checkpoint`]. - async fn abort_checkpoint(&mut self, epoch: u32, _ctx: &mut TaskContext) -> anyhow::Result<()> { + async fn abort_checkpoint(&mut self, epoch: u64, _ctx: &mut TaskContext) -> anyhow::Result<()> { let _ = epoch; Ok(()) } diff --git a/src/runtime/streaming/execution/pipeline.rs b/src/runtime/streaming/execution/pipeline.rs index 91309a48..7c2ca17a 100644 --- a/src/runtime/streaming/execution/pipeline.rs +++ b/src/runtime/streaming/execution/pipeline.rs @@ -110,7 +110,7 @@ impl Pipeline { } } AlignmentStatus::Complete => { - let epoch = barrier.epoch as u64; + let epoch = barrier.epoch; self.chain_head .process_event( idx, diff --git a/src/runtime/streaming/execution/source_driver.rs b/src/runtime/streaming/execution/source_driver.rs index b4e7d327..0118c4ee 100644 --- a/src/runtime/streaming/execution/source_driver.rs +++ b/src/runtime/streaming/execution/source_driver.rs @@ -161,7 +161,7 @@ impl SourceDriver { let b: CheckpointBarrier = barrier.clone().into(); let report = self.operator.snapshot_state(b, &mut self.ctx).await?; self.dispatch_event(StreamEvent::Barrier(b)).await?; - pending_source_checkpoint = Some((b.epoch as u64, report)); + pending_source_checkpoint = Some((b.epoch, report)); } ControlCommand::Commit { epoch } => { self.operator @@ -186,7 +186,7 @@ impl SourceDriver { } if let Some((epoch, report)) = pending_source_checkpoint { - self.ctx.send_checkpoint_ack(epoch, 
report.payloads).await; + self.ctx.send_checkpoint_ack(epoch, report.infos).await; } Ok(stop) diff --git a/src/runtime/streaming/execution/tracker/barrier_aligner.rs b/src/runtime/streaming/execution/tracker/barrier_aligner.rs index 4f954a7d..dbc30cd1 100644 --- a/src/runtime/streaming/execution/tracker/barrier_aligner.rs +++ b/src/runtime/streaming/execution/tracker/barrier_aligner.rs @@ -23,7 +23,7 @@ pub enum AlignmentStatus { #[derive(Debug)] pub struct BarrierAligner { input_count: usize, - current_epoch: Option, + current_epoch: Option, reached_inputs: HashSet, } diff --git a/src/runtime/streaming/factory/connector/delta.rs b/src/runtime/streaming/factory/connector/delta.rs new file mode 100644 index 00000000..726f87ef --- /dev/null +++ b/src/runtime/streaming/factory/connector/delta.rs @@ -0,0 +1,82 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::Arc; + +use anyhow::{Context, Result, bail}; +use prost::Message; +use protocol::function_stream_graph::ConnectorOp; +use protocol::function_stream_graph::connector_op::Config; + +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::factory::connector::sink_props_codec::{ + apply_common_sink_fields, normalized_props, parse_sink_memory_bytes, +}; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::factory::operator_constructor::OperatorConstructor; +use crate::runtime::streaming::operators::sink::delta::{DeltaFormat, DeltaSinkOperator}; +use crate::runtime::streaming::operators::sink::filesystem::compression_from_str; +use crate::sql::common::constants::connection_format_value; +use crate::sql::common::with_option_keys as opt; + +pub struct DeltaSinkDispatcher; + +impl OperatorConstructor for DeltaSinkDispatcher { + fn with_config(&self, payload: &[u8], _registry: Arc) -> Result { + let op = ConnectorOp::decode(payload).context("failed to decode connector op")?; + let props = match op.config { + Some(Config::DeltaSink(cfg)) => delta_props(cfg), + _ => bail!("delta sink expects DeltaSinkConfig"), + }; + let props = normalized_props(props); + + let format = props + .get(opt::FORMAT) + .map(String::as_str) + .unwrap_or(connection_format_value::PARQUET) + .to_ascii_lowercase(); + let path = props + .get(opt::PATH) + .cloned() + .or_else(|| props.get(opt::SINK_PATH).cloned()) + .unwrap_or_else(|| ".".to_string()); + let compression = + compression_from_str(props.get(opt::PARQUET_COMPRESSION).map(String::as_str))?; + let sink_memory_bytes = parse_sink_memory_bytes(&props)?; + let format = match format.as_str() { + connection_format_value::CSV => DeltaFormat::Csv, + connection_format_value::PARQUET => DeltaFormat::Parquet, + connection_format_value::JSON => DeltaFormat::JsonL, + connection_format_value::AVRO => DeltaFormat::Avro, + 
connection_format_value::ORC => DeltaFormat::Orc, + other => bail!("unsupported delta sink format '{other}'"), + }; + Ok(ConstructedOperator::Operator(Box::new( + DeltaSinkOperator::try_new( + op.name, + path, + format, + compression, + sink_memory_bytes, + props, + )?, + ))) + } +} + +fn delta_props(cfg: protocol::function_stream_graph::DeltaSinkConfig) -> HashMap { + let mut props = cfg.extra_properties; + props.extend(cfg.runtime_properties); + apply_common_sink_fields(&mut props, cfg.path, cfg.format, cfg.parquet_compression); + props +} diff --git a/src/runtime/streaming/factory/connector/dispatchers.rs b/src/runtime/streaming/factory/connector/dispatchers.rs index 430a49f9..7d626600 100644 --- a/src/runtime/streaming/factory/connector/dispatchers.rs +++ b/src/runtime/streaming/factory/connector/dispatchers.rs @@ -12,19 +12,30 @@ use std::sync::Arc; -use anyhow::Result; +use anyhow::{Context, Result, bail}; +use prost::Message; +use protocol::function_stream_graph::ConnectorOp; use crate::runtime::streaming::api::operator::ConstructedOperator; use crate::runtime::streaming::factory::global::Registry; use crate::runtime::streaming::factory::operator_constructor::OperatorConstructor; +use crate::sql::common::constants::connector_type; -use super::kafka::KafkaConnectorDispatcher; +use super::{ + DeltaSinkDispatcher, FilesystemSinkDispatcher, IcebergSinkDispatcher, LanceDbSinkDispatcher, + S3SinkDispatcher, kafka::KafkaConnectorDispatcher, +}; pub struct ConnectorSourceDispatcher; impl OperatorConstructor for ConnectorSourceDispatcher { fn with_config(&self, config: &[u8], registry: Arc) -> Result { - KafkaConnectorDispatcher.with_config(config, registry) + let op = ConnectorOp::decode(config) + .context("failed decoding connector op for source dispatch")?; + match op.connector.to_ascii_lowercase().as_str() { + connector_type::KAFKA => KafkaConnectorDispatcher.with_config(config, registry), + _ => bail!("unsupported source connector '{}'", op.connector), + } } } 
@@ -32,6 +43,16 @@ pub struct ConnectorSinkDispatcher; impl OperatorConstructor for ConnectorSinkDispatcher { fn with_config(&self, config: &[u8], registry: Arc) -> Result { - KafkaConnectorDispatcher.with_config(config, registry) + let op = ConnectorOp::decode(config) + .context("failed decoding connector op for sink dispatch")?; + match op.connector.to_ascii_lowercase().as_str() { + connector_type::KAFKA => KafkaConnectorDispatcher.with_config(config, registry), + connector_type::FILESYSTEM => FilesystemSinkDispatcher.with_config(config, registry), + connector_type::DELTA => DeltaSinkDispatcher.with_config(config, registry), + connector_type::ICEBERG => IcebergSinkDispatcher.with_config(config, registry), + connector_type::S3 => S3SinkDispatcher.with_config(config, registry), + "lancedb" => LanceDbSinkDispatcher.with_config(config, registry), + _ => bail!("unsupported sink connector '{}'", op.connector), + } } } diff --git a/src/runtime/streaming/factory/connector/filesystem.rs b/src/runtime/streaming/factory/connector/filesystem.rs new file mode 100644 index 00000000..94101407 --- /dev/null +++ b/src/runtime/streaming/factory/connector/filesystem.rs @@ -0,0 +1,78 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::Arc; + +use anyhow::{Context, Result, bail}; +use prost::Message; +use protocol::function_stream_graph::ConnectorOp; +use protocol::function_stream_graph::connector_op::Config; + +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::factory::connector::sink_props_codec::{ + apply_common_sink_fields, normalized_props, parse_sink_memory_bytes, +}; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::factory::operator_constructor::OperatorConstructor; +use crate::runtime::streaming::operators::sink::filesystem::{ + FilesystemFormat, FilesystemSinkOperator, compression_from_str, +}; +use crate::sql::common::constants::connection_format_value; +use crate::sql::common::with_option_keys as opt; + +pub struct FilesystemSinkDispatcher; + +impl OperatorConstructor for FilesystemSinkDispatcher { + fn with_config(&self, payload: &[u8], _registry: Arc) -> Result { + let op = ConnectorOp::decode(payload).context("failed to decode connector op")?; + let props = match op.config { + Some(Config::FilesystemSink(cfg)) => filesystem_props(cfg), + _ => bail!("filesystem sink expects FilesystemSinkConfig"), + }; + let props = normalized_props(props); + + let format = props + .get(opt::FORMAT) + .map(String::as_str) + .unwrap_or(connection_format_value::PARQUET) + .to_ascii_lowercase(); + let path = props + .get(opt::PATH) + .cloned() + .or_else(|| props.get(opt::SINK_PATH).cloned()) + .unwrap_or_else(|| ".".to_string()); + let compression = + compression_from_str(props.get(opt::PARQUET_COMPRESSION).map(String::as_str))?; + let sink_memory_bytes = parse_sink_memory_bytes(&props)?; + let format = match format.as_str() { + connection_format_value::CSV => FilesystemFormat::Csv, + connection_format_value::PARQUET => FilesystemFormat::Parquet, + connection_format_value::JSON => FilesystemFormat::JsonL, + connection_format_value::AVRO => FilesystemFormat::Avro, 
+ connection_format_value::ORC => FilesystemFormat::Orc, + other => bail!("unsupported filesystem sink format '{other}'"), + }; + Ok(ConstructedOperator::Operator(Box::new( + FilesystemSinkOperator::try_new(op.name, path, format, compression, sink_memory_bytes)?, + ))) + } +} + +fn filesystem_props( + cfg: protocol::function_stream_graph::FilesystemSinkConfig, +) -> HashMap { + let mut props = cfg.extra_properties; + props.extend(cfg.runtime_properties); + apply_common_sink_fields(&mut props, cfg.path, cfg.format, cfg.parquet_compression); + props +} diff --git a/src/runtime/streaming/factory/connector/iceberg.rs b/src/runtime/streaming/factory/connector/iceberg.rs new file mode 100644 index 00000000..58e0809f --- /dev/null +++ b/src/runtime/streaming/factory/connector/iceberg.rs @@ -0,0 +1,81 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::Arc; + +use anyhow::{Context, Result, bail}; +use prost::Message; +use protocol::function_stream_graph::ConnectorOp; +use protocol::function_stream_graph::connector_op::Config; + +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::factory::connector::sink_props_codec::{ + apply_common_sink_fields, normalized_props, parse_sink_memory_bytes, +}; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::factory::operator_constructor::OperatorConstructor; +use crate::runtime::streaming::operators::sink::filesystem::compression_from_str; +use crate::runtime::streaming::operators::sink::iceberg::{IcebergFormat, IcebergSinkOperator}; +use crate::sql::common::constants::connection_format_value; +use crate::sql::common::with_option_keys as opt; + +pub struct IcebergSinkDispatcher; + +impl OperatorConstructor for IcebergSinkDispatcher { + fn with_config(&self, payload: &[u8], _registry: Arc) -> Result { + let op = ConnectorOp::decode(payload).context("failed to decode connector op")?; + let props = match op.config { + Some(Config::IcebergSink(cfg)) => iceberg_props(cfg), + _ => bail!("iceberg sink expects IcebergSinkConfig"), + }; + let props = normalized_props(props); + + let format = props + .get(opt::FORMAT) + .map(String::as_str) + .unwrap_or(connection_format_value::PARQUET) + .to_ascii_lowercase(); + let path = props + .get(opt::PATH) + .cloned() + .or_else(|| props.get(opt::SINK_PATH).cloned()) + .unwrap_or_else(|| ".".to_string()); + let compression = + compression_from_str(props.get(opt::PARQUET_COMPRESSION).map(String::as_str))?; + let sink_memory_bytes = parse_sink_memory_bytes(&props)?; + let format = match format.as_str() { + connection_format_value::CSV => IcebergFormat::Csv, + connection_format_value::PARQUET => IcebergFormat::Parquet, + other => bail!("unsupported iceberg sink format '{other}'"), + }; + 
Ok(ConstructedOperator::Operator(Box::new( + IcebergSinkOperator::try_new( + op.name, + path, + format, + compression, + sink_memory_bytes, + props, + )?, + ))) + } +} + +fn iceberg_props( + cfg: protocol::function_stream_graph::IcebergSinkConfig, +) -> HashMap { + let mut props = cfg.extra_properties; + props.extend(cfg.runtime_properties); + apply_common_sink_fields(&mut props, cfg.path, cfg.format, cfg.parquet_compression); + props +} diff --git a/src/runtime/streaming/factory/connector/kafka.rs b/src/runtime/streaming/factory/connector/kafka.rs index 9d2f114d..17838e3e 100644 --- a/src/runtime/streaming/factory/connector/kafka.rs +++ b/src/runtime/streaming/factory/connector/kafka.rs @@ -136,8 +136,8 @@ impl OperatorConstructor for KafkaConnectorDispatcher { Self::build_kafka_source(&op.name, cfg, fs_schema) } Some(Config::KafkaSink(ref cfg)) => Self::build_kafka_sink(&op.name, cfg, fs_schema), - Some(Config::Generic(_)) => bail!( - "ConnectorOp '{}': GenericConnectorConfig dispatch not yet implemented", + Some(_) => bail!( + "ConnectorOp '{}': received non-kafka connector config for Kafka dispatcher", op.name ), None => bail!("ConnectorOp '{}' has no configuration payload", op.name), diff --git a/src/runtime/streaming/factory/connector/lancedb.rs b/src/runtime/streaming/factory/connector/lancedb.rs new file mode 100644 index 00000000..51f298f3 --- /dev/null +++ b/src/runtime/streaming/factory/connector/lancedb.rs @@ -0,0 +1,106 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::sync::Arc; + +use anyhow::{Context, Result, bail}; +use prost::Message; +use protocol::function_stream_graph::ConnectorOp; +use protocol::function_stream_graph::connector_op::Config; + +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::factory::connector::sink_props_codec::{ + apply_common_sink_fields, normalized_props, +}; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::factory::operator_constructor::OperatorConstructor; +use crate::runtime::streaming::operators::sink::lancedb::LanceDbSinkOperator; +use crate::sql::common::constants::connection_format_value; +use crate::sql::common::with_option_keys as opt; + +pub struct LanceDbSinkDispatcher; + +impl OperatorConstructor for LanceDbSinkDispatcher { + fn with_config(&self, payload: &[u8], _registry: Arc) -> Result { + let op = ConnectorOp::decode(payload).context("failed to decode connector op")?; + let props = match op.config { + Some(Config::LancedbSink(cfg)) => lancedb_props(cfg), + _ => bail!("lanceDB connector expects LanceDbSinkConfig"), + }; + let props = normalized_props(props); + + let format = props + .get(opt::FORMAT) + .map(String::as_str) + .unwrap_or(connection_format_value::LANCE) + .to_ascii_lowercase(); + if format != connection_format_value::LANCE { + bail!("lanceDB requires format='lance', got '{format}'"); + } + + let dataset_uri = resolve_lance_uri(&props)?; + let sink = LanceDbSinkOperator::new(op.name, dataset_uri); + Ok(ConstructedOperator::Operator(Box::new(sink))) + } +} + +fn resolve_lance_uri(props: &HashMap) -> Result { + let path = props + .get(opt::PATH) + .cloned() + .or_else(|| props.get(opt::SINK_PATH).cloned()) + .unwrap_or_else(|| ".".to_string()); + + // If path already contains a fully-qualified URI scheme, use it as-is. 
+ if path.contains("://") { + return Ok(path); + } + + if let Some(bucket) = props.get(opt::S3_BUCKET) { + let trimmed = path.trim_matches('/'); + if trimmed.is_empty() { + Ok(format!("s3://{bucket}")) + } else { + Ok(format!("s3://{bucket}/{trimmed}")) + } + } else { + Ok(path) + } +} + +fn lancedb_props( + cfg: protocol::function_stream_graph::LanceDbSinkConfig, +) -> HashMap { + let mut props = cfg.extra_properties; + props.extend(cfg.runtime_properties); + apply_common_sink_fields(&mut props, cfg.path, cfg.format, None); + if let Some(v) = cfg.s3_bucket { + props.insert(opt::S3_BUCKET.to_string(), v); + } + if let Some(v) = cfg.s3_region { + props.insert(opt::S3_REGION.to_string(), v); + } + if let Some(v) = cfg.s3_endpoint { + props.insert(opt::S3_ENDPOINT.to_string(), v); + } + if let Some(v) = cfg.s3_access_key_id { + props.insert(opt::S3_ACCESS_KEY_ID.to_string(), v); + } + if let Some(v) = cfg.s3_secret_access_key { + props.insert(opt::S3_SECRET_ACCESS_KEY.to_string(), v); + } + if let Some(v) = cfg.s3_session_token { + props.insert(opt::S3_SESSION_TOKEN.to_string(), v); + } + props +} diff --git a/src/runtime/streaming/factory/connector/mod.rs b/src/runtime/streaming/factory/connector/mod.rs index 381de89c..d1a6d7f3 100644 --- a/src/runtime/streaming/factory/connector/mod.rs +++ b/src/runtime/streaming/factory/connector/mod.rs @@ -10,8 +10,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+pub mod delta; mod dispatchers; +pub mod filesystem; +pub mod iceberg; pub mod kafka; +pub mod lancedb; +pub mod s3; +pub mod sink_props_codec; +pub use delta::DeltaSinkDispatcher; pub use dispatchers::{ConnectorSinkDispatcher, ConnectorSourceDispatcher}; +pub use filesystem::FilesystemSinkDispatcher; +pub use iceberg::IcebergSinkDispatcher; pub use kafka::KafkaConnectorDispatcher; +pub use lancedb::LanceDbSinkDispatcher; +pub use s3::S3SinkDispatcher; diff --git a/src/runtime/streaming/factory/connector/s3.rs b/src/runtime/streaming/factory/connector/s3.rs new file mode 100644 index 00000000..4b67fb9e --- /dev/null +++ b/src/runtime/streaming/factory/connector/s3.rs @@ -0,0 +1,90 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::Arc; + +use anyhow::{Context, Result, bail}; +use prost::Message; +use protocol::function_stream_graph::ConnectorOp; +use protocol::function_stream_graph::connector_op::Config; + +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::factory::connector::sink_props_codec::{ + apply_common_sink_fields, normalized_props, +}; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::factory::operator_constructor::OperatorConstructor; +use crate::runtime::streaming::operators::sink::s3::{ + S3Format, S3SinkOperator, compression_from_str, +}; +use crate::sql::common::constants::connection_format_value; +use crate::sql::common::with_option_keys as opt; + +pub struct S3SinkDispatcher; + +impl OperatorConstructor for S3SinkDispatcher { + fn with_config(&self, payload: &[u8], _registry: Arc) -> Result { + let op = ConnectorOp::decode(payload).context("failed to decode connector op")?; + let props = match op.config { + Some(Config::S3Sink(cfg)) => s3_props(cfg), + _ => bail!("s3 sink expects S3SinkConfig"), + }; + let props = normalized_props(props); + + let format = props + .get(opt::FORMAT) + .map(String::as_str) + .unwrap_or(connection_format_value::PARQUET) + .to_ascii_lowercase(); + let path = props + .get(opt::PATH) + .cloned() + .or_else(|| props.get(opt::SINK_PATH).cloned()) + .unwrap_or_default(); + let compression = + compression_from_str(props.get(opt::PARQUET_COMPRESSION).map(String::as_str))?; + let format = match format.as_str() { + connection_format_value::CSV => S3Format::Csv, + connection_format_value::PARQUET => S3Format::Parquet, + other => bail!("unsupported s3 sink format '{other}'"), + }; + Ok(ConstructedOperator::Operator(Box::new( + S3SinkOperator::try_new(op.name, path, format, compression, props)?, + ))) + } +} + +fn s3_props(cfg: protocol::function_stream_graph::S3SinkConfig) -> HashMap { + let mut props = 
cfg.extra_properties; + props.extend(cfg.runtime_properties); + apply_common_sink_fields(&mut props, cfg.path, cfg.format, cfg.parquet_compression); + if !cfg.bucket.is_empty() { + props.insert(opt::S3_BUCKET.to_string(), cfg.bucket); + } + if !cfg.region.is_empty() { + props.insert(opt::S3_REGION.to_string(), cfg.region); + } + if let Some(v) = cfg.endpoint { + props.insert(opt::S3_ENDPOINT.to_string(), v); + } + if let Some(v) = cfg.access_key_id { + props.insert(opt::S3_ACCESS_KEY_ID.to_string(), v); + } + if let Some(v) = cfg.secret_access_key { + props.insert(opt::S3_SECRET_ACCESS_KEY.to_string(), v); + } + if let Some(v) = cfg.session_token { + props.insert(opt::S3_SESSION_TOKEN.to_string(), v); + } + props +} diff --git a/src/runtime/streaming/factory/connector/sink_props_codec.rs b/src/runtime/streaming/factory/connector/sink_props_codec.rs new file mode 100644 index 00000000..a4128f24 --- /dev/null +++ b/src/runtime/streaming/factory/connector/sink_props_codec.rs @@ -0,0 +1,97 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; + +use anyhow::{Context, Result, bail}; +use protocol::function_stream_graph::{ParquetCompressionProto, SinkFormatProto}; + +use crate::config::global_config::DEFAULT_SINK_BUFFER_MEMORY_BYTES; +use crate::sql::common::constants::{connection_format_value, parquet_compression_value}; +use crate::sql::common::with_option_keys as opt; + +pub fn normalized_props(raw: HashMap) -> HashMap { + raw.into_iter() + .map(|(k, v)| (k.to_ascii_lowercase(), normalize_value(&v))) + .collect() +} + +pub fn apply_common_sink_fields( + props: &mut HashMap, + path: String, + format: i32, + parquet_compression: Option, +) { + if !path.is_empty() { + props.insert(opt::PATH.to_string(), path); + } + if let Some(fmt) = SinkFormatProto::try_from(format) + .ok() + .and_then(sink_format_as_str) + { + props.insert(opt::FORMAT.to_string(), fmt.to_string()); + } + if let Some(comp) = parquet_compression + .and_then(|c| ParquetCompressionProto::try_from(c).ok()) + .and_then(parquet_compression_as_str) + { + props.insert(opt::PARQUET_COMPRESSION.to_string(), comp.to_string()); + } +} + +pub fn sink_format_as_str(v: SinkFormatProto) -> Option<&'static str> { + use SinkFormatProto as F; + match v { + F::SinkFormatCsv => Some(connection_format_value::CSV), + F::SinkFormatJsonl => Some(connection_format_value::JSONL), + F::SinkFormatAvro => Some(connection_format_value::AVRO), + F::SinkFormatParquet => Some(connection_format_value::PARQUET), + F::SinkFormatOrc => Some(connection_format_value::ORC), + F::SinkFormatLance => Some(connection_format_value::LANCE), + F::SinkFormatUnspecified => None, + } +} + +pub fn parquet_compression_as_str(v: ParquetCompressionProto) -> Option<&'static str> { + use ParquetCompressionProto as C; + match v { + C::ParquetCompressionUncompressed => Some(parquet_compression_value::UNCOMPRESSED), + C::ParquetCompressionSnappy => Some(parquet_compression_value::SNAPPY), + C::ParquetCompressionGzip => Some(parquet_compression_value::GZIP), + 
C::ParquetCompressionZstd => Some(parquet_compression_value::ZSTD), + C::ParquetCompressionLz4 => Some(parquet_compression_value::LZ4), + C::ParquetCompressionLz4Raw => Some(parquet_compression_value::LZ4_RAW), + C::ParquetCompressionUnspecified => None, + } +} + +pub fn parse_sink_memory_bytes(props: &HashMap) -> Result { + let parsed = match props.get(opt::SINK_MEMORY_BYTES) { + Some(raw) => raw + .parse::() + .with_context(|| format!("invalid '{}' value '{}'", opt::SINK_MEMORY_BYTES, raw))?, + None => DEFAULT_SINK_BUFFER_MEMORY_BYTES, + }; + if parsed == 0 { + bail!("'{}' must be > 0", opt::SINK_MEMORY_BYTES); + } + Ok(parsed) +} + +fn normalize_value(v: &str) -> String { + let s = v.trim(); + if (s.starts_with('\'') && s.ends_with('\'')) || (s.starts_with('"') && s.ends_with('"')) { + s[1..s.len() - 1].to_string() + } else { + s.to_string() + } +} diff --git a/src/runtime/streaming/format/encoder.rs b/src/runtime/streaming/format/encoder.rs new file mode 100644 index 00000000..198b06b2 --- /dev/null +++ b/src/runtime/streaming/format/encoder.rs @@ -0,0 +1,427 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::io::Cursor; + +use anyhow::{Context, Result, bail}; +use apache_avro::types::Value as AvroValue; +use apache_avro::{Codec as AvroCodec, Schema as AvroSchema, Writer as AvroWriter}; +use arrow::csv::WriterBuilder as CsvWriterBuilder; +use arrow::json::LineDelimitedWriter; +use arrow_array::{ + Array, BinaryArray, BooleanArray, Float32Array, Float64Array, Int8Array, Int16Array, + Int32Array, Int64Array, LargeBinaryArray, LargeStringArray, RecordBatch, StringArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt8Array, UInt16Array, UInt32Array, UInt64Array, +}; +use arrow_ipc::writer::FileWriter as ArrowIpcFileWriter; +use arrow_schema::{DataType, Field, TimeUnit}; +use parquet::arrow::ArrowWriter; +use parquet::basic::Compression; +use parquet::file::properties::WriterProperties; +use serde_json::{Map as JsonMap, Value as JsonValue}; + +/// Pure in-memory format encoder for sink payload generation. +/// It only converts `RecordBatch` into bytes and does not perform any I/O. +pub struct FormatEncoder; + +impl FormatEncoder { + /// Encode batches into CSV bytes. + pub fn encode_csv(batches: &[RecordBatch]) -> Result> { + if batches.is_empty() { + return Ok(Vec::new()); + } + + let mut out = Vec::new(); + let mut writer = CsvWriterBuilder::new().with_header(true).build(&mut out); + for batch in batches { + writer + .write(batch) + .context("failed to encode record batch to CSV")?; + } + drop(writer); + Ok(out) + } + + /// Encode batches into Parquet bytes. 
+ pub fn encode_parquet(batches: &[RecordBatch], compression: Compression) -> Result> { + if batches.is_empty() { + return Ok(Vec::new()); + } + + let schema = batches[0].schema(); + let props = WriterProperties::builder() + .set_compression(compression) + .build(); + let mut cursor = Cursor::new(Vec::new()); + let mut writer = ArrowWriter::try_new(&mut cursor, schema, Some(props)) + .context("failed to init parquet writer")?; + + for batch in batches { + writer + .write(batch) + .context("failed to encode record batch to parquet")?; + } + writer.close().context("failed to finalize parquet")?; + Ok(cursor.into_inner()) + } + + /// Encode batches into NDJSON (JSON Lines) bytes. + pub fn encode_jsonl(batches: &[RecordBatch]) -> Result> { + if batches.is_empty() { + return Ok(Vec::new()); + } + + let mut out = Vec::new(); + let mut writer = LineDelimitedWriter::new(&mut out); + for batch in batches { + writer + .write(batch) + .context("failed to encode record batch to JSONL")?; + } + writer.finish().context("failed to finalize JSONL stream")?; + Ok(out) + } + + pub fn encode_avro(batches: &[RecordBatch]) -> Result> { + if batches.is_empty() { + return Ok(Vec::new()); + } + + let schema_json = build_avro_schema_json(&batches[0])?; + let avro_schema = + AvroSchema::parse_str(&schema_json).context("failed to parse generated avro schema")?; + let mut writer = AvroWriter::with_codec(&avro_schema, Vec::new(), AvroCodec::Null); + + for batch in batches { + let accessors = build_column_accessors(batch)?; + let schema = batch.schema(); + let fields = schema.fields(); + let col_names: Vec = fields.iter().map(|f| f.name().clone()).collect(); + + for row_idx in 0..batch.num_rows() { + let mut row_records = Vec::with_capacity(accessors.len()); + for (col_idx, accessor) in accessors.iter().enumerate() { + let nullable = fields[col_idx].is_nullable(); + let val = accessor.avro_value_at(row_idx, nullable)?; + row_records.push((col_names[col_idx].clone(), val)); + } + writer + 
.append(AvroValue::Record(row_records)) + .map_err(|e| anyhow::anyhow!("failed to append row into avro writer: {e}"))?; + } + } + + writer.flush().context("failed to flush avro writer")?; + writer + .into_inner() + .context("failed to finalize avro container bytes") + } + + pub fn encode_orc(batches: &[RecordBatch]) -> Result> { + if batches.is_empty() { + return Ok(Vec::new()); + } + + let schema = batches[0].schema(); + let mut out = Cursor::new(Vec::new()); + let mut writer = ArrowIpcFileWriter::try_new(&mut out, &schema) + .context("failed to init ORC-compatible file writer")?; + + for batch in batches { + writer + .write(batch) + .context("failed to encode record batch into ORC-compatible payload")?; + } + + writer + .finish() + .context("failed to finalize ORC-compatible payload")?; + Ok(out.into_inner()) + } +} + +fn build_avro_schema_json(batch: &RecordBatch) -> Result { + let fields = batch + .schema() + .fields() + .iter() + .map(|field| { + let avro_type = avro_type_for_field(field)?; + let field_type = if field.is_nullable() { + JsonValue::Array(vec![JsonValue::String("null".to_string()), avro_type]) + } else { + avro_type + }; + Ok(JsonValue::Object(JsonMap::from_iter([ + ( + "name".to_string(), + JsonValue::String(field.name().to_string()), + ), + ("type".to_string(), field_type), + ("default".to_string(), JsonValue::Null), + ]))) + }) + .collect::>>()?; + + let schema = JsonValue::Object(JsonMap::from_iter([ + ("type".to_string(), JsonValue::String("record".to_string())), + ( + "name".to_string(), + JsonValue::String("FunctionStreamRecord".to_string()), + ), + ("fields".to_string(), JsonValue::Array(fields)), + ])); + + serde_json::to_string(&schema).context("failed to serialize avro schema json") +} + +fn avro_type_for_field(field: &Field) -> Result { + let ty = match field.data_type() { + DataType::Boolean => JsonValue::String("boolean".to_string()), + DataType::Int8 | DataType::Int16 | DataType::Int32 => JsonValue::String("int".to_string()), + 
DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::Int64 + | DataType::UInt64 => JsonValue::String("long".to_string()), + DataType::Float32 => JsonValue::String("float".to_string()), + DataType::Float64 => JsonValue::String("double".to_string()), + DataType::Utf8 | DataType::LargeUtf8 => JsonValue::String("string".to_string()), + DataType::Binary | DataType::LargeBinary => JsonValue::String("bytes".to_string()), + DataType::Date32 => JsonValue::Object(JsonMap::from_iter([ + ("type".to_string(), JsonValue::String("int".to_string())), + ( + "logicalType".to_string(), + JsonValue::String("date".to_string()), + ), + ])), + DataType::Timestamp(unit, _) => { + let logical = match unit { + TimeUnit::Second | TimeUnit::Millisecond => "timestamp-millis", + TimeUnit::Microsecond | TimeUnit::Nanosecond => "timestamp-micros", + }; + JsonValue::Object(JsonMap::from_iter([ + ("type".to_string(), JsonValue::String("long".to_string())), + ( + "logicalType".to_string(), + JsonValue::String(logical.to_string()), + ), + ])) + } + other => bail!("unsupported data type for avro encoding: {other:?}"), + }; + Ok(ty) +} + +/// Downcasts each column once per batch; row iteration only matches on this enum. 
+enum ColumnAccessor<'a> { + Boolean(&'a BooleanArray), + Int8(&'a Int8Array), + Int16(&'a Int16Array), + Int32(&'a Int32Array), + Int64(&'a Int64Array), + UInt8(&'a UInt8Array), + UInt16(&'a UInt16Array), + UInt32(&'a UInt32Array), + UInt64(&'a UInt64Array), + Float32(&'a Float32Array), + Float64(&'a Float64Array), + Utf8(&'a StringArray), + LargeUtf8(&'a LargeStringArray), + Binary(&'a BinaryArray), + LargeBinary(&'a LargeBinaryArray), + Date32(&'a Int32Array), + TimestampSec(&'a TimestampSecondArray), + TimestampMs(&'a TimestampMillisecondArray), + TimestampUs(&'a TimestampMicrosecondArray), + TimestampNs(&'a TimestampNanosecondArray), +} + +impl<'a> ColumnAccessor<'a> { + fn avro_value_at(&self, row: usize, nullable: bool) -> Result { + let is_null = match self { + Self::Boolean(a) => a.is_null(row), + Self::Int8(a) => a.is_null(row), + Self::Int16(a) => a.is_null(row), + Self::Int32(a) => a.is_null(row), + Self::Int64(a) => a.is_null(row), + Self::UInt8(a) => a.is_null(row), + Self::UInt16(a) => a.is_null(row), + Self::UInt32(a) => a.is_null(row), + Self::UInt64(a) => a.is_null(row), + Self::Float32(a) => a.is_null(row), + Self::Float64(a) => a.is_null(row), + Self::Utf8(a) => a.is_null(row), + Self::LargeUtf8(a) => a.is_null(row), + Self::Binary(a) => a.is_null(row), + Self::LargeBinary(a) => a.is_null(row), + Self::Date32(a) => a.is_null(row), + Self::TimestampSec(a) => a.is_null(row), + Self::TimestampMs(a) => a.is_null(row), + Self::TimestampUs(a) => a.is_null(row), + Self::TimestampNs(a) => a.is_null(row), + }; + + if is_null { + if !nullable { + bail!("null value in non-nullable avro field at row {row}"); + } + return Ok(AvroValue::Union(0, Box::new(AvroValue::Null))); + } + + let raw = match self { + Self::Boolean(a) => AvroValue::Boolean(a.value(row)), + Self::Int8(a) => AvroValue::Int(i32::from(a.value(row))), + Self::Int16(a) => AvroValue::Int(i32::from(a.value(row))), + Self::Int32(a) => AvroValue::Int(a.value(row)), + Self::Int64(a) => 
AvroValue::Long(a.value(row)), + Self::UInt8(a) => AvroValue::Int(i32::from(a.value(row))), + Self::UInt16(a) => AvroValue::Int(i32::from(a.value(row))), + Self::UInt32(a) => AvroValue::Long(i64::from(a.value(row))), + Self::UInt64(a) => { + let v = a.value(row); + AvroValue::Long(i64::try_from(v).with_context(|| { + format!("UInt64 value {v} does not fit Avro long for row {row}") + })?) + } + Self::Float32(a) => { + let v = a.value(row); + if !v.is_finite() { + bail!("non-finite f32 at row {row}: {v}"); + } + AvroValue::Float(v) + } + Self::Float64(a) => { + let v = a.value(row); + if !v.is_finite() { + bail!("non-finite f64 at row {row}: {v}"); + } + AvroValue::Double(v) + } + Self::Utf8(a) => AvroValue::String(a.value(row).to_string()), + Self::LargeUtf8(a) => AvroValue::String(a.value(row).to_string()), + Self::Binary(a) => AvroValue::Bytes(a.value(row).to_vec()), + Self::LargeBinary(a) => AvroValue::Bytes(a.value(row).to_vec()), + Self::Date32(a) => AvroValue::Int(a.value(row)), + Self::TimestampSec(a) => AvroValue::Long(a.value(row).saturating_mul(1000)), + Self::TimestampMs(a) => AvroValue::Long(a.value(row)), + Self::TimestampUs(a) => AvroValue::Long(a.value(row)), + Self::TimestampNs(a) => AvroValue::Long(a.value(row) / 1000), + }; + + Ok(if nullable { + AvroValue::Union(1, Box::new(raw)) + } else { + raw + }) + } +} + +fn build_column_accessors(batch: &RecordBatch) -> Result>> { + let mut accessors = Vec::with_capacity(batch.num_columns()); + for col in batch.columns() { + let accessor = match col.data_type() { + DataType::Boolean => ColumnAccessor::Boolean( + col.as_any() + .downcast_ref() + .context("expected BooleanArray")?, + ), + DataType::Int8 => { + ColumnAccessor::Int8(col.as_any().downcast_ref().context("expected Int8Array")?) + } + DataType::Int16 => { + ColumnAccessor::Int16(col.as_any().downcast_ref().context("expected Int16Array")?) + } + DataType::Int32 => { + ColumnAccessor::Int32(col.as_any().downcast_ref().context("expected Int32Array")?) 
+ } + DataType::Int64 => { + ColumnAccessor::Int64(col.as_any().downcast_ref().context("expected Int64Array")?) + } + DataType::UInt8 => { + ColumnAccessor::UInt8(col.as_any().downcast_ref().context("expected UInt8Array")?) + } + DataType::UInt16 => ColumnAccessor::UInt16( + col.as_any() + .downcast_ref() + .context("expected UInt16Array")?, + ), + DataType::UInt32 => ColumnAccessor::UInt32( + col.as_any() + .downcast_ref() + .context("expected UInt32Array")?, + ), + DataType::UInt64 => ColumnAccessor::UInt64( + col.as_any() + .downcast_ref() + .context("expected UInt64Array")?, + ), + DataType::Float32 => ColumnAccessor::Float32( + col.as_any() + .downcast_ref() + .context("expected Float32Array")?, + ), + DataType::Float64 => ColumnAccessor::Float64( + col.as_any() + .downcast_ref() + .context("expected Float64Array")?, + ), + DataType::Utf8 => ColumnAccessor::Utf8( + col.as_any() + .downcast_ref() + .context("expected StringArray")?, + ), + DataType::LargeUtf8 => ColumnAccessor::LargeUtf8( + col.as_any() + .downcast_ref() + .context("expected LargeStringArray")?, + ), + DataType::Binary => ColumnAccessor::Binary( + col.as_any() + .downcast_ref() + .context("expected BinaryArray")?, + ), + DataType::LargeBinary => ColumnAccessor::LargeBinary( + col.as_any() + .downcast_ref() + .context("expected LargeBinaryArray")?, + ), + DataType::Date32 => { + ColumnAccessor::Date32(col.as_any().downcast_ref().context("expected Int32Array")?) 
+ } + DataType::Timestamp(TimeUnit::Second, _) => ColumnAccessor::TimestampSec( + col.as_any() + .downcast_ref() + .context("expected TimestampSecondArray")?, + ), + DataType::Timestamp(TimeUnit::Millisecond, _) => ColumnAccessor::TimestampMs( + col.as_any() + .downcast_ref() + .context("expected TimestampMillisecondArray")?, + ), + DataType::Timestamp(TimeUnit::Microsecond, _) => ColumnAccessor::TimestampUs( + col.as_any() + .downcast_ref() + .context("expected TimestampMicrosecondArray")?, + ), + DataType::Timestamp(TimeUnit::Nanosecond, _) => ColumnAccessor::TimestampNs( + col.as_any() + .downcast_ref() + .context("expected TimestampNanosecondArray")?, + ), + other => bail!("unsupported data type for avro column accessor: {other:?}"), + }; + accessors.push(accessor); + } + Ok(accessors) +} diff --git a/src/runtime/streaming/format/mod.rs b/src/runtime/streaming/format/mod.rs index d5e63a9d..3112a1fe 100644 --- a/src/runtime/streaming/format/mod.rs +++ b/src/runtime/streaming/format/mod.rs @@ -12,6 +12,7 @@ pub mod config; pub mod deserializer; +pub mod encoder; pub mod json_encoder; pub mod serializer; diff --git a/src/runtime/streaming/job/job_manager.rs b/src/runtime/streaming/job/job_manager.rs index a9bc546f..e4b9916b 100644 --- a/src/runtime/streaming/job/job_manager.rs +++ b/src/runtime/streaming/job/job_manager.rs @@ -24,9 +24,7 @@ use tokio_stream::wrappers::ReceiverStream; use tracing::{error, info, warn}; use protocol::function_stream_graph::{ChainedOperator, FsProgram}; -use protocol::storage::{ - KafkaSourceSubtaskCheckpoint, SourceCheckpointPayload, source_checkpoint_payload, -}; +use protocol::storage::{SourceCheckpointInfo, source_checkpoint_info}; use crate::config::{ DEFAULT_CHECKPOINT_INTERVAL_MS, DEFAULT_OPERATOR_STATE_STORE_MEMORY_BYTES, @@ -157,7 +155,6 @@ struct CheckpointCoordinatorConfig { expected_pipeline_ids: HashSet, interval_ms: u64, start_epoch: u64, - job_state_dir: PathBuf, timeout: Duration, } @@ -170,25 +167,6 @@ impl 
PipelineRunner { } } -fn decode_kafka_checkpoints_from_source_payloads( - payloads: Vec, - epoch: u64, -) -> Vec { - let mut out = Vec::new(); - for p in payloads { - match p.checkpoint { - Some(source_checkpoint_payload::Checkpoint::Kafka(mut cp)) => { - if cp.checkpoint_epoch != epoch { - cp.checkpoint_epoch = epoch; - } - out.push(cp); - } - None => warn!("Skip empty source checkpoint payload"), - } - } - out -} - impl JobManager { pub fn new( operator_factory: Arc, @@ -255,7 +233,7 @@ impl JobManager { self.state_config.pipeline_parallelism } - /// Per-job state directory (Kafka offset snapshots, operator state roots, etc.). + /// Per-job state directory (source offset snapshots, operator state roots, etc.). #[inline] pub fn job_state_directory(&self, job_id: &str) -> PathBuf { self.state_base_dir.join(job_id) @@ -267,6 +245,7 @@ impl JobManager { program: FsProgram, custom_checkpoint_interval_ms: Option, recovery_epoch: Option, + source_checkpoint_infos: Vec, ) -> Result { let mut edge_manager = EdgeManager::build(&program.nodes, &program.edges); let mut pipelines = HashMap::with_capacity(program.nodes.len()); @@ -295,6 +274,7 @@ impl JobManager { &job_state_dir, job_master_tx.clone(), safe_epoch, + &source_checkpoint_infos, ) .with_context(|| { format!( @@ -322,7 +302,6 @@ impl JobManager { expected_pipeline_ids, interval_ms, start_epoch: safe_epoch + 1, - job_state_dir: job_state_dir.clone(), timeout: Duration::from_millis(interval_ms.max(1) * 3), }); @@ -530,6 +509,7 @@ impl JobManager { job_state_dir: &Path, job_master_tx: mpsc::Sender, recovery_epoch: u64, + source_checkpoint_infos: &[SourceCheckpointInfo], ) -> Result<(PhysicalPipeline, bool)> { let (raw_inboxes, raw_outboxes) = edge_manager.take_endpoints(pipeline_id).with_context(|| { @@ -610,7 +590,20 @@ impl JobManager { Some(job_master_tx.clone()), ); - let runner = if let Some(source) = chain.source { + let runner = if let Some(mut source) = chain.source { + // Filter checkpoint records for this 
pipeline and inject into the source operator + // so it can restore partition offsets in on_start without touching TaskContext. + let pipeline_checkpoint_infos: Vec = source_checkpoint_infos + .iter() + .filter(|info| match &info.info { + Some(source_checkpoint_info::Info::Kafka(cp)) => cp.pipeline_id == pipeline_id, + None => false, + }) + .cloned() + .collect(); + if !pipeline_checkpoint_infos.is_empty() { + source.set_recovery_checkpoint(pipeline_checkpoint_infos); + } let chain_head = ChainBuilder::build(chain.operators); PipelineRunner::Source(SourceDriver::new(source, chain_head, ctx, control_rx)) } else { @@ -733,7 +726,6 @@ impl JobManager { expected_pipeline_ids, interval_ms, start_epoch, - job_state_dir, timeout, } = cfg; if interval_ms == 0 { @@ -749,7 +741,7 @@ impl JobManager { epoch: u64, missing_acks: HashSet, start_time: Instant, - source_reports: Vec, + source_infos: Vec, } let mut active_checkpoint: Option = None; @@ -768,15 +760,15 @@ impl JobManager { JobMasterEvent::CheckpointAck { pipeline_id, epoch, - source_payloads, + source_infos, } => { if let Some(pending) = &mut active_checkpoint { if pending.epoch != epoch { continue; } pending.missing_acks.remove(&pipeline_id); - if !source_payloads.is_empty() { - pending.source_reports.extend(source_payloads); + if !source_infos.is_empty() { + pending.source_infos.extend(source_infos); } if pending.missing_acks.is_empty() { @@ -786,16 +778,13 @@ impl JobManager { ); let completed = active_checkpoint.take().expect("active checkpoint exists"); - let kf = decode_kafka_checkpoints_from_source_payloads(completed.source_reports, epoch); - let epoch_u32 = u32::try_from(epoch).unwrap_or(u32::MAX); let mut catalog_ok = true; if let Some(catalog) = CatalogManager::try_global() { if let Err(e) = catalog.commit_job_checkpoint( &job_id, epoch, - &job_state_dir, - kf, + completed.source_infos, ) { catalog_ok = false; error!( @@ -812,9 +801,9 @@ impl JobManager { } let phase2 = if catalog_ok { - 
ControlCommand::Commit { epoch: epoch_u32 } + ControlCommand::Commit { epoch } } else { - ControlCommand::AbortCheckpoint { epoch: epoch_u32 } + ControlCommand::AbortCheckpoint { epoch } }; broadcast_cmd(phase2); } @@ -828,9 +817,7 @@ impl JobManager { job_id = %job_id, epoch = epoch, pipeline_id = pipeline_id, reason = %reason, "Checkpoint FAILED!" ); - broadcast_cmd(ControlCommand::AbortCheckpoint { - epoch: u32::try_from(epoch).unwrap_or(u32::MAX), - }); + broadcast_cmd(ControlCommand::AbortCheckpoint { epoch }); active_checkpoint = None; } } @@ -846,7 +833,7 @@ impl JobManager { "Checkpoint timed out; aborting active epoch" ); broadcast_cmd(ControlCommand::AbortCheckpoint { - epoch: u32::try_from(pending.epoch).unwrap_or(u32::MAX), + epoch: pending.epoch, }); } else { continue; @@ -862,7 +849,7 @@ impl JobManager { info!(job_id = %job_id, epoch = current_epoch, "Triggering global Checkpoint Barrier."); let barrier = CheckpointBarrier { - epoch: current_epoch as u32, + epoch: current_epoch, min_epoch: 0, timestamp: std::time::SystemTime::now(), then_stop: false, @@ -871,7 +858,7 @@ impl JobManager { epoch: current_epoch, missing_acks: expected_pipeline_ids.clone(), start_time: Instant::now(), - source_reports: Vec::new(), + source_infos: Vec::new(), }); for tx in &source_control_txs { diff --git a/src/runtime/streaming/operators/grouping/incremental_aggregate.rs b/src/runtime/streaming/operators/grouping/incremental_aggregate.rs index a2325e7c..ff997c11 100644 --- a/src/runtime/streaming/operators/grouping/incremental_aggregate.rs +++ b/src/runtime/streaming/operators/grouping/incremental_aggregate.rs @@ -939,7 +939,7 @@ impl Operator for IncrementalAggregatingFunc { // Flush to Parquet store - .prepare_checkpoint_epoch(barrier.epoch as u64) + .prepare_checkpoint_epoch(barrier.epoch) .map_err(|e| anyhow!("Snapshot failed: {e}"))?; info!( @@ -952,11 +952,11 @@ impl Operator for IncrementalAggregatingFunc { Ok(()) } - async fn commit_checkpoint(&mut self, epoch: 
u32, _ctx: &mut TaskContext) -> Result<()> { + async fn commit_checkpoint(&mut self, epoch: u64, _ctx: &mut TaskContext) -> Result<()> { self.state_store .as_ref() .expect("state store not initialized") - .commit_checkpoint_epoch(epoch as u64) + .commit_checkpoint_epoch(epoch) .map_err(|e| anyhow!("Commit checkpoint failed: {e}"))?; Ok(()) } diff --git a/src/runtime/streaming/operators/joins/join_instance.rs b/src/runtime/streaming/operators/joins/join_instance.rs index 098e5a73..5c04c1fb 100644 --- a/src/runtime/streaming/operators/joins/join_instance.rs +++ b/src/runtime/streaming/operators/joins/join_instance.rs @@ -360,16 +360,16 @@ impl Operator for InstantJoinOperator { self.state_store .as_ref() .unwrap() - .prepare_checkpoint_epoch(barrier.epoch as u64) + .prepare_checkpoint_epoch(barrier.epoch) .map_err(|e| anyhow!("Snapshot failed: {e}"))?; Ok(()) } - async fn commit_checkpoint(&mut self, epoch: u32, _ctx: &mut TaskContext) -> Result<()> { + async fn commit_checkpoint(&mut self, epoch: u64, _ctx: &mut TaskContext) -> Result<()> { self.state_store .as_ref() .unwrap() - .commit_checkpoint_epoch(epoch as u64) + .commit_checkpoint_epoch(epoch) .map_err(|e| anyhow!("Commit checkpoint failed: {e}"))?; Ok(()) } diff --git a/src/runtime/streaming/operators/joins/join_with_expiration.rs b/src/runtime/streaming/operators/joins/join_with_expiration.rs index 6a2a240c..5ed8dfa3 100644 --- a/src/runtime/streaming/operators/joins/join_with_expiration.rs +++ b/src/runtime/streaming/operators/joins/join_with_expiration.rs @@ -314,18 +314,18 @@ impl Operator for JoinWithExpirationOperator { .expect("State store not initialized"); store - .prepare_checkpoint_epoch(barrier.epoch as u64) + .prepare_checkpoint_epoch(barrier.epoch) .map_err(|e| anyhow!("Snapshot failed: {e}"))?; info!(epoch = barrier.epoch, "Join Operator snapshotted state."); Ok(()) } - async fn commit_checkpoint(&mut self, epoch: u32, _ctx: &mut TaskContext) -> Result<()> { + async fn commit_checkpoint(&mut 
self, epoch: u64, _ctx: &mut TaskContext) -> Result<()> { self.state_store .as_ref() .expect("State store not initialized") - .commit_checkpoint_epoch(epoch as u64) + .commit_checkpoint_epoch(epoch) .map_err(|e| anyhow!("Commit checkpoint failed: {e}"))?; Ok(()) } diff --git a/src/runtime/streaming/operators/sink/delta/mod.rs b/src/runtime/streaming/operators/sink/delta/mod.rs new file mode 100644 index 00000000..4df6b3b5 --- /dev/null +++ b/src/runtime/streaming/operators/sink/delta/mod.rs @@ -0,0 +1,273 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::fs::create_dir_all; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::{Context, Result}; +use arrow_array::RecordBatch; +use async_trait::async_trait; +use bytes::Bytes; +use object_store::aws::AmazonS3Builder; +use object_store::path::Path as ObjectStorePath; +use object_store::{ObjectStore, PutPayload}; +use parquet::basic::Compression; +use tokio::io::AsyncWriteExt; +use tracing::{debug, info, warn}; + +use crate::runtime::memory::{MemoryBlock, try_global_memory_pool}; +use crate::runtime::streaming::StreamOutput; +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::{Collector, Operator}; +use crate::runtime::streaming::format::encoder::FormatEncoder; +use crate::sql::common::constants::factory_operator_name; +use crate::sql::common::with_option_keys as opt; +use crate::sql::common::{CheckpointBarrier, Watermark}; + +/// Flush early when buffered batches exceed this size. +const DEFAULT_MAX_BUFFER_BYTES: usize = 64 * 1024 * 1024; + +enum DeltaDestination { + Local(PathBuf), + S3 { + prefix: String, + client: Arc, + }, +} + +pub struct DeltaSinkOperator { + table_name: String, + destination: DeltaDestination, + parquet_compression: Compression, + pending: Vec, + pending_bytes: usize, + sink_memory_block: Option>, + early_flush_threshold_bytes: usize, + file_counter: u64, + format: DeltaFormat, +} + +#[derive(Debug, Clone, Copy)] +pub enum DeltaFormat { + Csv, + Parquet, + JsonL, + Avro, + Orc, +} + +impl DeltaSinkOperator { + pub fn try_new( + table_name: String, + path: String, + format: DeltaFormat, + parquet_compression: Compression, + sink_memory_bytes: u64, + options: HashMap, + ) -> Result { + let destination = if let Some(bucket) = options.get(opt::S3_BUCKET) { + let region = options + .get(opt::S3_REGION) + .cloned() + .unwrap_or_else(|| "us-east-1".to_string()); + let mut builder = AmazonS3Builder::new() + .with_bucket_name(bucket.clone()) + 
.with_region(region); + if let Some(endpoint) = options.get(opt::S3_ENDPOINT) { + builder = builder.with_endpoint(endpoint); + if endpoint.to_ascii_lowercase().starts_with("http://") { + builder = builder.with_allow_http(true); + } + } + if let Some(v) = options.get(opt::S3_ACCESS_KEY_ID) { + builder = builder.with_access_key_id(v); + } + if let Some(v) = options.get(opt::S3_SECRET_ACCESS_KEY) { + builder = builder.with_secret_access_key(v); + } + if let Some(v) = options.get(opt::S3_SESSION_TOKEN) { + builder = builder.with_token(v); + } + let client = builder + .build() + .context("failed to build s3 client for delta sink")?; + DeltaDestination::S3 { + prefix: path.trim_matches('/').to_string(), + client: Arc::new(client), + } + } else { + let root = PathBuf::from(path.clone()); + create_dir_all(&root) + .with_context(|| format!("failed to create delta sink dir {}", root.display()))?; + DeltaDestination::Local(root) + }; + + let mut sink_memory_block = None; + let reserve_bytes = usize::try_from(sink_memory_bytes).unwrap_or(DEFAULT_MAX_BUFFER_BYTES); + let mut early_flush_threshold_bytes = reserve_bytes; + if let Ok(pool) = try_global_memory_pool() + && let Ok(block) = pool.try_request_block(reserve_bytes as u64) + { + early_flush_threshold_bytes = ((block.capacity() as usize) * 8) / 10; + sink_memory_block = Some(block); + } + + Ok(Self { + table_name, + destination, + parquet_compression, + pending: Vec::new(), + pending_bytes: 0, + sink_memory_block, + early_flush_threshold_bytes, + file_counter: 0, + format, + }) + } + + async fn flush_epoch(&mut self, epoch: u64, subtask_idx: usize) -> Result<()> { + if self.pending.is_empty() { + return Ok(()); + } + + let batches = std::mem::take(&mut self.pending); + let format = self.format; + let compression = self.parquet_compression; + let bytes = tokio::task::spawn_blocking(move || -> Result> { + match format { + DeltaFormat::Csv => FormatEncoder::encode_csv(&batches), + DeltaFormat::Parquet => 
FormatEncoder::encode_parquet(&batches, compression), + DeltaFormat::JsonL => FormatEncoder::encode_jsonl(&batches), + DeltaFormat::Avro => FormatEncoder::encode_avro(&batches), + DeltaFormat::Orc => FormatEncoder::encode_orc(&batches), + } + }) + .await + .context("tokio blocking task panicked during serialization")??; + + self.file_counter += 1; + let file_name = format!( + "delta-part-{:05}-epoch-{:010}-{:06}.{}", + subtask_idx, + epoch, + self.file_counter, + match self.format { + DeltaFormat::Csv => "csv", + DeltaFormat::Parquet => "parquet", + DeltaFormat::JsonL => "jsonl", + DeltaFormat::Avro => "avro", + DeltaFormat::Orc => "orc", + } + ); + match &self.destination { + DeltaDestination::Local(root) => { + let out = root.join(file_name); + let mut f = tokio::fs::File::create(&out).await.with_context(|| { + format!("failed creating delta sink file {}", out.display()) + })?; + f.write_all(&bytes) + .await + .with_context(|| format!("failed writing delta sink file {}", out.display()))?; + } + DeltaDestination::S3 { prefix, client } => { + let key = if prefix.is_empty() { + file_name + } else { + format!("{prefix}/{file_name}") + }; + client + .put( + &ObjectStorePath::from(key), + PutPayload::from(Bytes::from(bytes)), + ) + .await + .context("failed writing object to s3")?; + } + } + self.pending_bytes = 0; + Ok(()) + } +} + +#[async_trait] +impl Operator for DeltaSinkOperator { + fn name(&self) -> &str { + factory_operator_name::CONNECTOR_SINK + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + let reserved_block_bytes = self + .sink_memory_block + .as_ref() + .map(|b| b.capacity()) + .unwrap_or(0); + info!( + table = %self.table_name, + format = ?self.format, + reserved_block_bytes, + early_flush_threshold_bytes = self.early_flush_threshold_bytes, + "Starting delta sink operator" + ); + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + _collector: &mut dyn 
Collector, + ) -> Result<()> { + let batch_size = batch.get_array_memory_size(); + self.pending.push(batch); + self.pending_bytes += batch_size; + + if self.pending_bytes > self.early_flush_threshold_bytes { + debug!( + table = %self.table_name, + bytes = self.pending_bytes, + "memory watermark reached, triggering early flush" + ); + self.flush_epoch(0, ctx.subtask_index as usize).await?; + } + Ok(()) + } + + async fn process_watermark( + &mut self, + _watermark: Watermark, + _ctx: &mut TaskContext, + _collector: &mut dyn Collector, + ) -> Result<()> { + Ok(()) + } + + async fn snapshot_state( + &mut self, + barrier: CheckpointBarrier, + ctx: &mut TaskContext, + ) -> Result<()> { + self.flush_epoch(barrier.epoch, ctx.subtask_index as usize) + .await + } + + async fn on_close(&mut self, ctx: &mut TaskContext) -> Result> { + if !self.pending.is_empty() { + warn!(table = %self.table_name, "flushing remaining delta sink batches on close"); + self.flush_epoch(u64::MAX, ctx.subtask_index as usize) + .await?; + } + Ok(vec![]) + } +} diff --git a/src/runtime/streaming/operators/sink/filesystem/mod.rs b/src/runtime/streaming/operators/sink/filesystem/mod.rs new file mode 100644 index 00000000..a865a752 --- /dev/null +++ b/src/runtime/streaming/operators/sink/filesystem/mod.rs @@ -0,0 +1,233 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fs::create_dir_all; +use std::path::PathBuf; + +use anyhow::{Context, Result, bail}; +use arrow_array::RecordBatch; +use async_trait::async_trait; +use parquet::basic::Compression; +use tokio::io::AsyncWriteExt; +use tracing::{debug, info, warn}; + +use crate::runtime::memory::{MemoryBlock, try_global_memory_pool}; +use crate::runtime::streaming::StreamOutput; +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::{Collector, Operator}; +use crate::runtime::streaming::format::encoder::FormatEncoder; +use crate::sql::common::constants::factory_operator_name; +use crate::sql::common::{CheckpointBarrier, Watermark}; + +const DEFAULT_MAX_BUFFER_BYTES: usize = 64 * 1024 * 1024; + +#[derive(Debug, Clone, Copy)] +pub enum FilesystemFormat { + Csv, + Parquet, + JsonL, + Avro, + Orc, +} + +pub struct FilesystemSinkOperator { + table_name: String, + output_dir: PathBuf, + format: FilesystemFormat, + parquet_compression: Compression, + pending: Vec, + pending_bytes: usize, + sink_memory_block: Option>, + early_flush_threshold_bytes: usize, + file_counter: u64, +} + +impl FilesystemSinkOperator { + pub fn try_new( + table_name: String, + output_dir: String, + format: FilesystemFormat, + parquet_compression: Compression, + sink_memory_bytes: u64, + ) -> Result { + let output_dir_path = PathBuf::from(&output_dir); + create_dir_all(&output_dir_path).with_context(|| { + format!( + "failed to create filesystem sink directory {}", + output_dir_path.display() + ) + })?; + + let mut sink_memory_block = None; + let reserve_bytes = usize::try_from(sink_memory_bytes).unwrap_or(DEFAULT_MAX_BUFFER_BYTES); + let mut early_flush_threshold_bytes = reserve_bytes; + if let Ok(pool) = try_global_memory_pool() + && let Ok(block) = pool.try_request_block(reserve_bytes as u64) + { + early_flush_threshold_bytes = ((block.capacity() as usize) * 8) / 10; + sink_memory_block = Some(block); + } + + Ok(Self { + table_name, + output_dir: 
output_dir_path, + format, + parquet_compression, + pending: Vec::new(), + pending_bytes: 0, + sink_memory_block, + early_flush_threshold_bytes, + file_counter: 0, + }) + } + + fn extension(&self) -> &'static str { + match self.format { + FilesystemFormat::Csv => "csv", + FilesystemFormat::Parquet => "parquet", + FilesystemFormat::JsonL => "jsonl", + FilesystemFormat::Avro => "avro", + FilesystemFormat::Orc => "orc", + } + } + + async fn flush_file_epoch(&mut self, epoch: u64, subtask_idx: usize) -> Result<()> { + if self.pending.is_empty() { + return Ok(()); + } + + let batches = std::mem::take(&mut self.pending); + let format = self.format; + let compression = self.parquet_compression; + let bytes = tokio::task::spawn_blocking(move || -> Result> { + match format { + FilesystemFormat::Csv => FormatEncoder::encode_csv(&batches), + FilesystemFormat::Parquet => FormatEncoder::encode_parquet(&batches, compression), + FilesystemFormat::JsonL => FormatEncoder::encode_jsonl(&batches), + FilesystemFormat::Avro => FormatEncoder::encode_avro(&batches), + FilesystemFormat::Orc => FormatEncoder::encode_orc(&batches), + } + }) + .await + .context("tokio blocking task panicked during serialization")??; + + if bytes.is_empty() { + self.pending_bytes = 0; + return Ok(()); + } + + self.file_counter += 1; + let file_name = format!( + "part-{:05}-epoch-{:010}-{:06}.{}", + subtask_idx, + epoch, + self.file_counter, + self.extension() + ); + let output_file = self.output_dir.join(file_name); + let mut f = tokio::fs::File::create(&output_file) + .await + .with_context(|| format!("failed creating sink file {}", output_file.display()))?; + f.write_all(&bytes) + .await + .with_context(|| format!("failed writing sink file {}", output_file.display()))?; + self.pending_bytes = 0; + Ok(()) + } +} + +#[async_trait] +impl Operator for FilesystemSinkOperator { + fn name(&self) -> &str { + factory_operator_name::CONNECTOR_SINK + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> 
Result<()> { + let reserved_block_bytes = self + .sink_memory_block + .as_ref() + .map(|b| b.capacity()) + .unwrap_or(0); + info!( + table = %self.table_name, + path = %self.output_dir.display(), + format = ?self.format, + reserved_block_bytes, + early_flush_threshold_bytes = self.early_flush_threshold_bytes, + "Starting filesystem sink operator" + ); + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + _collector: &mut dyn Collector, + ) -> Result<()> { + let batch_size = batch.get_array_memory_size(); + self.pending.push(batch); + self.pending_bytes += batch_size; + if self.pending_bytes > self.early_flush_threshold_bytes { + debug!( + table = %self.table_name, + bytes = self.pending_bytes, + "memory watermark reached, triggering early flush to filesystem" + ); + self.flush_file_epoch(0, ctx.subtask_index as usize).await?; + } + Ok(()) + } + + async fn process_watermark( + &mut self, + _watermark: Watermark, + _ctx: &mut TaskContext, + _collector: &mut dyn Collector, + ) -> Result<()> { + Ok(()) + } + + async fn snapshot_state( + &mut self, + barrier: CheckpointBarrier, + ctx: &mut TaskContext, + ) -> Result<()> { + self.flush_file_epoch(barrier.epoch, ctx.subtask_index as usize) + .await + } + + async fn on_close(&mut self, ctx: &mut TaskContext) -> Result> { + if !self.pending.is_empty() { + warn!( + table = %self.table_name, + "flushing remaining filesystem sink batches on close" + ); + self.flush_file_epoch(u64::MAX, ctx.subtask_index as usize) + .await?; + } + Ok(vec![]) + } +} + +pub fn compression_from_str(v: Option<&str>) -> Result { + match v.unwrap_or("zstd").to_ascii_lowercase().as_str() { + "uncompressed" => Ok(Compression::UNCOMPRESSED), + "snappy" => Ok(Compression::SNAPPY), + "gzip" => Ok(Compression::GZIP(Default::default())), + "zstd" => Ok(Compression::ZSTD(Default::default())), + "lz4" => Ok(Compression::LZ4), + "lz4_raw" => Ok(Compression::LZ4_RAW), + other => 
bail!("unsupported parquet compression '{other}'"), + } +} diff --git a/src/runtime/streaming/operators/sink/iceberg/mod.rs b/src/runtime/streaming/operators/sink/iceberg/mod.rs new file mode 100644 index 00000000..b6c17414 --- /dev/null +++ b/src/runtime/streaming/operators/sink/iceberg/mod.rs @@ -0,0 +1,271 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fs::create_dir_all; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::{Context, Result}; +use arrow_array::RecordBatch; +use async_trait::async_trait; +use bytes::Bytes; +use object_store::aws::AmazonS3Builder; +use object_store::path::Path as ObjectStorePath; +use object_store::{ObjectStore, PutPayload}; +use parquet::basic::Compression; +use tokio::io::AsyncWriteExt; +use tracing::{debug, info, warn}; + +use crate::runtime::memory::{MemoryBlock, try_global_memory_pool}; +use crate::runtime::streaming::StreamOutput; +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::{Collector, Operator}; +use crate::runtime::streaming::format::encoder::FormatEncoder; +use crate::sql::common::constants::factory_operator_name; +use crate::sql::common::with_option_keys as opt; +use crate::sql::common::{CheckpointBarrier, Watermark}; + +const DEFAULT_MAX_BUFFER_BYTES: usize = 64 * 1024 * 1024; + +enum IcebergDestination { + Local(PathBuf), + S3 { + prefix: String, + client: Arc, + }, +} + +pub struct 
IcebergSinkOperator { + table_name: String, + destination: IcebergDestination, + parquet_compression: Compression, + pending: Vec, + pending_bytes: usize, + sink_memory_block: Option>, + early_flush_threshold_bytes: usize, + file_counter: u64, + format: IcebergFormat, +} + +#[derive(Debug, Clone, Copy)] +pub enum IcebergFormat { + Csv, + Parquet, +} + +impl IcebergSinkOperator { + pub fn try_new( + table_name: String, + path: String, + format: IcebergFormat, + parquet_compression: Compression, + sink_memory_bytes: u64, + options: HashMap, + ) -> Result { + let destination = if let Some(bucket) = options.get(opt::S3_BUCKET) { + let region = options + .get(opt::S3_REGION) + .cloned() + .unwrap_or_else(|| "us-east-1".to_string()); + let mut builder = AmazonS3Builder::new() + .with_bucket_name(bucket.clone()) + .with_region(region); + if let Some(endpoint) = options.get(opt::S3_ENDPOINT) { + builder = builder.with_endpoint(endpoint); + if endpoint.to_ascii_lowercase().starts_with("http://") { + builder = builder.with_allow_http(true); + } + } + if let Some(v) = options.get(opt::S3_ACCESS_KEY_ID) { + builder = builder.with_access_key_id(v); + } + if let Some(v) = options.get(opt::S3_SECRET_ACCESS_KEY) { + builder = builder.with_secret_access_key(v); + } + if let Some(v) = options.get(opt::S3_SESSION_TOKEN) { + builder = builder.with_token(v); + } + let client = builder + .build() + .context("failed to build s3 client for iceberg sink")?; + IcebergDestination::S3 { + prefix: path.trim_matches('/').to_string(), + client: Arc::new(client), + } + } else { + let root = PathBuf::from(path.clone()); + create_dir_all(&root) + .with_context(|| format!("failed to create iceberg sink dir {}", root.display()))?; + IcebergDestination::Local(root) + }; + + let mut sink_memory_block = None; + let reserve_bytes = usize::try_from(sink_memory_bytes).unwrap_or(DEFAULT_MAX_BUFFER_BYTES); + let mut early_flush_threshold_bytes = reserve_bytes; + if let Ok(pool) = try_global_memory_pool() + 
&& let Ok(block) = pool.try_request_block(reserve_bytes as u64) + { + early_flush_threshold_bytes = ((block.capacity() as usize) * 8) / 10; + sink_memory_block = Some(block); + } + + Ok(Self { + table_name, + destination, + parquet_compression, + pending: Vec::new(), + pending_bytes: 0, + sink_memory_block, + early_flush_threshold_bytes, + file_counter: 0, + format, + }) + } + + async fn flush_epoch(&mut self, epoch: u64, subtask_idx: usize) -> Result<()> { + if self.pending.is_empty() { + return Ok(()); + } + + let batches = std::mem::take(&mut self.pending); + let format = self.format; + let compression = self.parquet_compression; + let bytes = tokio::task::spawn_blocking(move || -> Result> { + match format { + IcebergFormat::Csv => FormatEncoder::encode_csv(&batches), + IcebergFormat::Parquet => FormatEncoder::encode_parquet(&batches, compression), + } + }) + .await + .context("tokio blocking task panicked during serialization")??; + + if bytes.is_empty() { + self.pending_bytes = 0; + return Ok(()); + } + + self.file_counter += 1; + let file_name = format!( + "iceberg-part-{:05}-epoch-{:010}-{:06}.{}", + subtask_idx, + epoch, + self.file_counter, + match self.format { + IcebergFormat::Csv => "csv", + IcebergFormat::Parquet => "parquet", + } + ); + match &self.destination { + IcebergDestination::Local(root) => { + let out = root.join(file_name); + let mut f = tokio::fs::File::create(&out).await.with_context(|| { + format!("failed creating iceberg sink file {}", out.display()) + })?; + f.write_all(&bytes).await.with_context(|| { + format!("failed writing iceberg sink file {}", out.display()) + })?; + } + IcebergDestination::S3 { prefix, client } => { + let key = if prefix.is_empty() { + file_name + } else { + format!("{prefix}/{file_name}") + }; + client + .put( + &ObjectStorePath::from(key), + PutPayload::from(Bytes::from(bytes)), + ) + .await + .context("failed writing iceberg data object to s3")?; + } + } + self.pending_bytes = 0; + Ok(()) + } +} + 
+#[async_trait] +impl Operator for IcebergSinkOperator { + fn name(&self) -> &str { + factory_operator_name::CONNECTOR_SINK + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + let reserved_block_bytes = self + .sink_memory_block + .as_ref() + .map(|b| b.capacity()) + .unwrap_or(0); + info!( + table = %self.table_name, + format = ?self.format, + reserved_block_bytes, + early_flush_threshold_bytes = self.early_flush_threshold_bytes, + "Starting iceberg sink operator" + ); + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + _collector: &mut dyn Collector, + ) -> Result<()> { + let batch_size = batch.get_array_memory_size(); + self.pending.push(batch); + self.pending_bytes += batch_size; + if self.pending_bytes > self.early_flush_threshold_bytes { + debug!( + table = %self.table_name, + bytes = self.pending_bytes, + threshold = self.early_flush_threshold_bytes, + "memory watermark reached, triggering early flush for iceberg sink" + ); + self.flush_epoch(0, ctx.subtask_index as usize).await?; + } + Ok(()) + } + + async fn process_watermark( + &mut self, + _watermark: Watermark, + _ctx: &mut TaskContext, + _collector: &mut dyn Collector, + ) -> Result<()> { + Ok(()) + } + + async fn snapshot_state( + &mut self, + barrier: CheckpointBarrier, + ctx: &mut TaskContext, + ) -> Result<()> { + self.flush_epoch(barrier.epoch, ctx.subtask_index as usize) + .await + } + + async fn on_close(&mut self, ctx: &mut TaskContext) -> Result> { + if !self.pending.is_empty() { + warn!( + table = %self.table_name, + "flushing remaining iceberg sink batches on close" + ); + self.flush_epoch(u64::MAX, ctx.subtask_index as usize) + .await?; + } + Ok(vec![]) + } +} diff --git a/src/runtime/streaming/operators/sink/kafka/mod.rs b/src/runtime/streaming/operators/sink/kafka/mod.rs index a9c4b50e..b30bc572 100644 --- a/src/runtime/streaming/operators/sink/kafka/mod.rs +++ 
b/src/runtime/streaming/operators/sink/kafka/mod.rs
@@ -336,7 +336,7 @@ impl Operator for KafkaSinkOperator {
         Ok(())
     }
 
-    async fn commit_checkpoint(&mut self, epoch: u32, _ctx: &mut TaskContext) -> Result<()> {
+    async fn commit_checkpoint(&mut self, epoch: u64, _ctx: &mut TaskContext) -> Result<()> {
         if matches!(self.consistency_mode, ConsistencyMode::AtLeastOnce) {
             return Ok(());
         }
@@ -380,7 +380,7 @@ impl Operator for KafkaSinkOperator {
         Ok(())
     }
 
-    async fn abort_checkpoint(&mut self, epoch: u32, _ctx: &mut TaskContext) -> Result<()> {
+    async fn abort_checkpoint(&mut self, epoch: u64, _ctx: &mut TaskContext) -> Result<()> {
         if matches!(self.consistency_mode, ConsistencyMode::AtLeastOnce) {
             return Ok(());
         }
diff --git a/src/runtime/streaming/operators/sink/lancedb/mod.rs b/src/runtime/streaming/operators/sink/lancedb/mod.rs
new file mode 100644
index 00000000..f8a082e1
--- /dev/null
+++ b/src/runtime/streaming/operators/sink/lancedb/mod.rs
@@ -0,0 +1,180 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::io::Cursor;
+
+use anyhow::{Context, Result};
+use arrow_array::RecordBatch;
+use arrow_array_lance::RecordBatchIterator as LanceBatchIterator;
+use async_trait::async_trait;
+use lance::Dataset;
+use lance::dataset::{WriteMode, WriteParams};
+use tracing::{info, warn};
+
+use crate::runtime::streaming::StreamOutput;
+use crate::runtime::streaming::api::context::TaskContext;
+use crate::runtime::streaming::api::operator::{Collector, Operator};
+use crate::sql::common::constants::factory_operator_name;
+use crate::sql::common::{CheckpointBarrier, Watermark};
+
+/// Sink operator that buffers incoming record batches and writes them to a Lance dataset
+/// whenever a checkpoint is snapshotted, plus once more on close if data is left over.
+pub struct LanceDbSinkOperator {
+    table_name: String,
+    dataset_uri: String,
+    /// Batches received since the last successful flush.
+    pending: Vec<RecordBatch>,
+    /// Set once this operator instance has completed a write to the dataset.
+    initialized: bool,
+}
+
+impl LanceDbSinkOperator {
+    /// Creates a sink with an empty buffer; nothing is written until the first flush.
+    pub fn new(table_name: String, dataset_uri: String) -> Self {
+        Self {
+            table_name,
+            dataset_uri,
+            pending: Vec::new(),
+            initialized: false,
+        }
+    }
+
+    /// Re-encodes `batches` through an in-memory Arrow IPC file and reads them back with
+    /// the `arrow_ipc_lance` reader.
+    ///
+    /// Fails when `batches` is empty or when any IPC write/read step fails.
+    // NOTE(review): the element type of the returned Vec is inferred from the
+    // `arrow_array_lance` import — confirm against the upstream file.
+    fn to_lance_batches(
+        &self,
+        batches: &[RecordBatch],
+    ) -> Result<Vec<arrow_array_lance::RecordBatch>> {
+        let schema = batches
+            .first()
+            .map(|b| b.schema())
+            .context("lanceDB sink requires at least one record batch")?;
+
+        let mut ipc_payload = Vec::<u8>::new();
+        {
+            let mut writer = arrow_ipc::writer::FileWriter::try_new(&mut ipc_payload, &schema)
+                .context("failed to build ipc writer for lanceDB conversion")?;
+            for batch in batches {
+                writer
+                    .write(batch)
+                    .context("failed writing ipc payload for lanceDB conversion")?;
+            }
+            writer
+                .finish()
+                .context("failed finishing ipc payload for lanceDB conversion")?;
+        }
+
+        let reader = arrow_ipc_lance::reader::FileReader::try_new(Cursor::new(ipc_payload), None)
+            .context("failed reading lance-compatible ipc payload")?;
+        reader
+            .collect::<Result<Vec<_>, _>>()
+            .context("failed converting batches into lance-compatible batches")
+    }
+
+    /// Writes all pending batches to the dataset and clears the buffer on success.
+    ///
+    /// The first write of an operator instance appends when the dataset can already be
+    /// opened (for example after a restart) and creates it otherwise; later writes append.
+    async fn flush_epoch(&mut self) -> Result<()> {
+        if self.pending.is_empty() {
+            return Ok(());
+        }
+
+        let lance_batches = self.to_lance_batches(&self.pending)?;
+        let schema = lance_batches
+            .first()
+            .map(|b| b.schema())
+            .context("lanceDB sink produced no converted batches")?;
+        let reader = LanceBatchIterator::new(lance_batches.into_iter().map(Ok), schema);
+        // `initialized` is false for every new operator instance, so probe the target before
+        // the first write: unconditionally choosing `WriteMode::Create` would fail once the
+        // dataset exists, e.g. when the job is restarted.
+        let dataset_exists = self.initialized || Dataset::open(&self.dataset_uri).await.is_ok();
+        let params = WriteParams {
+            mode: if dataset_exists {
+                WriteMode::Append
+            } else {
+                WriteMode::Create
+            },
+            ..Default::default()
+        };
+        Dataset::write(reader, &self.dataset_uri, Some(params))
+            .await
+            .with_context(|| format!("failed writing lance dataset '{}'", self.dataset_uri))?;
+
+        self.initialized = true;
+        self.pending.clear();
+        Ok(())
+    }
+}
+
+#[async_trait]
+impl Operator for LanceDbSinkOperator {
+    fn name(&self) -> &str {
+        factory_operator_name::CONNECTOR_SINK
+    }
+
+    async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> {
+        info!(
+            table = %self.table_name,
+            dataset = %self.dataset_uri,
+            "Starting lanceDB sink operator"
+        );
+        Ok(())
+    }
+
+    /// Buffers the batch; data is only written on checkpoint or close.
+    async fn process_data(
+        &mut self,
+        _input_idx: usize,
+        batch: RecordBatch,
+        _ctx: &mut TaskContext,
+        _collector: &mut dyn Collector,
+    ) -> Result<()> {
+        self.pending.push(batch);
+        Ok(())
+    }
+
+    async fn process_watermark(
+        &mut self,
+        _watermark: Watermark,
+        _ctx: &mut TaskContext,
+        _collector: &mut dyn Collector,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    /// Flushes everything buffered so far when a checkpoint barrier is snapshotted.
+    async fn snapshot_state(
+        &mut self,
+        _barrier: CheckpointBarrier,
+        _ctx: &mut TaskContext,
+    ) -> Result<()> {
+        self.flush_epoch().await
+    }
+
+    /// Flushes any batches still buffered at shutdown.
+    async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result<Vec<StreamOutput>> {
+        if !self.pending.is_empty() {
+            warn!(
+                table = %self.table_name,
+                "flushing remaining lanceDB sink batches on close"
+            );
+            self.flush_epoch().await?;
+        }
+        Ok(vec![])
+    }
+}
diff --git a/src/runtime/streaming/operators/sink/mod.rs b/src/runtime/streaming/operators/sink/mod.rs
index b9574391..ff893d57 100644
--- a/src/runtime/streaming/operators/sink/mod.rs
+++ b/src/runtime/streaming/operators/sink/mod.rs
@@ -10,4 +10,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+pub mod delta;
+pub mod filesystem;
+pub mod iceberg;
 pub mod kafka;
+pub mod lancedb;
+pub mod s3;
diff --git a/src/runtime/streaming/operators/sink/s3/mod.rs b/src/runtime/streaming/operators/sink/s3/mod.rs
new file mode 100644
index 00000000..715b5b86
--- /dev/null
+++ b/src/runtime/streaming/operators/sink/s3/mod.rs
@@ -0,0 +1,284 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+use std::io::Cursor;
+
+use anyhow::{Context, Result, bail};
+use arrow::csv::WriterBuilder as CsvWriterBuilder;
+use arrow_array::RecordBatch;
+use async_trait::async_trait;
+use bytes::Bytes;
+use object_store::aws::AmazonS3Builder;
+use object_store::path::Path as ObjectStorePath;
+use object_store::{ObjectStore, PutPayload};
+use parquet::arrow::ArrowWriter;
+use parquet::basic::Compression;
+use parquet::file::properties::WriterProperties;
+use tracing::{info, warn};
+
+use crate::runtime::streaming::StreamOutput;
+use crate::runtime::streaming::api::context::TaskContext;
+use crate::runtime::streaming::api::operator::{Collector, Operator};
+use crate::sql::common::constants::factory_operator_name;
+use crate::sql::common::with_option_keys as opt;
+use crate::sql::common::{CheckpointBarrier, Watermark};
+
+/// File formats the S3 sink can serialize buffered batches into.
+#[derive(Debug, Clone, Copy)]
+pub enum S3Format {
+    Csv,
+    Parquet,
+}
+
+/// Sink operator that buffers record batches and uploads them as one object per flush,
+/// serialized as CSV or Parquet, on every checkpoint snapshot and on close.
+pub struct S3SinkOperator {
+    table_name: String,
+    bucket: String,
+    /// Key prefix inside the bucket, stored without leading or trailing slashes.
+    prefix: String,
+    format: S3Format,
+    parquet_compression: Compression,
+    client: Box<dyn ObjectStore>,
+    /// Batches received since the last successful flush.
+    pending: Vec<RecordBatch>,
+    /// Incremented for every uploaded object; part of the generated object name.
+    file_counter: u64,
+}
+
+impl S3SinkOperator {
+    /// Builds the object-store client from `s3_options` and returns an empty sink.
+    ///
+    /// `s3.bucket` is required; the region defaults to `us-east-1`. Plain-HTTP access is
+    /// enabled only when a configured endpoint starts with `http://`.
+    pub fn try_new(
+        table_name: String,
+        path: String,
+        format: S3Format,
+        parquet_compression: Compression,
+        s3_options: HashMap<String, String>,
+    ) -> Result<Self> {
+        let bucket = s3_options
+            .get(opt::S3_BUCKET)
+            .cloned()
+            .context("s3 sink requires 's3.bucket'")?;
+        let region = s3_options
+            .get(opt::S3_REGION)
+            .cloned()
+            .unwrap_or_else(|| "us-east-1".to_string());
+
+        let mut builder = AmazonS3Builder::new()
+            .with_bucket_name(bucket.clone())
+            .with_region(region);
+        if let Some(endpoint) = s3_options.get(opt::S3_ENDPOINT) {
+            builder = builder.with_endpoint(endpoint);
+            if endpoint.to_ascii_lowercase().starts_with("http://") {
+                builder = builder.with_allow_http(true);
+            }
+        }
+        if let Some(v) = s3_options.get(opt::S3_ACCESS_KEY_ID) {
+            builder = builder.with_access_key_id(v);
+        }
+        if let Some(v) = s3_options.get(opt::S3_SECRET_ACCESS_KEY) {
+            builder = builder.with_secret_access_key(v);
+        }
+        if let Some(v) = s3_options.get(opt::S3_SESSION_TOKEN) {
+            builder = builder.with_token(v);
+        }
+        let client = builder
+            .build()
+            .context("failed to build s3 object-store client")?;
+
+        let prefix = path.trim_matches('/').to_string();
+
+        Ok(Self {
+            table_name,
+            bucket,
+            prefix,
+            format,
+            parquet_compression,
+            client: Box::new(client),
+            pending: Vec::new(),
+            file_counter: 0,
+        })
+    }
+
+    fn extension(&self) -> &'static str {
+        match self.format {
+            S3Format::Csv => "csv",
+            S3Format::Parquet => "parquet",
+        }
+    }
+
+    /// Serializes the pending batches in the configured format.
+    fn serialize_pending(&self) -> Result<Vec<u8>> {
+        match self.format {
+            S3Format::Csv => self.serialize_csv(),
+            S3Format::Parquet => self.serialize_parquet(),
+        }
+    }
+
+    /// Encodes all pending batches as CSV with a header row.
+    fn serialize_csv(&self) -> Result<Vec<u8>> {
+        let mut out = Vec::new();
+        let mut writer = CsvWriterBuilder::new().with_header(true).build(&mut out);
+        for batch in &self.pending {
+            writer.write(batch).context("failed writing csv batch")?;
+        }
+        drop(writer);
+        Ok(out)
+    }
+
+    /// Encodes all pending batches as one Parquet file using the schema of the first batch.
+    fn serialize_parquet(&self) -> Result<Vec<u8>> {
+        let schema = self
+            .pending
+            .first()
+            .map(|b| b.schema())
+            .context("parquet serialization requires at least one record batch")?;
+        let props = WriterProperties::builder()
+            .set_compression(self.parquet_compression)
+            .build();
+        let mut cursor = Cursor::new(Vec::<u8>::new());
+        let mut writer = ArrowWriter::try_new(&mut cursor, schema, Some(props))
+            .context("failed to initialize parquet writer")?;
+        for batch in &self.pending {
+            writer
+                .write(batch)
+                .context("failed writing parquet batch")?;
+        }
+        writer.close().context("failed to close parquet writer")?;
+        Ok(cursor.into_inner())
+    }
+
+    /// Uploads `bytes` as a new object and clears the pending buffer on success.
+    ///
+    /// `epoch` and `subtask_idx` only influence the object name. Nothing is uploaded when
+    /// there is no pending data or `bytes` is empty.
+    async fn flush_epoch(&mut self, epoch: u64, subtask_idx: usize, bytes: Vec<u8>) -> Result<()> {
+        if self.pending.is_empty() {
+            return Ok(());
+        }
+        if bytes.is_empty() {
+            self.pending.clear();
+            return Ok(());
+        }
+
+        self.file_counter += 1;
+        let file_name = format!(
+            "part-{:05}-epoch-{:010}-{:06}.{}",
+            subtask_idx,
+            epoch,
+            self.file_counter,
+            self.extension()
+        );
+        let key = if self.prefix.is_empty() {
+            file_name
+        } else {
+            format!("{}/{}", self.prefix, file_name)
+        };
+        self.client
+            .put(
+                &ObjectStorePath::from(key),
+                PutPayload::from(Bytes::from(bytes)),
+            )
+            .await
+            .context("failed writing object to s3")?;
+
+        self.pending.clear();
+        Ok(())
+    }
+}
+
+#[async_trait]
+impl Operator for S3SinkOperator {
+    fn name(&self) -> &str {
+        factory_operator_name::CONNECTOR_SINK
+    }
+
+    async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> {
+        info!(
+            table = %self.table_name,
+            bucket = %self.bucket,
+            prefix = %self.prefix,
+            format = ?self.format,
+            "Starting s3 sink operator"
+        );
+        Ok(())
+    }
+
+    /// Buffers the batch; data is only uploaded on checkpoint or close.
+    async fn process_data(
+        &mut self,
+        _input_idx: usize,
+        batch: RecordBatch,
+        _ctx: &mut TaskContext,
+        _collector: &mut dyn Collector,
+    ) -> Result<()> {
+        self.pending.push(batch);
+        Ok(())
+    }
+
+    async fn process_watermark(
+        &mut self,
+        _watermark: Watermark,
+        _ctx: &mut TaskContext,
+        _collector: &mut dyn Collector,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    /// Serializes and uploads everything buffered so far, named after the barrier epoch.
+    async fn snapshot_state(
+        &mut self,
+        barrier: CheckpointBarrier,
+        ctx: &mut TaskContext,
+    ) -> Result<()> {
+        if self.pending.is_empty() {
+            return Ok(());
+        }
+        let bytes = self.serialize_pending()?;
+        self.flush_epoch(barrier.epoch, ctx.subtask_index as usize, bytes)
+            .await
+    }
+
+    /// Uploads any batches still buffered at shutdown.
+    async fn on_close(&mut self, ctx: &mut TaskContext) -> Result<Vec<StreamOutput>> {
+        if !self.pending.is_empty() {
+            warn!(
+                table = %self.table_name,
+                "flushing remaining s3 sink batches on close"
+            );
+            // NOTE(review): close-time objects are named with epoch 0 and `file_counter`
+            // starts at 0 for each new operator instance — confirm that a restarted job
+            // cannot regenerate (and overwrite) a key written by an earlier run.
+            let bytes = self.serialize_pending()?;
+            self.flush_epoch(0, ctx.subtask_index as usize, bytes)
+                .await?;
+        }
+        Ok(vec![])
+    }
+}
+
+/// Parses a Parquet compression codec name (case-insensitive); `None` selects `zstd`.
+pub fn compression_from_str(v: Option<&str>) -> Result<Compression> {
+    match v.unwrap_or("zstd").to_ascii_lowercase().as_str() {
+        "uncompressed" => Ok(Compression::UNCOMPRESSED),
+        "snappy" => Ok(Compression::SNAPPY),
+        "gzip" => Ok(Compression::GZIP(Default::default())),
+        "zstd" => Ok(Compression::ZSTD(Default::default())),
+        "lz4" => Ok(Compression::LZ4),
+        "lz4_raw" => Ok(Compression::LZ4_RAW),
+        other => bail!("unsupported parquet compression '{other}'"),
+    }
+}
diff --git a/src/runtime/streaming/operators/source/kafka/mod.rs b/src/runtime/streaming/operators/source/kafka/mod.rs
index 9f5b84ad..75edb968 100644
--- a/src/runtime/streaming/operators/source/kafka/mod.rs
+++ b/src/runtime/streaming/operators/source/kafka/mod.rs
@@ -10,21 +10,24 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//! Kafka source checkpointing: `enable.auto.commit=false`, offsets captured at the checkpoint barrier
-//! and reported to the job coordinator for catalog persistence; restart rewinds from that snapshot.
+//! Kafka source checkpointing: `enable.auto.commit=false`, offsets captured at the checkpoint
+//! barrier and reported to the job coordinator for catalog persistence; on restart the catalog
+//! records are injected directly into [`TaskContext::source_checkpoint_infos`] — no intermediate
+//!
on-disk snapshot files are used. use anyhow::{Context as _, Result, anyhow}; use arrow_array::RecordBatch; use arrow_schema::SchemaRef; use async_trait::async_trait; -use bincode::{Decode, Encode}; use governor::{DefaultDirectRateLimiter, Quota, RateLimiter as GovernorRateLimiter}; -use protocol::storage::{KafkaPartitionOffset, KafkaSourceSubtaskCheckpoint}; +use protocol::storage::{ + KafkaPartitionOffset, KafkaSourceSubtaskCheckpoint, SourceCheckpointInfo, + source_checkpoint_info, +}; use rdkafka::consumer::{CommitMode, Consumer, StreamConsumer}; use rdkafka::{ClientConfig, Message as KMessage, Offset, TopicPartitionList}; use std::collections::HashMap; use std::num::NonZeroU32; -use std::path::PathBuf; use std::time::{Duration, Instant}; use tracing::{debug, error, info, warn}; @@ -35,80 +38,6 @@ use crate::runtime::streaming::api::source::{ use crate::runtime::streaming::format::{BadDataPolicy, DataDeserializer, Format}; use crate::sql::common::fs_schema::FieldValueType; use crate::sql::common::{CheckpointBarrier, MetadataField}; -// ============================================================================ -// ============================================================================ - -#[derive(Copy, Clone, Debug, Encode, Decode, PartialEq, PartialOrd)] -pub struct KafkaState { - pub partition: i32, - pub offset: i64, -} - -/// Last committed partition offsets for this source subtask, tied to a checkpoint epoch. -/// Materialized into a `.bin` under the job state dir from catalog before restart; see -/// [`TaskContext::latest_safe_epoch`] and `StreamingTableDefinition` in `storage.proto`. -#[derive(Debug, Encode, Decode)] -pub(crate) struct KafkaSourceSavedOffsets { - /// Same numbering as [`CheckpointBarrier::epoch`] / catalog `latest_checkpoint_epoch` (as u64). 
- pub(crate) epoch: u64, - pub(crate) partitions: Vec, -} - -pub(crate) fn encode_kafka_offset_snapshot(saved: &KafkaSourceSavedOffsets) -> Result> { - bincode::encode_to_vec(saved, bincode::config::standard()) - .map_err(|e| anyhow!("bincode encode Kafka offset snapshot: {e}")) -} - -pub(crate) fn decode_kafka_offset_snapshot(bytes: &[u8]) -> Result { - let (saved, _) = bincode::decode_from_slice(bytes, bincode::config::standard()) - .map_err(|e| anyhow!("bincode decode Kafka offset snapshot: {e}"))?; - Ok(saved) -} - -pub(crate) fn kafka_snapshot_path( - job_dir: &std::path::Path, - pipeline_id: u32, - subtask_index: u32, -) -> PathBuf { - job_dir.join(format!( - "kafka_source_offsets_pipe{}_sub{}.bin", - pipeline_id, subtask_index - )) -} - -fn kafka_offsets_snapshot_path(ctx: &TaskContext) -> PathBuf { - kafka_snapshot_path(&ctx.state_dir, ctx.pipeline_id, ctx.subtask_index) -} - -fn load_saved_offsets_if_recovering(ctx: &TaskContext) -> Option { - let safe = ctx.latest_safe_epoch(); - if safe == 0 { - return None; - } - let path = kafka_offsets_snapshot_path(ctx); - let bytes = std::fs::read(&path).ok()?; - let saved = match decode_kafka_offset_snapshot(&bytes) { - Ok(v) => v, - Err(e) => { - warn!( - path = %path.display(), - error = %e, - "Failed to decode Kafka offset snapshot" - ); - return None; - } - }; - if saved.epoch > safe { - warn!( - path = %path.display(), - saved_epoch = saved.epoch, - safe_epoch = safe, - "Ignoring Kafka offset snapshot newer than catalog safe epoch" - ); - return None; - } - Some(saved) -} pub trait BatchDeserializer: Send + 'static { fn deserialize_slice( @@ -220,8 +149,10 @@ pub struct KafkaSourceOperator { current_offsets: HashMap, is_empty_assignment: bool, - last_flush_time: Instant, + + /// Checkpoint records injected before `on_start`; consumed once to restore partition offsets. 
+ recovery_checkpoint_infos: Vec, } impl KafkaSourceOperator { @@ -252,14 +183,44 @@ impl KafkaSourceOperator { current_offsets: HashMap::new(), is_empty_assignment: false, last_flush_time: Instant::now(), + recovery_checkpoint_infos: vec![], } } - async fn init_and_assign_consumer( - &mut self, - ctx: &mut TaskContext, - saved_offsets: Option, - ) -> Result<()> { + fn load_recovery_offsets(&mut self, ctx: &TaskContext) -> (bool, HashMap) { + if ctx.latest_safe_epoch() == 0 || self.recovery_checkpoint_infos.is_empty() { + return (false, HashMap::new()); + } + let cp = self.recovery_checkpoint_infos.iter().find_map(|info| { + if let Some(source_checkpoint_info::Info::Kafka(cp)) = &info.info + && cp.subtask_index == ctx.subtask_index + { + return Some(cp); + } + None + }); + match cp { + Some(cp) => { + info!( + job_id = %ctx.job_id, + pipeline_id = ctx.pipeline_id, + subtask = ctx.subtask_index, + epoch = cp.checkpoint_epoch, + partitions = cp.partitions.len(), + "Restoring Kafka source offsets from catalog checkpoint" + ); + let map = cp + .partitions + .iter() + .map(|p| (p.partition, p.offset)) + .collect(); + (true, map) + } + None => (false, HashMap::new()), + } + } + + async fn init_and_assign_consumer(&mut self, ctx: &mut TaskContext) -> Result<()> { info!("Creating kafka consumer for {}", self.bootstrap_servers); let mut client_config = ClientConfig::new(); @@ -282,24 +243,7 @@ impl KafkaSourceOperator { .set("group.id", &group_id) .create()?; - let (has_state, state_map) = if let Some(saved) = saved_offsets { - info!( - job_id = %ctx.job_id, - pipeline_id = ctx.pipeline_id, - subtask = ctx.subtask_index, - epoch = saved.epoch, - safe_epoch = ctx.latest_safe_epoch(), - partitions = saved.partitions.len(), - "Restoring Kafka source offsets from materialized checkpoint snapshot" - ); - let mut m = HashMap::with_capacity(saved.partitions.len()); - for s in saved.partitions { - m.insert(s.partition, s); - } - (true, m) - } else { - (false, HashMap::new()) - }; + 
let (has_state, saved_offsets_map) = self.load_recovery_offsets(ctx); let metadata = consumer .fetch_metadata(Some(&self.topic), Duration::from_secs(30)) @@ -317,10 +261,10 @@ impl KafkaSourceOperator { for p in partitions { if p.id().rem_euclid(pmax) == ctx.subtask_index as i32 { - // `current_offsets` / snapshot store last consumed offset; resume at next offset. - let offset = state_map + // saved_offsets_map stores last consumed offset; resume at next offset. + let offset = saved_offsets_map .get(&p.id()) - .map(|s| Offset::Offset(s.offset.saturating_add(1))) + .map(|&last| Offset::Offset(last.saturating_add(1))) .unwrap_or_else(|| { if has_state { Offset::Beginning @@ -357,9 +301,12 @@ impl SourceOperator for KafkaSourceOperator { &self.topic } + fn set_recovery_checkpoint(&mut self, infos: Vec) { + self.recovery_checkpoint_infos = infos; + } + async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<()> { - let saved = load_saved_offsets_if_recovering(ctx); - self.init_and_assign_consumer(ctx, saved).await?; + self.init_and_assign_consumer(ctx).await?; self.rate_limiter = Some(GovernorRateLimiter::direct(Quota::per_second( self.messages_per_second, ))); @@ -479,7 +426,7 @@ impl SourceOperator for KafkaSourceOperator { warn!("Failed to commit async offset to Kafka Broker: {:?}", e); } - let epoch = u64::from(barrier.epoch); + let epoch = barrier.epoch; if self.current_offsets.is_empty() { return Ok(SourceCheckpointReport::default()); } diff --git a/src/runtime/streaming/operators/windows/session_aggregating_window.rs b/src/runtime/streaming/operators/windows/session_aggregating_window.rs index 2da2c285..2056cdd9 100644 --- a/src/runtime/streaming/operators/windows/session_aggregating_window.rs +++ b/src/runtime/streaming/operators/windows/session_aggregating_window.rs @@ -871,7 +871,7 @@ impl Operator for SessionWindowOperator { self.state_store .as_ref() .expect("State store not initialized") - .prepare_checkpoint_epoch(barrier.epoch as u64) + 
.prepare_checkpoint_epoch(barrier.epoch) .map_err(|e| anyhow!("Snapshot failed: {e}"))?; info!( @@ -881,11 +881,11 @@ impl Operator for SessionWindowOperator { Ok(()) } - async fn commit_checkpoint(&mut self, epoch: u32, _ctx: &mut TaskContext) -> Result<()> { + async fn commit_checkpoint(&mut self, epoch: u64, _ctx: &mut TaskContext) -> Result<()> { self.state_store .as_ref() .expect("State store not initialized") - .commit_checkpoint_epoch(epoch as u64) + .commit_checkpoint_epoch(epoch) .map_err(|e| anyhow!("Commit checkpoint failed: {e}"))?; Ok(()) } diff --git a/src/runtime/streaming/operators/windows/sliding_aggregating_window.rs b/src/runtime/streaming/operators/windows/sliding_aggregating_window.rs index 3516e950..f18b3b14 100644 --- a/src/runtime/streaming/operators/windows/sliding_aggregating_window.rs +++ b/src/runtime/streaming/operators/windows/sliding_aggregating_window.rs @@ -613,16 +613,16 @@ impl Operator for SlidingWindowOperator { self.state_store .as_ref() .expect("State store not initialized") - .prepare_checkpoint_epoch(barrier.epoch as u64) + .prepare_checkpoint_epoch(barrier.epoch) .map_err(|e| anyhow!("Snapshot failed: {e}"))?; Ok(()) } - async fn commit_checkpoint(&mut self, epoch: u32, _ctx: &mut TaskContext) -> Result<()> { + async fn commit_checkpoint(&mut self, epoch: u64, _ctx: &mut TaskContext) -> Result<()> { self.state_store .as_ref() .expect("State store not initialized") - .commit_checkpoint_epoch(epoch as u64) + .commit_checkpoint_epoch(epoch) .map_err(|e| anyhow!("Commit checkpoint failed: {e}"))?; Ok(()) } diff --git a/src/runtime/streaming/operators/windows/tumbling_aggregating_window.rs b/src/runtime/streaming/operators/windows/tumbling_aggregating_window.rs index 6b6b6029..5c805625 100644 --- a/src/runtime/streaming/operators/windows/tumbling_aggregating_window.rs +++ b/src/runtime/streaming/operators/windows/tumbling_aggregating_window.rs @@ -393,16 +393,16 @@ impl Operator for TumblingWindowOperator { self.state_store 
.as_ref() .expect("State store not initialized") - .prepare_checkpoint_epoch(barrier.epoch as u64) + .prepare_checkpoint_epoch(barrier.epoch) .map_err(|e| anyhow!("Snapshot failed: {e}"))?; Ok(()) } - async fn commit_checkpoint(&mut self, epoch: u32, _ctx: &mut TaskContext) -> Result<()> { + async fn commit_checkpoint(&mut self, epoch: u64, _ctx: &mut TaskContext) -> Result<()> { self.state_store .as_ref() .expect("State store not initialized") - .commit_checkpoint_epoch(epoch as u64) + .commit_checkpoint_epoch(epoch) .map_err(|e| anyhow!("Commit checkpoint failed: {e}"))?; Ok(()) } diff --git a/src/runtime/streaming/operators/windows/window_function.rs b/src/runtime/streaming/operators/windows/window_function.rs index 1249233e..37815b78 100644 --- a/src/runtime/streaming/operators/windows/window_function.rs +++ b/src/runtime/streaming/operators/windows/window_function.rs @@ -269,16 +269,16 @@ impl Operator for WindowFunctionOperator { self.state_store .as_ref() .expect("State store not initialized") - .prepare_checkpoint_epoch(barrier.epoch as u64) + .prepare_checkpoint_epoch(barrier.epoch) .map_err(|e| anyhow!("Snapshot failed: {e}"))?; Ok(()) } - async fn commit_checkpoint(&mut self, epoch: u32, _ctx: &mut TaskContext) -> Result<()> { + async fn commit_checkpoint(&mut self, epoch: u64, _ctx: &mut TaskContext) -> Result<()> { self.state_store .as_ref() .expect("State store not initialized") - .commit_checkpoint_epoch(epoch as u64) + .commit_checkpoint_epoch(epoch) .map_err(|e| anyhow!("Commit checkpoint failed: {e}"))?; Ok(()) } diff --git a/src/runtime/streaming/protocol/control.rs b/src/runtime/streaming/protocol/control.rs index 6d0bc492..70ab71be 100644 --- a/src/runtime/streaming/protocol/control.rs +++ b/src/runtime/streaming/protocol/control.rs @@ -11,15 +11,15 @@ // limitations under the License. 
use super::event::CheckpointBarrier; -use protocol::storage::SourceCheckpointPayload; +use protocol::storage::SourceCheckpointInfo; use serde::{Deserialize, Serialize}; use std::time::Duration; use tokio::sync::mpsc::{self, Receiver, Sender}; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct CheckpointBarrierWire { - pub epoch: u32, - pub min_epoch: u32, + pub epoch: u64, + pub min_epoch: u64, pub timestamp_secs: u64, pub timestamp_subsec_nanos: u32, pub then_stop: bool, @@ -62,11 +62,11 @@ pub enum ControlCommand { DropState, /// Phase 2 of checkpoint 2PC: metadata durable; transactional Kafka sink should `commit_transaction`. Commit { - epoch: u32, + epoch: u64, }, /// Roll back pre-committed transactional Kafka writes when checkpoint metadata commit failed or barrier declined. AbortCheckpoint { - epoch: u32, + epoch: u64, }, UpdateConfig { config_json: String, @@ -99,8 +99,8 @@ pub enum JobMasterEvent { CheckpointAck { pipeline_id: u32, epoch: u64, - /// Source protocol checkpoint payloads (enum-style oneof envelope). - source_payloads: Vec, + /// Per-subtask checkpoint records produced directly by the source during snapshot. 
+ source_infos: Vec, }, CheckpointDecline { pipeline_id: u32, diff --git a/src/runtime/streaming/protocol/event.rs b/src/runtime/streaming/protocol/event.rs index 21be6852..093d99ba 100644 --- a/src/runtime/streaming/protocol/event.rs +++ b/src/runtime/streaming/protocol/event.rs @@ -27,8 +27,8 @@ pub enum Watermark { #[derive(Debug, Copy, Clone, PartialEq, Eq, Encode, Decode, Serialize, Deserialize)] pub struct CheckpointBarrier { - pub epoch: u32, - pub min_epoch: u32, + pub epoch: u64, + pub min_epoch: u64, pub timestamp: SystemTime, pub then_stop: bool, } diff --git a/src/sql/analysis/source_metadata_visitor.rs b/src/sql/analysis/source_metadata_visitor.rs index 55350301..b37d3a1b 100644 --- a/src/sql/analysis/source_metadata_visitor.rs +++ b/src/sql/analysis/source_metadata_visitor.rs @@ -51,7 +51,11 @@ impl<'a> SourceMetadataVisitor<'a> { let table = self.schema_provider.get_catalog_table(&table_name)?; match table { - crate::sql::schema::table::Table::ConnectorTable(t) => t.registry_id, + crate::sql::schema::table::CatalogEntity::ExternalConnector(b) => match b.as_ref() { + crate::sql::schema::catalog::ExternalTable::Source(t) => t.registry_id, + crate::sql::schema::catalog::ExternalTable::Lookup(t) => t.registry_id, + _ => None, + }, _ => None, } } diff --git a/src/sql/analysis/source_rewriter.rs b/src/sql/analysis/source_rewriter.rs index 0bd15e85..620ea336 100644 --- a/src/sql/analysis/source_rewriter.rs +++ b/src/sql/analysis/source_rewriter.rs @@ -27,8 +27,8 @@ use crate::sql::logical_node::table_source::StreamIngestionNode; use crate::sql::logical_node::watermark_node::EventTimeWatermarkNode; use crate::sql::schema::ColumnDescriptor; use crate::sql::schema::StreamSchemaProvider; -use crate::sql::schema::source_table::SourceTable; -use crate::sql::schema::table::Table; +use crate::sql::schema::catalog::{ExternalTable, SourceTable}; +use crate::sql::schema::table::CatalogEntity; use crate::sql::types::TIMESTAMP_FIELD; /// Rewrites table scans: 
projections are lifted out of scans into a dedicated projection node @@ -285,15 +285,21 @@ impl TreeNodeRewriter for SourceRewriter<'_> { .ok_or_else(|| DataFusionError::Plan(format!("Table {table_name} not found")))?; match table { - Table::ConnectorTable(table) => self.mutate_connector_table(&table_scan, table), - Table::LookupTable(_table) => { - // TODO: implement LookupSource extension - plan_err!("Lookup tables are not yet supported") - } - Table::TableFromQuery { + CatalogEntity::ExternalConnector(b) => match b.as_ref() { + ExternalTable::Source(source) => self.mutate_connector_table(&table_scan, source), + ExternalTable::Lookup(_) => { + // TODO: implement LookupSource extension + plan_err!("Lookup tables are not yet supported") + } + ExternalTable::Sink(sink) => plan_err!( + "Cannot SELECT from sink table '{}' (sinks are write-only)", + sink.name() + ), + }, + CatalogEntity::ComputedTable { name: _, logical_plan, - } => self.mutate_table_from_query(&table_scan, logical_plan), + } => self.mutate_table_from_query(&table_scan, logical_plan.as_ref()), } } } diff --git a/src/sql/api/mod.rs b/src/sql/api/mod.rs index cdc119b7..9fc6b23f 100644 --- a/src/sql/api/mod.rs +++ b/src/sql/api/mod.rs @@ -25,8 +25,6 @@ pub mod var_str; use serde::{Deserialize, Serialize}; -pub use connections::ConnectionProfile; - #[derive(Serialize, Deserialize, Clone, Debug)] #[serde(rename_all = "camelCase")] pub struct PaginatedCollection { diff --git a/src/sql/common/connector_options.rs b/src/sql/common/connector_options.rs index e2e306b6..0702d945 100644 --- a/src/sql/common/connector_options.rs +++ b/src/sql/common/connector_options.rs @@ -106,6 +106,21 @@ impl ConnectorOptions { } } + pub fn peek_opt_str(&self, name: &str) -> DFResult> { + match self.options.get(name) { + Some(Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span: _, + })) => Ok(Some(s.clone())), + Some(e) => Err(plan_datafusion_err!( + "expected with option '{}' to be a single-quoted 
string, but it was `{:?}`", + name, + e + )), + None => Ok(None), + } + } + pub fn pull_str(&mut self, name: &str) -> DFResult { self.pull_opt_str(name)? .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name)) diff --git a/src/sql/common/constants.rs b/src/sql/common/constants.rs index 19fdbcb3..8cdb68e3 100644 --- a/src/sql/common/constants.rs +++ b/src/sql/common/constants.rs @@ -139,6 +139,11 @@ pub mod interval_duration_unit { pub mod connection_format_value { pub const JSON: &str = "json"; + pub const CSV: &str = "csv"; + pub const JSONL: &str = "jsonl"; + pub const NDJSON: &str = "ndjson"; + pub const LANCE: &str = "lance"; + pub const ORC: &str = "orc"; pub const DEBEZIUM_JSON: &str = "debezium_json"; pub const AVRO: &str = "avro"; pub const PARQUET: &str = "parquet"; @@ -246,6 +251,8 @@ pub mod connector_type { pub const FILESYSTEM: &str = "filesystem"; pub const DELTA: &str = "delta"; pub const ICEBERG: &str = "iceberg"; + pub const LANCE_DB: &str = "lanceDB"; + pub const S3: &str = "s3"; pub const PULSAR: &str = "pulsar"; pub const NATS: &str = "nats"; pub const REDIS: &str = "redis"; @@ -264,7 +271,14 @@ pub mod connection_table_role { pub const LOOKUP: &str = "lookup"; } -pub const SUPPORTED_CONNECTOR_ADAPTERS: &[&str] = &[connector_type::KAFKA]; +pub const SUPPORTED_CONNECTOR_ADAPTERS: &[&str] = &[ + connector_type::KAFKA, + connector_type::FILESYSTEM, + connector_type::S3, + connector_type::DELTA, + connector_type::ICEBERG, + connector_type::LANCE_DB, +]; pub mod kafka_with_value { pub const SCAN_LATEST: &str = "latest"; diff --git a/src/sql/common/format_from_opts.rs b/src/sql/common/format_from_opts.rs index 276235c1..ffd29572 100644 --- a/src/sql/common/format_from_opts.rs +++ b/src/sql/common/format_from_opts.rs @@ -19,9 +19,9 @@ use datafusion::common::{Result as DFResult, plan_datafusion_err, plan_err}; use super::connector_options::ConnectorOptions; use super::constants::{bad_data_value, connection_format_value, 
framing_method_value}; use super::formats::{ - AvroFormat, BadData, DecimalEncoding, Format, Framing, JsonCompression, JsonFormat, - NewlineDelimitedFraming, ParquetCompression, ParquetFormat, ProtobufFormat, RawBytesFormat, - RawStringFormat, TimestampFormat, + AvroFormat, BadData, CsvFormat, DecimalEncoding, Format, Framing, JsonCompression, JsonFormat, + LanceFormat, NewlineDelimitedFraming, ParquetCompression, ParquetFormat, ProtobufFormat, + RawBytesFormat, RawStringFormat, TimestampFormat, }; use super::with_option_keys as opt; @@ -61,12 +61,14 @@ impl JsonFormat { impl Format { pub fn from_opts(opts: &mut ConnectorOptions) -> DFResult> { - let Some(name) = opts.pull_opt_str(opt::FORMAT)? else { + let Some(name) = opts.peek_opt_str(opt::FORMAT)? else { return Ok(None); }; let n = name.to_lowercase(); match n.as_str() { connection_format_value::JSON => Ok(Some(Format::Json(JsonFormat::from_opts(opts)?))), + connection_format_value::CSV => Ok(Some(Format::Csv(CsvFormat {}))), + connection_format_value::LANCE => Ok(Some(Format::Lance(LanceFormat {}))), connection_format_value::DEBEZIUM_JSON => { let mut j = JsonFormat::from_opts(opts)?; j.debezium = true; diff --git a/src/sql/common/formats.rs b/src/sql/common/formats.rs index aad3ce18..a47d93cf 100644 --- a/src/sql/common/formats.rs +++ b/src/sql/common/formats.rs @@ -115,6 +115,14 @@ pub struct RawStringFormat {} #[serde(rename_all = "snake_case")] pub struct RawBytesFormat {} +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct CsvFormat {} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct LanceFormat {} + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] #[serde(rename_all = "snake_case")] pub struct AvroFormat { @@ -199,6 +207,8 @@ pub struct ProtobufFormat { #[serde(rename_all = "snake_case", tag = "type")] pub enum 
Format { Json(JsonFormat), + Csv(CsvFormat), + Lance(LanceFormat), Avro(AvroFormat), Protobuf(ProtobufFormat), Parquet(ParquetFormat), @@ -216,6 +226,8 @@ impl Format { pub fn name(&self) -> &'static str { match self { Format::Json(_) => connection_format_value::JSON, + Format::Csv(_) => connection_format_value::CSV, + Format::Lance(_) => connection_format_value::LANCE, Format::Avro(_) => connection_format_value::AVRO, Format::Protobuf(_) => connection_format_value::PROTOBUF, Format::Parquet(_) => connection_format_value::PARQUET, diff --git a/src/sql/common/with_option_keys.rs b/src/sql/common/with_option_keys.rs index b998d1eb..21bfa691 100644 --- a/src/sql/common/with_option_keys.rs +++ b/src/sql/common/with_option_keys.rs @@ -16,6 +16,8 @@ pub const FORMAT: &str = "format"; pub const DEFAULT_FORMAT_VALUE: &str = "json"; pub const BAD_DATA: &str = "bad_data"; pub const PARTITION_BY: &str = "partition_by"; +pub const PATH: &str = "path"; +pub const SINK_PATH: &str = "sink.path"; pub const EVENT_TIME_FIELD: &str = "event_time_field"; pub const WATERMARK_FIELD: &str = "watermark_field"; @@ -71,6 +73,15 @@ pub const AVRO_SCHEMA_ID: &str = "avro.schema_id"; pub const PARQUET_COMPRESSION: &str = "parquet.compression"; pub const PARQUET_ROW_GROUP_BYTES: &str = "parquet.row_group_bytes"; +// ── S3 ──────────────────────────────────────────────────────────────────── + +pub const S3_BUCKET: &str = "s3.bucket"; +pub const S3_REGION: &str = "s3.region"; +pub const S3_ENDPOINT: &str = "s3.endpoint"; +pub const S3_ACCESS_KEY_ID: &str = "s3.access_key_id"; +pub const S3_SECRET_ACCESS_KEY: &str = "s3.secret_access_key"; +pub const S3_SESSION_TOKEN: &str = "s3.session_token"; + // ── Protobuf ──────────────────────────────────────────────────────────────── pub const PROTOBUF_INTO_UNSTRUCTURED_JSON: &str = "protobuf.into_unstructured_json"; @@ -84,3 +95,11 @@ pub const FRAMING_METHOD: &str = "framing.method"; pub const FRAMING_MAX_LINE_LENGTH: &str = "framing.max_line_length"; 
pub const FORMAT_DEBEZIUM_FLAG: &str = "format.debezium"; + +// ── Streaming runtime common options ─────────────────────────────────────── + +pub const CHECKPOINT_INTERVAL_MS: &str = "checkpoint.interval.ms"; +pub const PIPELINE_PARALLELISM: &str = "pipeline.parallelism"; +pub const KEY_BY_PARALLELISM: &str = "key_by.parallelism"; +pub const OPERATOR_MEMORY_BYTES: &str = "operator.memory.bytes"; +pub const SINK_MEMORY_BYTES: &str = "sink.memory.bytes"; diff --git a/src/sql/connector/config.rs b/src/sql/connector/config.rs new file mode 100644 index 00000000..c3eaf00f --- /dev/null +++ b/src/sql/connector/config.rs @@ -0,0 +1,91 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use protocol::function_stream_graph::{ + DeltaSinkConfig, FilesystemSinkConfig, IcebergSinkConfig, KafkaSinkConfig, KafkaSourceConfig, + LanceDbSinkConfig, S3SinkConfig, connector_op, +}; + +#[derive(Debug, Clone)] +pub enum ConnectorConfig { + KafkaSource(KafkaSourceConfig), + KafkaSink(KafkaSinkConfig), + FilesystemSink(FilesystemSinkConfig), + DeltaSink(DeltaSinkConfig), + IcebergSink(IcebergSinkConfig), + S3Sink(S3SinkConfig), + LanceDbSink(LanceDbSinkConfig), +} + +impl ConnectorConfig { + pub fn to_proto_config(&self) -> connector_op::Config { + match self { + ConnectorConfig::KafkaSource(cfg) => connector_op::Config::KafkaSource(cfg.clone()), + ConnectorConfig::KafkaSink(cfg) => connector_op::Config::KafkaSink(cfg.clone()), + ConnectorConfig::FilesystemSink(cfg) => { + connector_op::Config::FilesystemSink(cfg.clone()) + } + ConnectorConfig::DeltaSink(cfg) => connector_op::Config::DeltaSink(cfg.clone()), + ConnectorConfig::IcebergSink(cfg) => connector_op::Config::IcebergSink(cfg.clone()), + ConnectorConfig::S3Sink(cfg) => connector_op::Config::S3Sink(cfg.clone()), + ConnectorConfig::LanceDbSink(cfg) => connector_op::Config::LancedbSink(cfg.clone()), + } + } +} + +impl PartialEq for ConnectorConfig { + fn eq(&self, other: &Self) -> bool { + use prost::Message; + match (self, other) { + (ConnectorConfig::KafkaSource(a), ConnectorConfig::KafkaSource(b)) => { + a.encode_to_vec() == b.encode_to_vec() + } + (ConnectorConfig::KafkaSink(a), ConnectorConfig::KafkaSink(b)) => { + a.encode_to_vec() == b.encode_to_vec() + } + (ConnectorConfig::FilesystemSink(a), ConnectorConfig::FilesystemSink(b)) => { + a.encode_to_vec() == b.encode_to_vec() + } + (ConnectorConfig::DeltaSink(a), ConnectorConfig::DeltaSink(b)) => { + a.encode_to_vec() == b.encode_to_vec() + } + (ConnectorConfig::IcebergSink(a), ConnectorConfig::IcebergSink(b)) => { + a.encode_to_vec() == b.encode_to_vec() + } + (ConnectorConfig::S3Sink(a), ConnectorConfig::S3Sink(b)) => { + a.encode_to_vec() == 
b.encode_to_vec() + } + (ConnectorConfig::LanceDbSink(a), ConnectorConfig::LanceDbSink(b)) => { + a.encode_to_vec() == b.encode_to_vec() + } + _ => false, + } + } +} + +impl Eq for ConnectorConfig {} + +impl std::hash::Hash for ConnectorConfig { + fn hash(&self, state: &mut H) { + use prost::Message; + std::mem::discriminant(self).hash(state); + match self { + ConnectorConfig::KafkaSource(cfg) => cfg.encode_to_vec().hash(state), + ConnectorConfig::KafkaSink(cfg) => cfg.encode_to_vec().hash(state), + ConnectorConfig::FilesystemSink(cfg) => cfg.encode_to_vec().hash(state), + ConnectorConfig::DeltaSink(cfg) => cfg.encode_to_vec().hash(state), + ConnectorConfig::IcebergSink(cfg) => cfg.encode_to_vec().hash(state), + ConnectorConfig::S3Sink(cfg) => cfg.encode_to_vec().hash(state), + ConnectorConfig::LanceDbSink(cfg) => cfg.encode_to_vec().hash(state), + } + } +} diff --git a/src/sql/connector/factory.rs b/src/sql/connector/factory.rs new file mode 100644 index 00000000..8c37a15a --- /dev/null +++ b/src/sql/connector/factory.rs @@ -0,0 +1,67 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; + +use datafusion::arrow::datatypes::Schema; +use datafusion::common::Result; + +use super::config::ConnectorConfig; +use super::registry::REGISTRY; +use super::sink::runtime_config::SinkRuntimeConfig; +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::formats::{BadData, Format}; +use crate::sql::schema::table_role::TableRole; + +pub fn build_connector_config( + connector_name: &str, + role: TableRole, + options: &mut ConnectorOptions, + format: &Option, + bad_data: BadData, +) -> Result { + let runtime_opts_map = options.snapshot_for_catalog().into_iter().collect(); + let runtime_props = + SinkRuntimeConfig::from_options_map(&runtime_opts_map)?.to_runtime_properties(); + match role { + TableRole::Ingestion | TableRole::Reference => REGISTRY + .get_source(connector_name)? + .build_source_config(options, format, bad_data), + TableRole::Egress => { + REGISTRY + .get_sink(connector_name)? + .build_sink_config(options, format, &runtime_props) + } + } +} + +pub fn build_connector_config_from_options( + connector_name: &str, + role: TableRole, + options: &mut ConnectorOptions, + format: &Option, + bad_data: BadData, +) -> Result { + build_connector_config(connector_name, role, options, format, bad_data) +} + +pub fn build_connector_config_from_catalog( + connector_name: &str, + role: TableRole, + opts: HashMap, + _physical_schema: &Schema, +) -> Result { + let mut options = ConnectorOptions::from_flat_string_map(opts)?; + let format = Format::from_opts(&mut options)?; + let bad_data = BadData::from_opts(&mut options)?; + build_connector_config(connector_name, role, &mut options, &format, bad_data) +} diff --git a/src/sql/connector/mod.rs b/src/sql/connector/mod.rs new file mode 100644 index 00000000..f477c976 --- /dev/null +++ b/src/sql/connector/mod.rs @@ -0,0 +1,18 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the 
License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod config; +pub mod factory; +pub mod provider; +pub mod registry; +pub mod sink; +pub mod source; diff --git a/src/sql/connector/provider.rs b/src/sql/connector/provider.rs new file mode 100644 index 00000000..8875ee0c --- /dev/null +++ b/src/sql/connector/provider.rs @@ -0,0 +1,52 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use datafusion::common::{DataFusionError, Result}; + +use super::config::ConnectorConfig; +use super::sink::runtime_config::SinkRuntimeProperties; +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::formats::{BadData, Format}; + +pub trait SourceProvider: Send + Sync { + fn name(&self) -> &'static str; + + fn build_source_config( + &self, + options: &mut ConnectorOptions, + format: &Option, + bad_data: BadData, + ) -> Result; +} + +pub trait SinkProvider: Send + Sync { + fn name(&self) -> &'static str; + fn build_sink_config( + &self, + options: &mut ConnectorOptions, + format: &Option, + runtime_props: &SinkRuntimeProperties, + ) -> Result; +} + +pub fn require_option( + options: &mut ConnectorOptions, + key: &str, + connector_name: &str, +) -> Result { + options.pull_opt_str(key)?.ok_or_else(|| { + DataFusionError::Plan(format!( + "Connector '{}' requires option '{}' to be set", + connector_name, key + )) + }) +} diff --git a/src/sql/connector/registry.rs b/src/sql/connector/registry.rs new file mode 100644 index 00000000..4a8a8c1c --- /dev/null +++ b/src/sql/connector/registry.rs @@ -0,0 +1,86 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::{Arc, LazyLock}; + +use datafusion::common::{DataFusionError, Result}; + +use super::provider::{SinkProvider, SourceProvider}; +use super::sink::delta::DeltaSinkConnector; +use super::sink::filesystem::FilesystemSinkConnector; +use super::sink::iceberg::IcebergSinkConnector; +use super::sink::kafka::KafkaSinkConnector; +use super::sink::lancedb::LanceDbSinkConnector; +use super::sink::s3::S3SinkConnector; +use super::source::kafka::KafkaSourceConnector; + +pub struct ConnectorRegistry { + sources: HashMap>, + sinks: HashMap>, +} + +impl ConnectorRegistry { + fn new() -> Self { + let mut registry = Self { + sources: HashMap::new(), + sinks: HashMap::new(), + }; + + registry.register_source(Arc::new(KafkaSourceConnector)); + + registry.register_sink(Arc::new(KafkaSinkConnector)); + registry.register_sink(Arc::new(S3SinkConnector)); + registry.register_sink(Arc::new(FilesystemSinkConnector)); + registry.register_sink(Arc::new(DeltaSinkConnector)); + registry.register_sink(Arc::new(IcebergSinkConnector)); + registry.register_sink(Arc::new(LanceDbSinkConnector)); + + registry + } + + pub fn register_source(&mut self, provider: Arc) { + self.sources + .insert(provider.name().to_ascii_lowercase(), provider); + } + + pub fn register_sink(&mut self, provider: Arc) { + self.sinks + .insert(provider.name().to_ascii_lowercase(), provider); + } + + pub fn get_source(&self, connector_name: &str) -> Result> { + self.sources + .get(&connector_name.to_ascii_lowercase()) + .cloned() + .ok_or_else(|| { + DataFusionError::Plan(format!( + "Connector '{}' is not registered or does not support being used as a SOURCE", + connector_name + )) + }) + } + + pub fn get_sink(&self, connector_name: &str) -> Result> { + self.sinks + .get(&connector_name.to_ascii_lowercase()) + .cloned() + .ok_or_else(|| { + DataFusionError::Plan(format!( + "Connector '{}' is not registered or does not support being used as a SINK", + connector_name + )) + }) + } 
+} + +pub static REGISTRY: LazyLock = LazyLock::new(ConnectorRegistry::new); diff --git a/src/sql/connector/sink/delta.rs b/src/sql/connector/sink/delta.rs new file mode 100644 index 00000000..cd86660d --- /dev/null +++ b/src/sql/connector/sink/delta.rs @@ -0,0 +1,60 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::Result; +use protocol::function_stream_graph::{DeltaSinkConfig, SinkFormatProto}; + +use crate::sql::common::Format; +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::constants::connector_type; +use crate::sql::connector::config::ConnectorConfig; +use crate::sql::connector::provider::SinkProvider; +use crate::sql::connector::sink::runtime_config::SinkRuntimeProperties; +use crate::sql::connector::sink::utils::SinkUtils; + +pub struct DeltaSinkConnector; + +impl SinkProvider for DeltaSinkConnector { + fn name(&self) -> &'static str { + connector_type::DELTA + } + + fn build_sink_config( + &self, + options: &mut ConnectorOptions, + format: &Option, + runtime_props: &SinkRuntimeProperties, + ) -> Result { + let path = SinkUtils::require_path(options)?; + let parquet_compression = SinkUtils::extract_parquet_compression(options)?; + let format_proto = SinkUtils::resolve_sink_format( + format, + self.name(), + &[ + SinkFormatProto::SinkFormatCsv, + SinkFormatProto::SinkFormatJsonl, + SinkFormatProto::SinkFormatAvro, + SinkFormatProto::SinkFormatParquet, + SinkFormatProto::SinkFormatOrc, + ], 
+ )?; + let extra_properties = options.drain_remaining_string_values()?; + + Ok(ConnectorConfig::DeltaSink(DeltaSinkConfig { + path, + format: format_proto, + parquet_compression, + extra_properties, + runtime_properties: runtime_props.clone(), + })) + } +} diff --git a/src/sql/connector/sink/filesystem.rs b/src/sql/connector/sink/filesystem.rs new file mode 100644 index 00000000..224b1805 --- /dev/null +++ b/src/sql/connector/sink/filesystem.rs @@ -0,0 +1,60 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use datafusion::common::Result; +use protocol::function_stream_graph::{FilesystemSinkConfig, SinkFormatProto}; + +use crate::sql::common::Format; +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::constants::connector_type; +use crate::sql::connector::config::ConnectorConfig; +use crate::sql::connector::provider::SinkProvider; +use crate::sql::connector::sink::runtime_config::SinkRuntimeProperties; +use crate::sql::connector::sink::utils::SinkUtils; + +pub struct FilesystemSinkConnector; + +impl SinkProvider for FilesystemSinkConnector { + fn name(&self) -> &'static str { + connector_type::FILESYSTEM + } + + fn build_sink_config( + &self, + options: &mut ConnectorOptions, + format: &Option, + runtime_props: &SinkRuntimeProperties, + ) -> Result { + let path = SinkUtils::require_path(options)?; + let parquet_compression = SinkUtils::extract_parquet_compression(options)?; + let format_proto = SinkUtils::resolve_sink_format( + format, + self.name(), + &[ + SinkFormatProto::SinkFormatCsv, + SinkFormatProto::SinkFormatJsonl, + SinkFormatProto::SinkFormatAvro, + SinkFormatProto::SinkFormatParquet, + SinkFormatProto::SinkFormatOrc, + ], + )?; + let extra_properties = options.drain_remaining_string_values()?; + + Ok(ConnectorConfig::FilesystemSink(FilesystemSinkConfig { + path, + format: format_proto, + parquet_compression, + extra_properties, + runtime_properties: runtime_props.clone(), + })) + } +} diff --git a/src/sql/connector/sink/iceberg.rs b/src/sql/connector/sink/iceberg.rs new file mode 100644 index 00000000..12f0d378 --- /dev/null +++ b/src/sql/connector/sink/iceberg.rs @@ -0,0 +1,57 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::Result; +use protocol::function_stream_graph::{IcebergSinkConfig, SinkFormatProto}; + +use crate::sql::common::Format; +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::constants::connector_type; +use crate::sql::connector::config::ConnectorConfig; +use crate::sql::connector::provider::SinkProvider; +use crate::sql::connector::sink::runtime_config::SinkRuntimeProperties; +use crate::sql::connector::sink::utils::SinkUtils; + +pub struct IcebergSinkConnector; + +impl SinkProvider for IcebergSinkConnector { + fn name(&self) -> &'static str { + connector_type::ICEBERG + } + + fn build_sink_config( + &self, + options: &mut ConnectorOptions, + format: &Option, + runtime_props: &SinkRuntimeProperties, + ) -> Result { + let path = SinkUtils::require_path(options)?; + let parquet_compression = SinkUtils::extract_parquet_compression(options)?; + let format_proto = SinkUtils::resolve_sink_format( + format, + self.name(), + &[ + SinkFormatProto::SinkFormatCsv, + SinkFormatProto::SinkFormatParquet, + ], + )?; + let extra_properties = options.drain_remaining_string_values()?; + + Ok(ConnectorConfig::IcebergSink(IcebergSinkConfig { + path, + format: format_proto, + parquet_compression, + extra_properties, + runtime_properties: runtime_props.clone(), + })) + } +} diff --git a/src/sql/connector/sink/kafka.rs b/src/sql/connector/sink/kafka.rs new file mode 100644 index 00000000..a6fd115c --- /dev/null +++ b/src/sql/connector/sink/kafka.rs @@ -0,0 +1,159 @@ +// Licensed under the Apache License, Version 
2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::{Result, plan_datafusion_err, plan_err}; +use protocol::function_stream_graph::{ + DecimalEncodingProto, FormatConfig, JsonFormatConfig, KafkaAuthConfig, KafkaAuthNone, + KafkaSinkCommitMode, KafkaSinkConfig, RawBytesFormatConfig, RawStringFormatConfig, + TimestampFormatProto, format_config, kafka_auth_config, +}; + +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::constants::{connector_type, kafka_with_value}; +use crate::sql::common::formats::{ + DecimalEncoding as SqlDecimalEncoding, Format as SqlFormat, + TimestampFormat as SqlTimestampFormat, +}; +use crate::sql::common::with_option_keys as opt; +use crate::sql::connector::config::ConnectorConfig; +use crate::sql::connector::provider::SinkProvider; +use crate::sql::connector::sink::runtime_config::SinkRuntimeProperties; + +pub struct KafkaSinkConnector; + +impl KafkaSinkConnector { + fn sql_format_to_proto(fmt: &SqlFormat) -> Result { + match fmt { + SqlFormat::Json(j) => Ok(FormatConfig { + format: Some(format_config::Format::Json(JsonFormatConfig { + timestamp_format: match j.timestamp_format { + SqlTimestampFormat::RFC3339 => { + TimestampFormatProto::TimestampRfc3339 as i32 + } + SqlTimestampFormat::UnixMillis => { + TimestampFormatProto::TimestampUnixMillis as i32 + } + }, + decimal_encoding: match j.decimal_encoding { + SqlDecimalEncoding::Number => DecimalEncodingProto::DecimalNumber as i32, + SqlDecimalEncoding::String => 
DecimalEncodingProto::DecimalString as i32, + SqlDecimalEncoding::Bytes => DecimalEncodingProto::DecimalBytes as i32, + }, + include_schema: j.include_schema, + confluent_schema_registry: j.confluent_schema_registry, + schema_id: j.schema_id, + debezium: j.debezium, + unstructured: j.unstructured, + })), + }), + SqlFormat::RawString(_) => Ok(FormatConfig { + format: Some(format_config::Format::RawString(RawStringFormatConfig {})), + }), + SqlFormat::RawBytes(_) => Ok(FormatConfig { + format: Some(format_config::Format::RawBytes(RawBytesFormatConfig {})), + }), + other => plan_err!( + "Kafka sink connector: format '{}' is not supported", + other.name() + ), + } + } +} + +impl SinkProvider for KafkaSinkConnector { + fn name(&self) -> &'static str { + connector_type::KAFKA + } + + fn build_sink_config( + &self, + options: &mut ConnectorOptions, + format: &Option, + _runtime_props: &SinkRuntimeProperties, + ) -> Result { + let bootstrap_servers = match options.pull_opt_str(opt::KAFKA_BOOTSTRAP_SERVERS)? { + Some(s) => s, + None => options + .pull_opt_str(opt::KAFKA_BOOTSTRAP_SERVERS_LEGACY)? + .ok_or_else(|| { + plan_datafusion_err!( + "Kafka connector requires 'bootstrap.servers' in the WITH clause" + ) + })?, + }; + + let topic = options.pull_opt_str(opt::KAFKA_TOPIC)?.ok_or_else(|| { + plan_datafusion_err!("Kafka connector requires 'topic' in the WITH clause") + })?; + + let sql_format = format.as_ref().ok_or_else(|| { + plan_datafusion_err!( + "Kafka sink requires 'format' in the WITH clause (e.g. format = 'json')" + ) + })?; + let proto_format = Self::sql_format_to_proto(sql_format)?; + + let value_subject = options.pull_opt_str(opt::KAFKA_VALUE_SUBJECT)?; + + let commit_mode = match options + .pull_opt_str(opt::KAFKA_SINK_COMMIT_MODE)? 
+ .as_deref() + { + Some(s) + if s == kafka_with_value::SINK_COMMIT_EXACTLY_ONCE_HYPHEN + || s == kafka_with_value::SINK_COMMIT_EXACTLY_ONCE_UNDERSCORE => + { + KafkaSinkCommitMode::KafkaSinkExactlyOnce as i32 + } + Some(s) + if s == kafka_with_value::SINK_COMMIT_AT_LEAST_ONCE_HYPHEN + || s == kafka_with_value::SINK_COMMIT_AT_LEAST_ONCE_UNDERSCORE => + { + KafkaSinkCommitMode::KafkaSinkAtLeastOnce as i32 + } + None => KafkaSinkCommitMode::KafkaSinkAtLeastOnce as i32, + Some(other) => return plan_err!("invalid sink.commit.mode '{other}'"), + }; + + let key_field = match options.pull_opt_str(opt::KAFKA_SINK_KEY_FIELD)? { + Some(s) => Some(s), + None => options.pull_opt_str(opt::KAFKA_KEY_FIELD_LEGACY)?, + }; + let timestamp_field = match options.pull_opt_str(opt::KAFKA_SINK_TIMESTAMP_FIELD)? { + Some(s) => Some(s), + None => options.pull_opt_str(opt::KAFKA_TIMESTAMP_FIELD_LEGACY)?, + }; + + let _ = options.pull_opt_str(opt::TYPE)?; + let _ = options.pull_opt_str(opt::CONNECTOR)?; + + let mut client_configs = options.drain_remaining_string_values()?; + client_configs.remove(opt::CHECKPOINT_INTERVAL_MS); + client_configs.remove(opt::PIPELINE_PARALLELISM); + client_configs.remove(opt::KEY_BY_PARALLELISM); + client_configs.remove(opt::FORMAT); + + Ok(ConnectorConfig::KafkaSink(KafkaSinkConfig { + topic, + bootstrap_servers, + commit_mode, + key_field, + timestamp_field, + auth: Some(KafkaAuthConfig { + auth: Some(kafka_auth_config::Auth::None(KafkaAuthNone {})), + }), + client_configs, + format: Some(proto_format), + value_subject, + })) + } +} diff --git a/src/sql/connector/sink/lancedb.rs b/src/sql/connector/sink/lancedb.rs new file mode 100644 index 00000000..aee79735 --- /dev/null +++ b/src/sql/connector/sink/lancedb.rs @@ -0,0 +1,61 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::Result; +use protocol::function_stream_graph::{LanceDbSinkConfig, SinkFormatProto}; + +use crate::sql::common::Format; +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::with_option_keys as opt; +use crate::sql::connector::config::ConnectorConfig; +use crate::sql::connector::provider::SinkProvider; +use crate::sql::connector::sink::runtime_config::SinkRuntimeProperties; +use crate::sql::connector::sink::utils::SinkUtils; + +pub struct LanceDbSinkConnector; + +impl SinkProvider for LanceDbSinkConnector { + fn name(&self) -> &'static str { + "lancedb" + } + + fn build_sink_config( + &self, + options: &mut ConnectorOptions, + _format: &Option, + runtime_props: &SinkRuntimeProperties, + ) -> Result { + let path = SinkUtils::require_path(options)?; + + let s3_bucket = options.pull_opt_str(opt::S3_BUCKET)?; + let s3_region = options.pull_opt_str(opt::S3_REGION)?; + let s3_endpoint = options.pull_opt_str(opt::S3_ENDPOINT)?; + let s3_access_key_id = options.pull_opt_str(opt::S3_ACCESS_KEY_ID)?; + let s3_secret_access_key = options.pull_opt_str(opt::S3_SECRET_ACCESS_KEY)?; + let s3_session_token = options.pull_opt_str(opt::S3_SESSION_TOKEN)?; + + let extra_properties = options.drain_remaining_string_values()?; + + Ok(ConnectorConfig::LanceDbSink(LanceDbSinkConfig { + path, + format: SinkFormatProto::SinkFormatLance as i32, + s3_bucket, + s3_region, + s3_endpoint, + s3_access_key_id, + s3_secret_access_key, + s3_session_token, + extra_properties, + runtime_properties: runtime_props.clone(), + })) 
+ } +} diff --git a/src/sql/connector/sink/mod.rs b/src/sql/connector/sink/mod.rs new file mode 100644 index 00000000..b7d645ca --- /dev/null +++ b/src/sql/connector/sink/mod.rs @@ -0,0 +1,20 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod delta; +pub mod filesystem; +pub mod iceberg; +pub mod kafka; +pub mod lancedb; +pub mod runtime_config; +pub mod s3; +pub mod utils; diff --git a/src/sql/connector/sink/runtime_config.rs b/src/sql/connector/sink/runtime_config.rs new file mode 100644 index 00000000..e0ffaeee --- /dev/null +++ b/src/sql/connector/sink/runtime_config.rs @@ -0,0 +1,137 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; + +use datafusion::common::{DataFusionError, Result, plan_err}; + +use crate::config::global_config::{ + DEFAULT_OPERATOR_STATE_STORE_MEMORY_BYTES, DEFAULT_SINK_BUFFER_MEMORY_BYTES, +}; +use crate::config::streaming_job::DEFAULT_CHECKPOINT_INTERVAL_MS; +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::with_option_keys as opt; + +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct SinkRuntimeConfig { + pub pipeline_parallelism: Option, + pub key_by_parallelism: Option, + pub checkpoint_interval_ms: u64, + pub operator_memory_bytes: u64, + pub sink_memory_bytes: u64, +} + +pub type SinkRuntimeProperties = HashMap; + +impl SinkRuntimeConfig { + pub fn extract_from_options(options: &mut ConnectorOptions) -> Result { + let pipeline_parallelism = options + .pull_opt_u64(opt::PIPELINE_PARALLELISM)? + .map(|v| v as u32); + let key_by_parallelism = options + .pull_opt_u64(opt::KEY_BY_PARALLELISM)? + .map(|v| v as u32); + let checkpoint_interval_ms = options + .pull_opt_u64(opt::CHECKPOINT_INTERVAL_MS)? + .unwrap_or(DEFAULT_CHECKPOINT_INTERVAL_MS); + let operator_memory_bytes = options + .pull_opt_u64(opt::OPERATOR_MEMORY_BYTES)? + .unwrap_or(DEFAULT_OPERATOR_STATE_STORE_MEMORY_BYTES); + let sink_memory_bytes = options + .pull_opt_u64(opt::SINK_MEMORY_BYTES)? + .unwrap_or(DEFAULT_SINK_BUFFER_MEMORY_BYTES); + Ok(Self { + pipeline_parallelism, + key_by_parallelism, + checkpoint_interval_ms, + operator_memory_bytes, + sink_memory_bytes, + }) + } + + pub fn from_options_map(opts: &HashMap) -> Result { + let pipeline_parallelism = parse_opt_u32(opts, opt::PIPELINE_PARALLELISM)?; + let key_by_parallelism = parse_opt_u32(opts, opt::KEY_BY_PARALLELISM)?; + let checkpoint_interval_ms = parse_opt_u64(opts, opt::CHECKPOINT_INTERVAL_MS)? + .unwrap_or(DEFAULT_CHECKPOINT_INTERVAL_MS); + let operator_memory_bytes = parse_opt_u64(opts, opt::OPERATOR_MEMORY_BYTES)? 
+ .unwrap_or(DEFAULT_OPERATOR_STATE_STORE_MEMORY_BYTES); + let sink_memory_bytes = parse_opt_u64(opts, opt::SINK_MEMORY_BYTES)? + .unwrap_or(DEFAULT_SINK_BUFFER_MEMORY_BYTES); + Ok(Self { + pipeline_parallelism, + key_by_parallelism, + checkpoint_interval_ms, + operator_memory_bytes, + sink_memory_bytes, + }) + } + + pub fn to_runtime_properties(&self) -> HashMap { + let mut out = HashMap::new(); + if let Some(v) = self.pipeline_parallelism { + out.insert(opt::PIPELINE_PARALLELISM.to_string(), v.to_string()); + } + if let Some(v) = self.key_by_parallelism { + out.insert(opt::KEY_BY_PARALLELISM.to_string(), v.to_string()); + } + out.insert( + opt::CHECKPOINT_INTERVAL_MS.to_string(), + self.checkpoint_interval_ms.to_string(), + ); + out.insert( + opt::OPERATOR_MEMORY_BYTES.to_string(), + self.operator_memory_bytes.to_string(), + ); + out.insert( + opt::SINK_MEMORY_BYTES.to_string(), + self.sink_memory_bytes.to_string(), + ); + out + } +} + +fn parse_opt_u32(opts: &HashMap, key: &str) -> Result> { + let Some(raw) = opts.get(key) else { + return Ok(None); + }; + let normalized = normalize_numeric_option(raw); + let parsed = normalized.parse::().map_err(|_| { + DataFusionError::Plan(format!( + "WITH option '{key}' expects unsigned integer, got '{raw}'" + )) + })?; + if parsed == 0 { + return plan_err!("WITH option '{key}' must be > 0"); + } + Ok(Some(parsed)) +} + +fn parse_opt_u64(opts: &HashMap, key: &str) -> Result> { + let Some(raw) = opts.get(key) else { + return Ok(None); + }; + let normalized = normalize_numeric_option(raw); + let parsed = normalized.parse::().map_err(|_| { + DataFusionError::Plan(format!( + "WITH option '{key}' expects unsigned integer, got '{raw}'" + )) + })?; + if parsed == 0 { + return plan_err!("WITH option '{key}' must be > 0"); + } + Ok(Some(parsed)) +} + +fn normalize_numeric_option(raw: &str) -> &str { + raw.trim().trim_matches('\'').trim_matches('"').trim() +} diff --git a/src/sql/connector/sink/s3.rs b/src/sql/connector/sink/s3.rs new 
file mode 100644 index 00000000..5d04ce46 --- /dev/null +++ b/src/sql/connector/sink/s3.rs @@ -0,0 +1,75 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::Result; +use protocol::function_stream_graph::{S3SinkConfig, SinkFormatProto}; + +use crate::sql::common::Format; +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::constants::connector_type; +use crate::sql::common::with_option_keys as opt; +use crate::sql::connector::config::ConnectorConfig; +use crate::sql::connector::provider::SinkProvider; +use crate::sql::connector::sink::runtime_config::SinkRuntimeProperties; +use crate::sql::connector::sink::utils::SinkUtils; + +pub struct S3SinkConnector; + +impl SinkProvider for S3SinkConnector { + fn name(&self) -> &'static str { + connector_type::S3 + } + + fn build_sink_config( + &self, + options: &mut ConnectorOptions, + format: &Option, + runtime_props: &SinkRuntimeProperties, + ) -> Result { + let path = SinkUtils::require_path(options)?; + + let format_proto = SinkUtils::resolve_sink_format( + format, + self.name(), + &[ + SinkFormatProto::SinkFormatCsv, + SinkFormatProto::SinkFormatParquet, + ], + )?; + + let bucket = SinkUtils::require_str(options, opt::S3_BUCKET, self.name())?; + let region = options + .pull_opt_str(opt::S3_REGION)? 
+ .unwrap_or_else(|| "us-east-1".to_string()); + let endpoint = options.pull_opt_str(opt::S3_ENDPOINT)?; + let access_key_id = options.pull_opt_str(opt::S3_ACCESS_KEY_ID)?; + let secret_access_key = options.pull_opt_str(opt::S3_SECRET_ACCESS_KEY)?; + let session_token = options.pull_opt_str(opt::S3_SESSION_TOKEN)?; + + let parquet_compression = SinkUtils::extract_parquet_compression(options)?; + let extra_properties = options.drain_remaining_string_values()?; + + Ok(ConnectorConfig::S3Sink(S3SinkConfig { + path, + format: format_proto, + bucket, + region, + endpoint, + access_key_id, + secret_access_key, + session_token, + parquet_compression, + extra_properties, + runtime_properties: runtime_props.clone(), + })) + } +} diff --git a/src/sql/connector/sink/utils.rs b/src/sql/connector/sink/utils.rs new file mode 100644 index 00000000..e61cd870 --- /dev/null +++ b/src/sql/connector/sink/utils.rs @@ -0,0 +1,91 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use datafusion::common::{DataFusionError, Result, plan_err}; +use protocol::function_stream_graph::{ParquetCompressionProto, SinkFormatProto}; + +use crate::sql::common::Format; +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::constants::parquet_compression_value; +use crate::sql::common::with_option_keys as opt; + +pub struct SinkUtils; + +impl SinkUtils { + pub fn require_path(options: &mut ConnectorOptions) -> Result { + if let Some(v) = options.pull_opt_str(opt::PATH)? { + return Ok(v); + } + if let Some(v) = options.pull_opt_str(opt::SINK_PATH)? { + return Ok(v); + } + plan_err!("Missing required WITH option 'path' (or 'sink.path')") + } + + pub fn extract_parquet_compression(options: &mut ConnectorOptions) -> Result> { + let Some(v) = options.pull_opt_str(opt::PARQUET_COMPRESSION)? else { + return Ok(None); + }; + let parsed = match v.to_ascii_lowercase().as_str() { + parquet_compression_value::UNCOMPRESSED => { + ParquetCompressionProto::ParquetCompressionUncompressed + } + parquet_compression_value::SNAPPY => ParquetCompressionProto::ParquetCompressionSnappy, + parquet_compression_value::GZIP => ParquetCompressionProto::ParquetCompressionGzip, + parquet_compression_value::ZSTD => ParquetCompressionProto::ParquetCompressionZstd, + parquet_compression_value::LZ4 => ParquetCompressionProto::ParquetCompressionLz4, + parquet_compression_value::LZ4_RAW => ParquetCompressionProto::ParquetCompressionLz4Raw, + other => return plan_err!("Unsupported parquet.compression '{other}'"), + }; + Ok(Some(parsed as i32)) + } + + pub fn require_str( + options: &mut ConnectorOptions, + key: &str, + connector: &str, + ) -> Result { + options.pull_opt_str(key)?.ok_or_else(|| { + DataFusionError::Plan(format!( + "Connector '{connector}' requires WITH option '{key}'" + )) + }) + } + + pub fn resolve_sink_format( + format: &Option, + connector_name: &str, + supported_formats: &[SinkFormatProto], + ) -> Result { + let proto_format = match 
format { + Some(Format::Csv(_)) => SinkFormatProto::SinkFormatCsv, + Some(Format::Json(_)) => SinkFormatProto::SinkFormatJsonl, + Some(Format::Avro(_)) => SinkFormatProto::SinkFormatAvro, + Some(Format::Parquet(_)) => SinkFormatProto::SinkFormatParquet, + Some(Format::Lance(_)) => SinkFormatProto::SinkFormatLance, + Some(f) => { + return plan_err!("Format '{f:?}' cannot be mapped to a sink format"); + } + None => { + return plan_err!("Connector '{connector_name}' requires a format to be specified"); + } + }; + + if !supported_formats.contains(&proto_format) { + return plan_err!( + "Format {proto_format:?} is not supported by connector '{connector_name}'" + ); + } + + Ok(proto_format as i32) + } +} diff --git a/src/sql/connector/source/kafka.rs b/src/sql/connector/source/kafka.rs new file mode 100644 index 00000000..0bc220d9 --- /dev/null +++ b/src/sql/connector/source/kafka.rs @@ -0,0 +1,185 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use datafusion::common::{Result, plan_datafusion_err, plan_err}; +use protocol::function_stream_graph::{ + BadDataPolicy, DecimalEncodingProto, FormatConfig, JsonFormatConfig, KafkaAuthConfig, + KafkaAuthNone, KafkaOffsetMode, KafkaReadMode, KafkaSourceConfig, RawBytesFormatConfig, + RawStringFormatConfig, TimestampFormatProto, format_config, kafka_auth_config, +}; + +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::constants::{connector_type, kafka_with_value}; +use crate::sql::common::formats::{ + BadData, DecimalEncoding as SqlDecimalEncoding, Format as SqlFormat, + TimestampFormat as SqlTimestampFormat, +}; +use crate::sql::common::with_option_keys as opt; +use crate::sql::connector::config::ConnectorConfig; +use crate::sql::connector::provider::SourceProvider; + +pub struct KafkaSourceConnector; + +impl KafkaSourceConnector { + fn sql_format_to_proto(fmt: &SqlFormat) -> Result { + match fmt { + SqlFormat::Json(j) => Ok(FormatConfig { + format: Some(format_config::Format::Json(JsonFormatConfig { + timestamp_format: match j.timestamp_format { + SqlTimestampFormat::RFC3339 => { + TimestampFormatProto::TimestampRfc3339 as i32 + } + SqlTimestampFormat::UnixMillis => { + TimestampFormatProto::TimestampUnixMillis as i32 + } + }, + decimal_encoding: match j.decimal_encoding { + SqlDecimalEncoding::Number => DecimalEncodingProto::DecimalNumber as i32, + SqlDecimalEncoding::String => DecimalEncodingProto::DecimalString as i32, + SqlDecimalEncoding::Bytes => DecimalEncodingProto::DecimalBytes as i32, + }, + include_schema: j.include_schema, + confluent_schema_registry: j.confluent_schema_registry, + schema_id: j.schema_id, + debezium: j.debezium, + unstructured: j.unstructured, + })), + }), + SqlFormat::RawString(_) => Ok(FormatConfig { + format: Some(format_config::Format::RawString(RawStringFormatConfig {})), + }), + SqlFormat::RawBytes(_) => Ok(FormatConfig { + format: Some(format_config::Format::RawBytes(RawBytesFormatConfig 
{})), + }), + other => plan_err!( + "Kafka source connector: format '{}' is not supported", + other.name() + ), + } + } + + fn bad_data_to_proto(bad: &BadData) -> i32 { + match bad { + BadData::Fail {} => BadDataPolicy::BadDataFail as i32, + BadData::Drop {} => BadDataPolicy::BadDataDrop as i32, + } + } +} + +impl SourceProvider for KafkaSourceConnector { + fn name(&self) -> &'static str { + connector_type::KAFKA + } + + fn build_source_config( + &self, + options: &mut ConnectorOptions, + format: &Option, + bad_data: BadData, + ) -> Result { + let bootstrap_servers = match options.pull_opt_str(opt::KAFKA_BOOTSTRAP_SERVERS)? { + Some(s) => s, + None => options + .pull_opt_str(opt::KAFKA_BOOTSTRAP_SERVERS_LEGACY)? + .ok_or_else(|| { + plan_datafusion_err!( + "Kafka connector requires 'bootstrap.servers' in the WITH clause" + ) + })?, + }; + + let topic = options.pull_opt_str(opt::KAFKA_TOPIC)?.ok_or_else(|| { + plan_datafusion_err!("Kafka connector requires 'topic' in the WITH clause") + })?; + + let sql_format = format.as_ref().ok_or_else(|| { + plan_datafusion_err!( + "Kafka source requires 'format' in the WITH clause (e.g. format = 'json')" + ) + })?; + let proto_format = Self::sql_format_to_proto(sql_format)?; + + let rate_limit = options + .pull_opt_u64(opt::KAFKA_RATE_LIMIT_MESSAGES_PER_SECOND)? + .map(|v| v.clamp(1, u32::MAX as u64) as u32) + .unwrap_or(0); + + let value_subject = options.pull_opt_str(opt::KAFKA_VALUE_SUBJECT)?; + + let offset_mode = match options + .pull_opt_str(opt::KAFKA_SCAN_STARTUP_MODE)? 
+ .as_deref() + { + Some(s) if s == kafka_with_value::SCAN_LATEST => { + KafkaOffsetMode::KafkaOffsetLatest as i32 + } + Some(s) if s == kafka_with_value::SCAN_EARLIEST => { + KafkaOffsetMode::KafkaOffsetEarliest as i32 + } + Some(s) + if s == kafka_with_value::SCAN_GROUP_OFFSETS + || s == kafka_with_value::SCAN_GROUP => + { + KafkaOffsetMode::KafkaOffsetGroup as i32 + } + None => KafkaOffsetMode::KafkaOffsetGroup as i32, + Some(other) => { + return plan_err!( + "invalid scan.startup.mode '{other}'; expected latest, earliest, or group-offsets" + ); + } + }; + + let read_mode = match options.pull_opt_str(opt::KAFKA_ISOLATION_LEVEL)?.as_deref() { + Some(s) if s == kafka_with_value::ISOLATION_READ_COMMITTED => { + KafkaReadMode::KafkaReadCommitted as i32 + } + Some(s) if s == kafka_with_value::ISOLATION_READ_UNCOMMITTED => { + KafkaReadMode::KafkaReadUncommitted as i32 + } + None => KafkaReadMode::KafkaReadDefault as i32, + Some(other) => return plan_err!("invalid isolation.level '{other}'"), + }; + + let group_id = match options.pull_opt_str(opt::KAFKA_GROUP_ID)? 
{ + Some(s) => Some(s), + None => options.pull_opt_str(opt::KAFKA_GROUP_ID_LEGACY)?, + }; + let group_id_prefix = options.pull_opt_str(opt::KAFKA_GROUP_ID_PREFIX)?; + + let _ = options.pull_opt_str(opt::TYPE)?; + let _ = options.pull_opt_str(opt::CONNECTOR)?; + + let mut client_configs = options.drain_remaining_string_values()?; + client_configs.remove(opt::CHECKPOINT_INTERVAL_MS); + client_configs.remove(opt::PIPELINE_PARALLELISM); + client_configs.remove(opt::KEY_BY_PARALLELISM); + client_configs.remove(opt::FORMAT); + + Ok(ConnectorConfig::KafkaSource(KafkaSourceConfig { + topic, + bootstrap_servers, + group_id, + group_id_prefix, + offset_mode, + read_mode, + auth: Some(KafkaAuthConfig { + auth: Some(kafka_auth_config::Auth::None(KafkaAuthNone {})), + }), + client_configs, + format: Some(proto_format), + bad_data_policy: Self::bad_data_to_proto(&bad_data), + rate_limit_msgs_per_sec: rate_limit, + value_subject, + })) + } +} diff --git a/src/sql/connector/source/mod.rs b/src/sql/connector/source/mod.rs new file mode 100644 index 00000000..b9574391 --- /dev/null +++ b/src/sql/connector/source/mod.rs @@ -0,0 +1,13 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod kafka; diff --git a/src/sql/logical_node/lookup.rs b/src/sql/logical_node/lookup.rs index c060ba82..d2817c85 100644 --- a/src/sql/logical_node/lookup.rs +++ b/src/sql/logical_node/lookup.rs @@ -10,7 +10,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashMap; use std::fmt::Formatter; use std::sync::Arc; @@ -22,9 +21,7 @@ use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; use prost::Message; use protocol::function_stream_graph; -use protocol::function_stream_graph::{ - ConnectorOp, GenericConnectorConfig, LookupJoinCondition, LookupJoinOperator, -}; +use protocol::function_stream_graph::{ConnectorOp, LookupJoinCondition, LookupJoinOperator}; use crate::multifield_partial_ord; use crate::sql::common::constants::extension_node; @@ -32,7 +29,7 @@ use crate::sql::common::{FsSchema, FsSchemaRef}; use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; use crate::sql::logical_node::{CompiledTopologyNode, StreamingOperatorBlueprint}; use crate::sql::logical_planner::planner::{NamedNode, Planner}; -use crate::sql::schema::SourceTable; +use crate::sql::schema::LookupTable; use crate::sql::schema::utils::add_timestamp_field_arrow; pub const DICTIONARY_SOURCE_NODE_NAME: &str = extension_node::REFERENCE_TABLE_SOURCE; @@ -40,7 +37,7 @@ pub const STREAM_DICTIONARY_JOIN_NODE_NAME: &str = extension_node::STREAM_REFERE #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct ReferenceTableSourceNode { - pub(crate) source_definition: SourceTable, + pub(crate) source_definition: LookupTable, pub(crate) resolved_schema: DFSchemaRef, } @@ -85,7 +82,7 @@ impl UserDefinedLogicalNodeCore for ReferenceTableSourceNode { pub struct StreamReferenceJoinNode { pub(crate) upstream_stream_plan: LogicalPlan, pub(crate) output_schema: DFSchemaRef, - pub(crate) external_dictionary: SourceTable, + pub(crate) external_dictionary: LookupTable, 
pub(crate) equijoin_conditions: Vec<(Expr, Column)>, pub(crate) post_join_filter: Option, pub(crate) namespace_alias: Option, @@ -140,13 +137,6 @@ impl StreamReferenceJoinNode { let lookup_fs_schema = FsSchema::from_schema_unkeyed(add_timestamp_field_arrow(dictionary_physical_schema))?; - let properties: HashMap = self - .external_dictionary - .catalog_with_options - .iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - Ok(LookupJoinOperator { input_schema: Some(internal_input_schema.into()), lookup_schema: Some(lookup_fs_schema.clone().into()), @@ -155,11 +145,7 @@ impl StreamReferenceJoinNode { fs_schema: Some(lookup_fs_schema.into()), name: self.external_dictionary.table_identifier.clone(), description: self.external_dictionary.description.clone(), - config: Some( - protocol::function_stream_graph::connector_op::Config::Generic( - GenericConnectorConfig { properties }, - ), - ), + config: Some(self.external_dictionary.connector_config.to_proto_config()), }), key_exprs: self.compile_join_conditions(planner)?, join_type: self.map_api_join_type()?, diff --git a/src/sql/logical_node/sink.rs b/src/sql/logical_node/sink.rs index 2edf8f27..d767afe3 100644 --- a/src/sql/logical_node/sink.rs +++ b/src/sql/logical_node/sink.rs @@ -23,7 +23,8 @@ use crate::sql::common::{FsSchema, FsSchemaRef, UPDATING_META_FIELD}; use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; use crate::sql::logical_node::{CompiledTopologyNode, StreamingOperatorBlueprint}; use crate::sql::logical_planner::planner::{NamedNode, Planner}; -use crate::sql::schema::Table; +use crate::sql::schema::CatalogEntity; +use crate::sql::schema::catalog::ExternalTable; use super::debezium::PackDebeziumEnvelopeNode; use super::remote_table::RemoteTableBoundaryNode; @@ -42,7 +43,7 @@ pub(crate) const STREAM_EGRESS_NODE_NAME: &str = extension_node::STREAM_EGRESS; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub(crate) struct StreamEgressNode { pub(crate) 
target_identifier: TableReference, - pub(crate) destination_table: Table, + pub(crate) destination_table: CatalogEntity, pub(crate) egress_schema: DFSchemaRef, upstream_plans: Arc>, } @@ -52,7 +53,7 @@ multifield_partial_ord!(StreamEgressNode, target_identifier, upstream_plans); impl StreamEgressNode { pub fn try_new( target_identifier: TableReference, - destination_table: Table, + destination_table: CatalogEntity, initial_schema: DFSchemaRef, upstream_plan: LogicalPlan, ) -> Result { @@ -72,43 +73,61 @@ impl StreamEgressNode { fn apply_cdc_transformations( plan: LogicalPlan, schema: DFSchemaRef, - destination: &Table, + destination: &CatalogEntity, ) -> Result<(LogicalPlan, DFSchemaRef)> { let is_upstream_updating = plan .schema() .has_column_with_unqualified_name(UPDATING_META_FIELD); match destination { - Table::ConnectorTable(connector) => { - let is_sink_updating = connector.is_updating(); - - match (is_upstream_updating, is_sink_updating) { - (_, true) => { - let debezium_encoder = PackDebeziumEnvelopeNode::try_new(plan)?; - let wrapped_plan = LogicalPlan::Extension(Extension { - node: Arc::new(debezium_encoder), - }); - let new_schema = wrapped_plan.schema().clone(); - - Ok((wrapped_plan, new_schema)) + CatalogEntity::ExternalConnector(b) => match b.as_ref() { + ExternalTable::Sink(sink) => { + let is_sink_updating = sink.is_updating(); + + match (is_upstream_updating, is_sink_updating) { + (_, true) => { + let debezium_encoder = PackDebeziumEnvelopeNode::try_new(plan)?; + let wrapped_plan = LogicalPlan::Extension(Extension { + node: Arc::new(debezium_encoder), + }); + let new_schema = wrapped_plan.schema().clone(); + + Ok((wrapped_plan, new_schema)) + } + (true, false) => { + plan_err!( + "Topology Mismatch: The upstream is producing an updating stream (CDC), \ + but the target sink '{}' is not configured to accept updates. 
\ + Hint: set `format = 'debezium_json'` in the WITH clause.", + sink.name() + ) + } + (false, false) => Ok((plan, schema)), } - (true, false) => { - plan_err!( - "Topology Mismatch: The upstream is producing an updating stream (CDC), \ - but the target sink '{}' is not configured to accept updates. \ - Hint: set `format = 'debezium_json'` in the WITH clause.", - connector.name() - ) + } + ExternalTable::Source(source) => { + let is_sink_updating = source.is_updating(); + match (is_upstream_updating, is_sink_updating) { + (_, true) => { + let debezium_encoder = PackDebeziumEnvelopeNode::try_new(plan)?; + let wrapped_plan = LogicalPlan::Extension(Extension { + node: Arc::new(debezium_encoder), + }); + let new_schema = wrapped_plan.schema().clone(); + Ok((wrapped_plan, new_schema)) + } + (true, false) => plan_err!( + "Topology Mismatch: upstream produces CDC but target '{}' is a non-updating source table", + source.name() + ), + (false, false) => Ok((plan, schema)), } - (false, false) => Ok((plan, schema)), } - } - Table::LookupTable(..) => { - plan_err!( + ExternalTable::Lookup(_) => plan_err!( "Topology Violation: A Lookup Table cannot be used as a streaming data sink." - ) - } - Table::TableFromQuery { .. } => Ok((plan, schema)), + ), + }, + CatalogEntity::ComputedTable { .. } => Ok((plan, schema)), } } diff --git a/src/sql/mod.rs b/src/sql/mod.rs index 529c7a2d..f4a0eef6 100644 --- a/src/sql/mod.rs +++ b/src/sql/mod.rs @@ -14,6 +14,7 @@ pub mod api; pub mod common; pub mod analysis; +pub mod connector; pub mod functions; pub mod logical_node; pub mod logical_planner; diff --git a/src/sql/schema/catalog.rs b/src/sql/schema/catalog.rs new file mode 100644 index 00000000..479df682 --- /dev/null +++ b/src/sql/schema/catalog.rs @@ -0,0 +1,609 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! External connector catalog: [`ExternalTable`] as [`SourceTable`] | [`SinkTable`] | [`LookupTable`]. + +use std::collections::BTreeMap; +use std::sync::Arc; +use std::time::Duration; + +use datafusion::arrow::datatypes::{DataType, Field, FieldRef, Schema}; +use datafusion::common::{Column, Result, plan_err}; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::Expr; +use protocol::function_stream_graph::ConnectorOp; + +use super::column_descriptor::ColumnDescriptor; +use super::data_encoding_format::DataEncodingFormat; +use super::table::SqlSource; +use super::temporal_pipeline_config::TemporalPipelineConfig; +use crate::multifield_partial_ord; +use crate::sql::common::constants::sql_field; +use crate::sql::common::{Format, FsSchema}; +use crate::sql::connector::config::ConnectorConfig; +use crate::sql::types::ProcessingMode; + +#[derive(Debug, Clone)] +pub struct EngineDescriptor { + pub engine_type: String, + pub raw_payload: String, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum SyncMode { + AppendOnly, + Incremental, +} + +#[derive(Debug, Clone)] +pub struct TableExecutionUnit { + pub label: String, + pub engine_meta: EngineDescriptor, + pub sync_mode: SyncMode, + pub temporal_offset: TemporalPipelineConfig, +} + +/// The only legal shape an external-connector catalog row can take. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum ExternalTable { + Source(SourceTable), + Sink(SinkTable), + Lookup(LookupTable), +} + +impl ExternalTable { + #[inline] + pub fn name(&self) -> &str { + match self { + ExternalTable::Source(t) => t.table_identifier.as_str(), + ExternalTable::Sink(t) => t.table_identifier.as_str(), + ExternalTable::Lookup(t) => t.table_identifier.as_str(), + } + } + + #[inline] + pub fn adapter_type(&self) -> &str { + match self { + ExternalTable::Source(t) => t.adapter_type.as_str(), + ExternalTable::Sink(t) => t.adapter_type.as_str(), + ExternalTable::Lookup(t) => t.adapter_type.as_str(), + } + } + + #[inline] + pub fn description(&self) -> &str { + match self { + ExternalTable::Source(t) => t.description.as_str(), + ExternalTable::Sink(t) => t.description.as_str(), + ExternalTable::Lookup(t) => t.description.as_str(), + } + } + + #[inline] + pub fn schema_specs(&self) -> &[ColumnDescriptor] { + match self { + ExternalTable::Source(t) => &t.schema_specs, + ExternalTable::Sink(t) => &t.schema_specs, + ExternalTable::Lookup(t) => &t.schema_specs, + } + } + + #[inline] + pub fn connector_config(&self) -> &ConnectorConfig { + match self { + ExternalTable::Source(t) => &t.connector_config, + ExternalTable::Sink(t) => &t.connector_config, + ExternalTable::Lookup(t) => &t.connector_config, + } + } + + #[inline] + pub fn key_constraints(&self) -> &[String] { + match self { + ExternalTable::Source(t) => &t.key_constraints, + ExternalTable::Sink(t) => &t.key_constraints, + ExternalTable::Lookup(t) => &t.key_constraints, + } + } + + #[inline] + pub fn connection_format(&self) -> Option<&Format> { + match self { + ExternalTable::Source(t) => t.connection_format.as_ref(), + ExternalTable::Sink(t) => t.connection_format.as_ref(), + ExternalTable::Lookup(t) => t.connection_format.as_ref(), + } + } + + #[inline] + pub fn catalog_with_options(&self) -> &BTreeMap { + match self { + ExternalTable::Source(t) => &t.catalog_with_options, + 
ExternalTable::Sink(t) => &t.catalog_with_options, + ExternalTable::Lookup(t) => &t.catalog_with_options, + } + } + + pub fn produce_physical_schema(&self) -> Schema { + Schema::new( + self.schema_specs() + .iter() + .filter(|c| !c.is_computed()) + .map(|c| c.arrow_field().clone()) + .collect::>(), + ) + } + + pub fn connector_op(&self) -> ConnectorOp { + let physical = self.produce_physical_schema(); + let fields: Vec = physical + .fields() + .iter() + .map(|f| f.as_ref().clone()) + .collect(); + let fs_schema = FsSchema::from_fields(fields); + + ConnectorOp { + connector: self.adapter_type().to_string(), + fs_schema: Some(fs_schema.into()), + name: self.name().to_string(), + description: self.description().to_string(), + config: Some(self.connector_config().to_proto_config()), + } + } + + #[inline] + pub fn is_updating(&self) -> bool { + match self { + ExternalTable::Source(t) => t.is_updating(), + ExternalTable::Sink(t) => t + .connection_format + .as_ref() + .is_some_and(|f| f.is_updating()), + ExternalTable::Lookup(_) => false, + } + } + + /// Variant-agnostic view of "persisted Arrow fields post-planning". + /// Only Source / Lookup track inferred schema — Sinks derive theirs from the upstream plan. + pub fn effective_fields(&self) -> Vec { + match self { + ExternalTable::Source(t) => t.effective_fields(), + ExternalTable::Sink(t) => t.effective_fields(), + ExternalTable::Lookup(t) => t.effective_fields(), + } + } + + #[inline] + pub fn as_source(&self) -> Option<&SourceTable> { + match self { + ExternalTable::Source(t) => Some(t), + _ => None, + } + } + + #[inline] + pub fn as_sink(&self) -> Option<&SinkTable> { + match self { + ExternalTable::Sink(t) => Some(t), + _ => None, + } + } + + #[inline] + pub fn as_lookup(&self) -> Option<&LookupTable> { + match self { + ExternalTable::Lookup(t) => Some(t), + _ => None, + } + } +} + +/// Ingress external connector (`CREATE TABLE ... WITH (type='source', ...)`). 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SourceTable { + pub table_identifier: String, + pub adapter_type: String, + pub schema_specs: Vec, + pub connector_config: ConnectorConfig, + pub temporal_config: TemporalPipelineConfig, + pub key_constraints: Vec, + pub payload_format: Option, + pub connection_format: Option, + pub description: String, + pub catalog_with_options: BTreeMap, + + // Planner / catalog; not in SQL text. + pub registry_id: Option, + pub inferred_fields: Option>, +} + +multifield_partial_ord!( + SourceTable, + registry_id, + adapter_type, + table_identifier, + description, + key_constraints, + connection_format, + catalog_with_options +); + +impl SourceTable { + #[inline] + pub fn name(&self) -> &str { + self.table_identifier.as_str() + } + + #[inline] + pub fn connector(&self) -> &str { + self.adapter_type.as_str() + } + + pub fn event_time_field(&self) -> Option<&str> { + self.temporal_config.event_column.as_deref() + } + + pub fn watermark_field(&self) -> Option<&str> { + self.temporal_config.watermark_strategy_column.as_deref() + } + + /// Watermark column safe to persist to the stream catalog. Omits the + /// generated `__watermark` column — that is only resolvable at compile + /// time, the catalog round-trip cannot reconstruct it. 
+ pub fn stream_catalog_watermark_field(&self) -> Option { + self.temporal_config + .watermark_strategy_column + .as_deref() + .filter(|w| *w != sql_field::COMPUTED_WATERMARK) + .map(str::to_string) + } + + #[inline] + pub fn catalog_with_options(&self) -> &BTreeMap { + &self.catalog_with_options + } + + pub fn idle_time(&self) -> Option { + self.temporal_config.liveness_timeout + } + + pub fn produce_physical_schema(&self) -> Schema { + Schema::new( + self.schema_specs + .iter() + .filter(|c| !c.is_computed()) + .map(|c| c.arrow_field().clone()) + .collect::>(), + ) + } + + #[inline] + pub fn physical_schema(&self) -> Schema { + self.produce_physical_schema() + } + + pub fn effective_fields(&self) -> Vec { + self.inferred_fields.clone().unwrap_or_else(|| { + self.schema_specs + .iter() + .map(|c| Arc::new(c.arrow_field().clone())) + .collect() + }) + } + + pub fn convert_to_execution_unit(&self) -> Result { + if self.is_cdc_enabled() && self.schema_specs.iter().any(|c| c.is_computed()) { + return plan_err!("CDC cannot be mixed with computed columns natively"); + } + + let mode = if self.is_cdc_enabled() { + SyncMode::Incremental + } else { + SyncMode::AppendOnly + }; + + Ok(TableExecutionUnit { + label: self.table_identifier.clone(), + engine_meta: EngineDescriptor { + engine_type: self.adapter_type.clone(), + raw_payload: String::new(), + }, + sync_mode: mode, + temporal_offset: self.temporal_config.clone(), + }) + } + + #[inline] + pub fn to_execution_unit(&self) -> Result { + self.convert_to_execution_unit() + } + + fn is_cdc_enabled(&self) -> bool { + self.payload_format + .as_ref() + .is_some_and(|f| f.supports_delta_updates()) + } + + pub fn has_virtual_fields(&self) -> bool { + self.schema_specs.iter().any(|c| c.is_computed()) + } + + pub fn is_updating(&self) -> bool { + self.connection_format + .as_ref() + .is_some_and(|f| f.is_updating()) + || self.payload_format == Some(DataEncodingFormat::DebeziumJson) + } + + pub fn connector_op(&self) -> ConnectorOp 
{ + let physical = self.produce_physical_schema(); + let fields: Vec = physical + .fields() + .iter() + .map(|f| f.as_ref().clone()) + .collect(); + let fs_schema = FsSchema::from_fields(fields); + + ConnectorOp { + connector: self.adapter_type.clone(), + fs_schema: Some(fs_schema.into()), + name: self.table_identifier.clone(), + description: self.description.clone(), + config: Some(self.connector_config.to_proto_config()), + } + } + + pub fn processing_mode(&self) -> ProcessingMode { + if self.is_updating() { + ProcessingMode::Update + } else { + ProcessingMode::Append + } + } + + pub fn timestamp_override(&self) -> Result> { + if let Some(field_name) = self.temporal_config.event_column.clone() { + if self.is_updating() { + return plan_err!("can't use event_time_field with update mode"); + } + let _field = self.get_time_column(&field_name)?; + Ok(Some(Expr::Column(Column::from_name(field_name.as_str())))) + } else { + Ok(None) + } + } + + fn get_time_column(&self, field_name: &str) -> Result<&ColumnDescriptor> { + self.schema_specs + .iter() + .find(|c| { + c.arrow_field().name() == field_name + && matches!(c.arrow_field().data_type(), DataType::Timestamp(..)) + }) + .ok_or_else(|| { + DataFusionError::Plan(format!("field {field_name} not found or not a timestamp")) + }) + } + + pub fn watermark_column(&self) -> Result> { + if let Some(field_name) = self.temporal_config.watermark_strategy_column.clone() { + let _field = self.get_time_column(&field_name)?; + Ok(Some(Expr::Column(Column::from_name(field_name.as_str())))) + } else { + Ok(None) + } + } + + pub fn as_sql_source(&self) -> Result { + if self.is_updating() && self.has_virtual_fields() { + return plan_err!("can't read from a source with virtual fields and update mode."); + } + + let timestamp_override = self.timestamp_override()?; + let watermark_column = self.watermark_column()?; + + let source = SqlSource { + id: self.registry_id, + struct_def: self + .schema_specs + .iter() + .filter(|c| 
!c.is_computed()) + .map(|c| Arc::new(c.arrow_field().clone())) + .collect(), + config: self.connector_op(), + processing_mode: self.processing_mode(), + idle_time: self.temporal_config.liveness_timeout, + }; + + Ok(SourceOperator { + name: self.table_identifier.clone(), + source, + timestamp_override, + watermark_column, + }) + } +} + +/// Egress external connector, or the sink of `CREATE STREAMING TABLE ... AS SELECT`. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SinkTable { + pub table_identifier: String, + pub adapter_type: String, + pub schema_specs: Vec, + pub connector_config: ConnectorConfig, + pub partition_exprs: Arc>>, + pub key_constraints: Vec, + pub connection_format: Option, + pub description: String, + pub catalog_with_options: BTreeMap, +} + +multifield_partial_ord!( + SinkTable, + adapter_type, + table_identifier, + description, + key_constraints, + connection_format, + catalog_with_options +); + +impl SinkTable { + #[inline] + pub fn name(&self) -> &str { + self.table_identifier.as_str() + } + + #[inline] + pub fn connector(&self) -> &str { + self.adapter_type.as_str() + } + + #[inline] + pub fn catalog_with_options(&self) -> &BTreeMap { + &self.catalog_with_options + } + + pub fn produce_physical_schema(&self) -> Schema { + Schema::new( + self.schema_specs + .iter() + .filter(|c| !c.is_computed()) + .map(|c| c.arrow_field().clone()) + .collect::>(), + ) + } + + pub fn effective_fields(&self) -> Vec { + self.schema_specs + .iter() + .map(|c| Arc::new(c.arrow_field().clone())) + .collect() + } + + pub fn is_updating(&self) -> bool { + self.connection_format + .as_ref() + .is_some_and(|f| f.is_updating()) + } + + pub fn connector_op(&self) -> ConnectorOp { + let physical = self.produce_physical_schema(); + let fields: Vec = physical + .fields() + .iter() + .map(|f| f.as_ref().clone()) + .collect(); + let fs_schema = FsSchema::from_fields(fields); + + ConnectorOp { + connector: self.adapter_type.clone(), + fs_schema: 
Some(fs_schema.into()), + name: self.table_identifier.clone(), + description: self.description.clone(), + config: Some(self.connector_config.to_proto_config()), + } + } +} + +/// Lookup-join only; not a scan source. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct LookupTable { + pub table_identifier: String, + pub adapter_type: String, + pub schema_specs: Vec, + pub connector_config: ConnectorConfig, + pub key_constraints: Vec, + pub lookup_cache_max_bytes: Option, + pub lookup_cache_ttl: Option, + pub connection_format: Option, + pub description: String, + pub catalog_with_options: BTreeMap, + + pub registry_id: Option, + pub inferred_fields: Option>, +} + +multifield_partial_ord!( + LookupTable, + registry_id, + adapter_type, + table_identifier, + description, + key_constraints, + connection_format, + catalog_with_options +); + +impl LookupTable { + #[inline] + pub fn name(&self) -> &str { + self.table_identifier.as_str() + } + + #[inline] + pub fn connector(&self) -> &str { + self.adapter_type.as_str() + } + + #[inline] + pub fn catalog_with_options(&self) -> &BTreeMap { + &self.catalog_with_options + } + + pub fn produce_physical_schema(&self) -> Schema { + Schema::new( + self.schema_specs + .iter() + .filter(|c| !c.is_computed()) + .map(|c| c.arrow_field().clone()) + .collect::>(), + ) + } + + pub fn effective_fields(&self) -> Vec { + self.inferred_fields.clone().unwrap_or_else(|| { + self.schema_specs + .iter() + .map(|c| Arc::new(c.arrow_field().clone())) + .collect() + }) + } + + pub fn connector_op(&self) -> ConnectorOp { + let physical = self.produce_physical_schema(); + let fields: Vec = physical + .fields() + .iter() + .map(|f| f.as_ref().clone()) + .collect(); + let fs_schema = FsSchema::from_fields(fields); + + ConnectorOp { + connector: self.adapter_type.clone(), + fs_schema: Some(fs_schema.into()), + name: self.table_identifier.clone(), + description: self.description.clone(), + config: Some(self.connector_config.to_proto_config()), + } + 
} +} + +/// [`SourceTable`] as an ingestion logical node input. +#[derive(Debug, Clone)] +pub struct SourceOperator { + pub name: String, + pub source: SqlSource, + pub timestamp_override: Option, + pub watermark_column: Option, +} diff --git a/src/sql/schema/catalog_ddl.rs b/src/sql/schema/catalog_ddl.rs deleted file mode 100644 index 45936912..00000000 --- a/src/sql/schema/catalog_ddl.rs +++ /dev/null @@ -1,251 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Best-effort SQL text for catalog introspection (`SHOW CREATE TABLE`). 
- -use std::collections::BTreeMap; - -use datafusion::arrow::datatypes::{DataType, TimeUnit}; - -use super::schema_provider::StreamTable; -use super::table::Table as CatalogTable; -use crate::sql::logical_node::logical::LogicalProgram; - -fn data_type_sql(dt: &DataType) -> String { - match dt { - DataType::Null => "NULL".to_string(), - DataType::Boolean => "BOOLEAN".to_string(), - DataType::Int8 => "TINYINT".to_string(), - DataType::Int16 => "SMALLINT".to_string(), - DataType::Int32 => "INT".to_string(), - DataType::Int64 => "BIGINT".to_string(), - DataType::UInt8 => "TINYINT UNSIGNED".to_string(), - DataType::UInt16 => "SMALLINT UNSIGNED".to_string(), - DataType::UInt32 => "INT UNSIGNED".to_string(), - DataType::UInt64 => "BIGINT UNSIGNED".to_string(), - DataType::Float16 => "FLOAT".to_string(), - DataType::Float32 => "REAL".to_string(), - DataType::Float64 => "DOUBLE".to_string(), - DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => "VARCHAR".to_string(), - DataType::Binary | DataType::LargeBinary => "VARBINARY".to_string(), - DataType::Date32 => "DATE".to_string(), - DataType::Date64 => "DATE".to_string(), - DataType::Timestamp(unit, tz) => match (unit, tz) { - (TimeUnit::Second, None) => "TIMESTAMP(0)".to_string(), - (TimeUnit::Millisecond, None) => "TIMESTAMP(3)".to_string(), - (TimeUnit::Microsecond, None) => "TIMESTAMP(6)".to_string(), - (TimeUnit::Nanosecond, None) => "TIMESTAMP(9)".to_string(), - (_, Some(_)) => "TIMESTAMP WITH TIME ZONE".to_string(), - }, - DataType::Decimal128(p, s) => format!("DECIMAL({p},{s})"), - DataType::Decimal256(p, s) => format!("DECIMAL({p},{s})"), - _ => dt.to_string(), - } -} - -fn format_columns(schema: &datafusion::arrow::datatypes::Schema) -> Vec { - schema - .fields() - .iter() - .map(|f| { - let null = if f.is_nullable() { "" } else { " NOT NULL" }; - format!(" {} {}{}", f.name(), data_type_sql(f.data_type()), null) - }) - .collect() -} - -fn format_with_clause(opts: &BTreeMap) -> String { - if opts.is_empty() { 
- return "WITH ('connector' = '...', 'format' = '...');\n/* Original WITH options are not persisted in the stream catalog. */\n" - .to_string(); - } - let pairs: Vec = opts - .iter() - .map(|(k, v)| { - let k_esc = k.replace('\'', "''"); - let v_esc = v.replace('\'', "''"); - format!(" '{k_esc}' = '{v_esc}'") - }) - .collect(); - format!("WITH (\n{}\n);\n", pairs.join(",\n")) -} - -/// Single-line `col:TYPE` list for result grids. -pub fn schema_columns_one_line(schema: &datafusion::arrow::datatypes::Schema) -> String { - schema - .fields() - .iter() - .map(|f| format!("{}:{}", f.name(), data_type_sql(f.data_type()))) - .collect::>() - .join(", ") -} - -fn pipeline_summary_short(program: &LogicalProgram) -> String { - let mut parts: Vec = Vec::new(); - parts.push(format!("tasks={}", program.task_count())); - parts.push(format!("hash={}", program.get_hash())); - for nw in program.graph.node_weights() { - let chain = nw - .operator_chain - .operators - .iter() - .map(|o| format!("{}", o.operator_name)) - .collect::>() - .join("->"); - parts.push(format!("n{}:{}", nw.node_id, chain)); - } - parts.join(" | ") -} - -/// Extra fields for `SHOW TABLES` result grid (pipeline summary; no full Graphviz). -pub fn stream_table_row_detail(table: &StreamTable) -> String { - match table { - StreamTable::Source { - connector, - event_time_field, - watermark_field, - with_options, - .. - } => { - format!( - "connector={}, event_time={:?}, watermark={:?}, with_options={}", - connector, - event_time_field, - watermark_field, - with_options.len() - ) - } - StreamTable::Sink { program, .. 
} => pipeline_summary_short(program), - } -} - -fn pipeline_text(program: &LogicalProgram) -> String { - let mut lines: Vec = Vec::new(); - lines.push(format!("tasks_total: {}", program.task_count())); - lines.push(format!("program_hash: {}", program.get_hash())); - for nw in program.graph.node_weights() { - let chain = nw - .operator_chain - .operators - .iter() - .map(|o| format!("{}[{}]", o.operator_name, o.operator_id)) - .collect::>() - .join(" -> "); - lines.push(format!( - "node {} (parallelism={}): {chain}", - nw.node_id, nw.parallelism - )); - } - let dot = program.dot(); - const MAX_DOT: usize = 12_000; - if dot.len() > MAX_DOT { - lines.push(format!( - "graphviz_dot_truncated:\n{}... [{} more bytes]", - &dot[..MAX_DOT], - dot.len() - MAX_DOT - )); - } else { - lines.push(format!("graphviz_dot:\n{dot}")); - } - lines.join("\n") -} - -/// Human-readable `SHOW CREATE TABLE` text (sink `AS SELECT` is not stored). -pub fn show_create_stream_table(table: &StreamTable) -> String { - match table { - StreamTable::Source { - name, - connector, - schema, - event_time_field, - watermark_field, - with_options, - } => { - let cols = format_columns(schema); - let mut ddl = format!("CREATE TABLE {name} (\n{}\n)", cols.join(",\n")); - if let Some(e) = event_time_field { - ddl.push_str(&format!("\n/* EVENT TIME COLUMN: {e} */\n")); - } - if let Some(w) = watermark_field { - ddl.push_str(&format!("/* WATERMARK: {w} */\n")); - } - let mut merged_opts = with_options.clone(); - merged_opts - .entry("connector".to_string()) - .or_insert_with(|| connector.clone()); - ddl.push_str(&format_with_clause(&merged_opts)); - ddl - } - StreamTable::Sink { name, program } => { - let schema = program.egress_arrow_schema().unwrap_or_else(|| { - std::sync::Arc::new(datafusion::arrow::datatypes::Schema::empty()) - }); - let cols = format_columns(&schema); - let mut ddl = format!( - "CREATE STREAMING TABLE {name}\nWITH ('connector' = '...') AS SELECT ...\n/* Sink WITH / AS SELECT text is not 
stored. Output schema:\n{}\n*/\n\n", - cols.join(",\n") - ); - ddl.push_str("-- Resolved logical pipeline:\n"); - ddl.push_str(&pipeline_text(program)); - ddl.push('\n'); - ddl - } - } -} - -/// Extra fields for `SHOW TABLES` result grid for persisted catalog rows. -pub fn catalog_table_row_detail(table: &CatalogTable) -> String { - match table { - CatalogTable::ConnectorTable(source) => format!( - "kind=connector, connector={}, event_time={:?}, watermark={:?}, with_options={}", - source.connector(), - source.event_time_field(), - source.temporal_config.watermark_strategy_column, - source.catalog_with_options().len() - ), - CatalogTable::LookupTable(source) => format!( - "kind=lookup, connector={}, event_time={:?}, watermark={:?}, with_options={}", - source.connector(), - source.event_time_field(), - source.temporal_config.watermark_strategy_column, - source.catalog_with_options().len() - ), - CatalogTable::TableFromQuery { .. } => "kind=query".to_string(), - } -} - -/// Human-readable `SHOW CREATE TABLE` text for persisted catalog rows. -pub fn show_create_catalog_table(table: &CatalogTable) -> String { - match table { - CatalogTable::ConnectorTable(source) | CatalogTable::LookupTable(source) => { - let schema = source.produce_physical_schema(); - let cols = format_columns(&schema); - let mut ddl = format!("CREATE TABLE {} (\n{}\n)", source.name(), cols.join(",\n")); - if let Some(e) = source.event_time_field() { - ddl.push_str(&format!("\n/* EVENT TIME COLUMN: {e} */\n")); - } - if let Some(w) = source.temporal_config.watermark_strategy_column.as_deref() { - ddl.push_str(&format!("/* WATERMARK: {w} */\n")); - } - let mut opts = source.catalog_with_options().clone(); - opts.entry("connector".to_string()) - .or_insert_with(|| source.connector().to_string()); - ddl.push_str(&format_with_clause(&opts)); - ddl - } - CatalogTable::TableFromQuery { name, .. 
} => { - format!( - "CREATE TABLE {name} AS SELECT ...;\n/* logical query text is not persisted */\n" - ) - } - } -} diff --git a/src/sql/schema/connector_config.rs b/src/sql/schema/connector_config.rs deleted file mode 100644 index edb44bae..00000000 --- a/src/sql/schema/connector_config.rs +++ /dev/null @@ -1,72 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashMap; - -use protocol::function_stream_graph::{ - GenericConnectorConfig, KafkaSinkConfig, KafkaSourceConfig, connector_op, -}; - -#[derive(Debug, Clone)] -pub enum ConnectorConfig { - KafkaSource(KafkaSourceConfig), - KafkaSink(KafkaSinkConfig), - Generic(HashMap), -} - -impl ConnectorConfig { - pub fn to_proto_config(&self) -> connector_op::Config { - match self { - ConnectorConfig::KafkaSource(cfg) => connector_op::Config::KafkaSource(cfg.clone()), - ConnectorConfig::KafkaSink(cfg) => connector_op::Config::KafkaSink(cfg.clone()), - ConnectorConfig::Generic(props) => { - connector_op::Config::Generic(GenericConnectorConfig { - properties: props.clone(), - }) - } - } - } -} - -impl PartialEq for ConnectorConfig { - fn eq(&self, other: &Self) -> bool { - use prost::Message; - match (self, other) { - (ConnectorConfig::KafkaSource(a), ConnectorConfig::KafkaSource(b)) => { - a.encode_to_vec() == b.encode_to_vec() - } - (ConnectorConfig::KafkaSink(a), ConnectorConfig::KafkaSink(b)) => { - a.encode_to_vec() == b.encode_to_vec() - } - (ConnectorConfig::Generic(a), 
ConnectorConfig::Generic(b)) => a == b, - _ => false, - } - } -} - -impl Eq for ConnectorConfig {} - -impl std::hash::Hash for ConnectorConfig { - fn hash(&self, state: &mut H) { - use prost::Message; - std::mem::discriminant(self).hash(state); - match self { - ConnectorConfig::KafkaSource(cfg) => cfg.encode_to_vec().hash(state), - ConnectorConfig::KafkaSink(cfg) => cfg.encode_to_vec().hash(state), - ConnectorConfig::Generic(m) => { - let mut pairs: Vec<_> = m.iter().collect(); - pairs.sort_by_key(|(k, _)| (*k).clone()); - pairs.hash(state); - } - } - } -} diff --git a/src/sql/schema/data_encoding_format.rs b/src/sql/schema/data_encoding_format.rs index 1cd5c736..0b6f5e1d 100644 --- a/src/sql/schema/data_encoding_format.rs +++ b/src/sql/schema/data_encoding_format.rs @@ -10,78 +10,79 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashMap; - use datafusion::arrow::datatypes::{DataType, Field}; use datafusion::common::{Result, plan_err}; use super::column_descriptor::ColumnDescriptor; use crate::sql::common::Format; -use crate::sql::common::constants::{cdc, connection_format_value, with_opt_bool_str}; -use crate::sql::common::with_option_keys as opt; +use crate::sql::common::constants::cdc; -/// High-level payload encoding (orthogonal to `Format` wire details in `ConnectionSchema`). 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] pub enum DataEncodingFormat { + #[default] + Raw, StandardJson, DebeziumJson, Avro, Parquet, - Raw, + Csv, + JsonL, + Orc, + Protobuf, } impl DataEncodingFormat { - pub fn extract_from_map(opts: &HashMap) -> Result { - let format_str = opts - .get(opt::FORMAT) - .map(|s| s.as_str()) - .unwrap_or(opt::DEFAULT_FORMAT_VALUE); - let is_debezium = opts - .get(opt::FORMAT_DEBEZIUM_FLAG) - .or_else(|| opts.get(opt::JSON_DEBEZIUM)) - .map(|s| s == with_opt_bool_str::TRUE) - .unwrap_or(false); - - match (format_str, is_debezium) { - (f, true) if f == connection_format_value::JSON => Ok(Self::DebeziumJson), - (f, _) if f == connection_format_value::DEBEZIUM_JSON => Ok(Self::DebeziumJson), - (f, false) if f == connection_format_value::JSON => Ok(Self::StandardJson), - (f, _) if f == connection_format_value::AVRO => Ok(Self::Avro), - (f, _) if f == connection_format_value::PARQUET => Ok(Self::Parquet), - _ => Ok(Self::Raw), + pub fn from_format(format: Option<&Format>) -> Self { + match format { + Some(Format::Json(j)) if j.debezium => Self::DebeziumJson, + Some(Format::Json(_)) => Self::StandardJson, + Some(Format::Avro(_)) => Self::Avro, + Some(Format::Parquet(_)) => Self::Parquet, + Some(Format::Csv(_)) => Self::Csv, + Some(Format::Protobuf(_)) => Self::Protobuf, + Some(Format::RawString(_)) | Some(Format::RawBytes(_)) | None => Self::Raw, + Some(_) => Self::Raw, } } - pub fn from_connection_format(format: &Format) -> Self { - match format { - Format::Json(j) if j.debezium => Self::DebeziumJson, - Format::Json(_) => Self::StandardJson, - Format::Avro(_) => Self::Avro, - Format::Parquet(_) => Self::Parquet, - Format::Protobuf(_) | Format::RawString(_) | Format::RawBytes(_) => Self::Raw, - } + pub fn is_cdc_format(&self) -> bool { + matches!(self, Self::DebeziumJson) } + #[inline] pub fn supports_delta_updates(&self) -> bool { - matches!(self, Self::DebeziumJson) + 
self.is_cdc_format() } - pub fn apply_envelope(self, columns: Vec) -> Result> { - if !self.supports_delta_updates() { - return Ok(columns); + pub fn apply_envelope( + &self, + logical_columns: Vec, + ) -> Result> { + if !self.is_cdc_format() { + return Ok(logical_columns); } - if columns.iter().any(|c| c.is_computed()) { - return plan_err!("Virtual fields are not supported with CDC envelope"); + + if logical_columns.is_empty() { + return Ok(logical_columns); } - if columns.is_empty() { - return Ok(columns); + + if logical_columns.iter().any(|c| c.is_computed()) { + return plan_err!( + "Computed/Virtual columns are not supported directly inside a CDC source table; \ + define computed columns in a downstream VIEW or AS SELECT streaming query" + ); } - let fields: Vec = columns.into_iter().map(|c| c.into_arrow_field()).collect(); - let struct_type = DataType::Struct(fields.into()); + + let inner_fields: Vec = logical_columns + .into_iter() + .map(|c| c.into_arrow_field()) + .collect(); + + let row_struct_type = DataType::Struct(inner_fields.into()); Ok(vec![ - ColumnDescriptor::new_physical(Field::new(cdc::BEFORE, struct_type.clone(), true)), - ColumnDescriptor::new_physical(Field::new(cdc::AFTER, struct_type.clone(), true)), + ColumnDescriptor::new_physical(Field::new(cdc::BEFORE, row_struct_type.clone(), true)), + ColumnDescriptor::new_physical(Field::new(cdc::AFTER, row_struct_type, true)), ColumnDescriptor::new_physical(Field::new(cdc::OP, DataType::Utf8, true)), ]) } diff --git a/src/sql/schema/introspection/ddl_formatter.rs b/src/sql/schema/introspection/ddl_formatter.rs new file mode 100644 index 00000000..f4ce36e6 --- /dev/null +++ b/src/sql/schema/introspection/ddl_formatter.rs @@ -0,0 +1,156 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::BTreeMap; +use std::fmt::{self, Write}; + +use datafusion::arrow::datatypes::{DataType, Schema, TimeUnit}; + +use crate::sql::common::constants::sql_field; + +pub struct DdlBuilder<'a> { + table_name: &'a str, + schema: &'a Schema, + watermark_column: Option<&'a str>, + primary_keys: &'a [String], + options: BTreeMap, +} + +impl<'a> DdlBuilder<'a> { + pub fn new(table_name: &'a str, schema: &'a Schema) -> Self { + Self { + table_name, + schema, + watermark_column: None, + primary_keys: &[], + options: BTreeMap::new(), + } + } + + pub fn with_watermark(mut self, watermark: Option<&'a str>) -> Self { + self.watermark_column = watermark; + self + } + + pub fn with_primary_keys(mut self, keys: &'a [String]) -> Self { + self.primary_keys = keys; + self + } + + pub fn with_options( + mut self, + opts: &BTreeMap, + role: &str, + connector: &str, + ) -> Self { + self.options = opts.clone(); + self.options + .entry("type".to_string()) + .or_insert_with(|| role.to_string()); + self.options + .entry("connector".to_string()) + .or_insert_with(|| connector.to_string()); + self + } +} + +impl<'a> fmt::Display for DdlBuilder<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "CREATE TABLE {} (", self.table_name)?; + + let mut rows: Vec = Vec::new(); + for field in self.schema.fields() { + let null_constraint = if field.is_nullable() { "" } else { " NOT NULL" }; + rows.push(format!( + " {} {}{}", + field.name(), + format_data_type(field.data_type()), + null_constraint + )); + } + + if let Some(wm) = 
self.watermark_column + && wm != sql_field::COMPUTED_WATERMARK + { + rows.push(format!(" WATERMARK FOR {wm}")); + } + + if !self.primary_keys.is_empty() { + rows.push(format!(" PRIMARY KEY ({})", self.primary_keys.join(", "))); + } + + writeln!(f, "{}", rows.join(",\n"))?; + write!(f, ")")?; + + if !self.options.is_empty() { + writeln!(f)?; + writeln!(f, "WITH (")?; + let mut opt_lines: Vec = Vec::with_capacity(self.options.len()); + for (k, v) in &self.options { + let k_esc = k.replace('\'', "''"); + let v_esc = v.replace('\'', "''"); + opt_lines.push(format!(" '{k_esc}' = '{v_esc}'")); + } + write!(f, "{}\n);", opt_lines.join(",\n"))?; + } else { + write!(f, ";")?; + } + + Ok(()) + } +} + +pub fn format_data_type(dt: &DataType) -> String { + match dt { + DataType::Null => "NULL".to_string(), + DataType::Boolean => "BOOLEAN".to_string(), + DataType::Int8 => "TINYINT".to_string(), + DataType::Int16 => "SMALLINT".to_string(), + DataType::Int32 => "INT".to_string(), + DataType::Int64 => "BIGINT".to_string(), + DataType::UInt8 => "TINYINT UNSIGNED".to_string(), + DataType::UInt16 => "SMALLINT UNSIGNED".to_string(), + DataType::UInt32 => "INT UNSIGNED".to_string(), + DataType::UInt64 => "BIGINT UNSIGNED".to_string(), + DataType::Float16 => "FLOAT".to_string(), + DataType::Float32 => "REAL".to_string(), + DataType::Float64 => "DOUBLE".to_string(), + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => "VARCHAR".to_string(), + DataType::Binary | DataType::LargeBinary => "VARBINARY".to_string(), + DataType::Date32 | DataType::Date64 => "DATE".to_string(), + DataType::Timestamp(unit, tz) => match (unit, tz) { + (TimeUnit::Second, None) => "TIMESTAMP(0)".to_string(), + (TimeUnit::Millisecond, None) => "TIMESTAMP(3)".to_string(), + (TimeUnit::Microsecond, None) => "TIMESTAMP(6)".to_string(), + (TimeUnit::Nanosecond, None) => "TIMESTAMP(9)".to_string(), + (_, Some(_)) => "TIMESTAMP WITH TIME ZONE".to_string(), + }, + DataType::Decimal128(p, s) | 
DataType::Decimal256(p, s) => format!("DECIMAL({p}, {s})"), + _ => dt.to_string(), + } +} + +pub fn schema_columns_one_line(schema: &Schema) -> String { + let mut buf = String::new(); + for (idx, field) in schema.fields().iter().enumerate() { + if idx > 0 { + buf.push_str(", "); + } + let _ = write!( + buf, + "{}:{}", + field.name(), + format_data_type(field.data_type()) + ); + } + buf +} diff --git a/src/sql/schema/introspection/mod.rs b/src/sql/schema/introspection/mod.rs new file mode 100644 index 00000000..1ba9c816 --- /dev/null +++ b/src/sql/schema/introspection/mod.rs @@ -0,0 +1,21 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod ddl_formatter; +pub mod show_formatter; +pub mod stream_formatter; + +#[allow(unused_imports)] +pub use ddl_formatter::{DdlBuilder, format_data_type, schema_columns_one_line}; +pub use show_formatter::{catalog_table_row_detail, show_create_catalog_table}; +#[allow(unused_imports)] +pub use stream_formatter::{show_create_stream_table, stream_table_row_detail}; diff --git a/src/sql/schema/introspection/show_formatter.rs b/src/sql/schema/introspection/show_formatter.rs new file mode 100644 index 00000000..28a81ae9 --- /dev/null +++ b/src/sql/schema/introspection/show_formatter.rs @@ -0,0 +1,100 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::common::constants::connection_table_role; +use crate::sql::schema::catalog::ExternalTable; +use crate::sql::schema::table::CatalogEntity; + +use super::ddl_formatter::DdlBuilder; + +impl ExternalTable { + pub fn to_ddl_string(&self) -> String { + match self { + ExternalTable::Source(source) => { + let schema = source.produce_physical_schema(); + DdlBuilder::new(&source.table_identifier, &schema) + .with_watermark(source.temporal_config.watermark_strategy_column.as_deref()) + .with_primary_keys(&source.key_constraints) + .with_options( + &source.catalog_with_options, + connection_table_role::SOURCE, + &source.adapter_type, + ) + .to_string() + } + ExternalTable::Sink(sink) => { + let schema = sink.produce_physical_schema(); + DdlBuilder::new(&sink.table_identifier, &schema) + .with_primary_keys(&sink.key_constraints) + .with_options( + &sink.catalog_with_options, + connection_table_role::SINK, + &sink.adapter_type, + ) + .to_string() + } + ExternalTable::Lookup(lookup) => { + let schema = lookup.produce_physical_schema(); + DdlBuilder::new(&lookup.table_identifier, &schema) + .with_primary_keys(&lookup.key_constraints) + .with_options( + &lookup.catalog_with_options, + connection_table_role::LOOKUP, + &lookup.adapter_type, + ) + .to_string() + } + } + } + + pub fn to_row_detail(&self) -> String { + match self { + ExternalTable::Source(s) => format!( + "{{ kind: 'source', connector: '{}', watermark: '{}', options_count: {} }}", + s.adapter_type, + s.temporal_config + .watermark_strategy_column + .as_deref() + .unwrap_or("none"), + 
s.catalog_with_options.len() + ), + ExternalTable::Sink(s) => format!( + "{{ kind: 'sink', connector: '{}', partitioned: {}, options_count: {} }}", + s.adapter_type, + s.partition_exprs.as_ref().is_some(), + s.catalog_with_options.len() + ), + ExternalTable::Lookup(s) => format!( + "{{ kind: 'lookup', connector: '{}', cache_ttl_secs: {}, options_count: {} }}", + s.adapter_type, + s.lookup_cache_ttl.map(|d| d.as_secs()).unwrap_or(0), + s.catalog_with_options.len() + ), + } + } +} + +pub fn show_create_catalog_table(table: &CatalogEntity) -> String { + match table { + CatalogEntity::ExternalConnector(ext) => ext.to_ddl_string(), + CatalogEntity::ComputedTable { name, .. } => { + format!("-- Logical query view\nCREATE VIEW {name} AS SELECT ...;") + } + } +} + +pub fn catalog_table_row_detail(table: &CatalogEntity) -> String { + match table { + CatalogEntity::ExternalConnector(ext) => ext.to_row_detail(), + CatalogEntity::ComputedTable { .. } => "{ kind: 'logical_view' }".to_string(), + } +} diff --git a/src/sql/schema/introspection/stream_formatter.rs b/src/sql/schema/introspection/stream_formatter.rs new file mode 100644 index 00000000..ebb02330 --- /dev/null +++ b/src/sql/schema/introspection/stream_formatter.rs @@ -0,0 +1,120 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use datafusion::arrow::datatypes::Schema; + +use crate::sql::common::constants::connection_table_role; +use crate::sql::logical_node::logical::LogicalProgram; +use crate::sql::schema::schema_provider::StreamTable; + +use super::ddl_formatter::DdlBuilder; + +impl StreamTable { + pub fn to_ddl_string(&self) -> String { + match self { + StreamTable::Source { + name, + connector, + schema, + event_time_field: _, + watermark_field, + with_options, + } => DdlBuilder::new(name, schema) + .with_watermark(watermark_field.as_deref()) + .with_options(with_options, connection_table_role::SOURCE, connector) + .to_string(), + StreamTable::Sink { name, program } => { + let schema: Arc = program + .egress_arrow_schema() + .unwrap_or_else(|| Arc::new(Schema::empty())); + + let mut ddl = format!("CREATE STREAMING TABLE {name} AS SELECT ...\n\n"); + ddl.push_str("/* === SINK SCHEMA === */\n"); + let schema_ddl = DdlBuilder::new(name, &schema).to_string(); + ddl.push_str(&schema_ddl); + ddl.push_str("\n\n/* === STREAMING TOPOLOGY === */\n"); + ddl.push_str(&format_pipeline(program)); + ddl + } + } + } + + pub fn to_row_detail(&self) -> String { + match self { + StreamTable::Source { + connector, + event_time_field, + watermark_field, + with_options, + .. + } => format!( + "{{ kind: 'stream_source', connector: '{}', event_time: '{}', watermark: '{}', options_count: {} }}", + connector, + event_time_field.as_deref().unwrap_or("none"), + watermark_field.as_deref().unwrap_or("none"), + with_options.len() + ), + StreamTable::Sink { program, .. 
} => format!( + "{{ kind: 'streaming_sink', tasks: {}, nodes: {} }}", + program.task_count(), + program.graph.node_count() + ), + } + } +} + +pub fn show_create_stream_table(table: &StreamTable) -> String { + table.to_ddl_string() +} + +pub fn stream_table_row_detail(table: &StreamTable) -> String { + table.to_row_detail() +} + +fn format_pipeline(program: &LogicalProgram) -> String { + let mut lines: Vec = Vec::new(); + lines.push(format!("Pipeline Hash : {}", program.get_hash())); + lines.push(format!("Total Tasks : {}", program.task_count())); + lines.push(format!("Node Count : {}", program.graph.node_count())); + lines.push(String::from("Operator Chains:")); + + for nw in program.graph.node_weights() { + let chain = nw + .operator_chain + .operators + .iter() + .map(|op| format!("{}[{}]", op.operator_name, op.operator_id)) + .collect::>() + .join(" -> "); + + lines.push(format!( + " Node {:<3} | Parallelism {:<3} | {}", + nw.node_id, nw.parallelism, chain + )); + } + + let dot = program.dot(); + const MAX_DOT: usize = 5_000; + if dot.len() > MAX_DOT { + lines.push(format!( + "\nGraphviz DOT (truncated, {} bytes omitted):\n{}...", + dot.len() - MAX_DOT, + &dot[..MAX_DOT] + )); + } else { + lines.push(format!("\nGraphviz DOT:\n{dot}")); + } + + lines.join("\n") +} diff --git a/src/sql/schema/kafka_operator_config.rs b/src/sql/schema/kafka_operator_config.rs deleted file mode 100644 index d87dda8f..00000000 --- a/src/sql/schema/kafka_operator_config.rs +++ /dev/null @@ -1,272 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// -// Builds strongly-typed proto Kafka configs from SQL DDL WITH options. 
- -use std::collections::HashMap; - -use datafusion::arrow::datatypes::Schema; -use datafusion::common::{Result as DFResult, plan_datafusion_err, plan_err}; - -use protocol::function_stream_graph::connector_op::Config as ProtoConfig; -use protocol::function_stream_graph::{ - BadDataPolicy, DecimalEncodingProto, FormatConfig, JsonFormatConfig, KafkaAuthConfig, - KafkaAuthNone, KafkaOffsetMode, KafkaReadMode, KafkaSinkCommitMode, KafkaSinkConfig, - KafkaSourceConfig, RawBytesFormatConfig, RawStringFormatConfig, TimestampFormatProto, -}; - -use crate::sql::common::connector_options::ConnectorOptions; -use crate::sql::common::constants::{connection_table_role, kafka_with_value}; -use crate::sql::common::formats::{ - BadData, DecimalEncoding as SqlDecimalEncoding, Format as SqlFormat, - TimestampFormat as SqlTimestampFormat, -}; -use crate::sql::common::with_option_keys as opt; -use crate::sql::schema::table_role::TableRole; - -const STREAMING_JOB_OPTION_CHECKPOINT_INTERVAL: &str = "checkpoint.interval"; -const STREAMING_JOB_OPTION_PARALLELISM: &str = "parallelism"; - -fn sql_format_to_proto(fmt: &SqlFormat) -> DFResult { - match fmt { - SqlFormat::Json(j) => Ok(FormatConfig { - format: Some( - protocol::function_stream_graph::format_config::Format::Json(JsonFormatConfig { - timestamp_format: match j.timestamp_format { - SqlTimestampFormat::RFC3339 => { - TimestampFormatProto::TimestampRfc3339 as i32 - } - SqlTimestampFormat::UnixMillis => { - TimestampFormatProto::TimestampUnixMillis as i32 - } - }, - decimal_encoding: match j.decimal_encoding { - SqlDecimalEncoding::Number => DecimalEncodingProto::DecimalNumber as i32, - SqlDecimalEncoding::String => DecimalEncodingProto::DecimalString as i32, - SqlDecimalEncoding::Bytes => DecimalEncodingProto::DecimalBytes as i32, - }, - include_schema: j.include_schema, - confluent_schema_registry: j.confluent_schema_registry, - schema_id: j.schema_id, - debezium: j.debezium, - unstructured: j.unstructured, - }), - ), - }), - 
SqlFormat::RawString(_) => Ok(FormatConfig { - format: Some( - protocol::function_stream_graph::format_config::Format::RawString( - RawStringFormatConfig {}, - ), - ), - }), - SqlFormat::RawBytes(_) => Ok(FormatConfig { - format: Some( - protocol::function_stream_graph::format_config::Format::RawBytes( - RawBytesFormatConfig {}, - ), - ), - }), - other => plan_err!( - "Kafka connector: format '{}' is not supported yet", - other.name() - ), - } -} - -fn sql_bad_data_to_proto(bad: &BadData) -> i32 { - match bad { - BadData::Fail {} => BadDataPolicy::BadDataFail as i32, - BadData::Drop {} => BadDataPolicy::BadDataDrop as i32, - } -} - -/// Build Kafka proto config from a flat string map (catalog rebuild path). -pub fn build_kafka_proto_config_from_string_map( - map: HashMap, - _physical_schema: &Schema, -) -> DFResult { - let mut options = ConnectorOptions::from_flat_string_map(map)?; - let format = crate::sql::common::formats::Format::from_opts(&mut options) - .map_err(|e| datafusion::error::DataFusionError::Plan(format!("invalid format: {e}")))?; - let bad_data = BadData::from_opts(&mut options).map_err(|e| { - datafusion::error::DataFusionError::Plan(format!("Invalid bad_data: '{e}'")) - })?; - let _framing = crate::sql::common::formats::Framing::from_opts(&mut options) - .map_err(|e| datafusion::error::DataFusionError::Plan(format!("invalid framing: '{e}'")))?; - - let role = match options.pull_opt_str(opt::TYPE)?.as_deref() { - None | Some(connection_table_role::SOURCE) => TableRole::Ingestion, - Some(connection_table_role::SINK) => TableRole::Egress, - Some(connection_table_role::LOOKUP) => TableRole::Reference, - Some(other) => { - return plan_err!("invalid connection type '{other}' in WITH options"); - } - }; - - build_kafka_proto_config(&mut options, role, &format, bad_data) -} - -/// Core builder shared by SQL DDL and catalog reload paths. 
-pub fn build_kafka_proto_config( - options: &mut ConnectorOptions, - role: TableRole, - format: &Option, - bad_data: BadData, -) -> DFResult { - let bootstrap_servers = match options.pull_opt_str(opt::KAFKA_BOOTSTRAP_SERVERS)? { - Some(s) => s, - None => options - .pull_opt_str(opt::KAFKA_BOOTSTRAP_SERVERS_LEGACY)? - .ok_or_else(|| { - plan_datafusion_err!( - "Kafka connector requires 'bootstrap.servers' in the WITH clause" - ) - })?, - }; - - let topic = options.pull_opt_str(opt::KAFKA_TOPIC)?.ok_or_else(|| { - plan_datafusion_err!("Kafka connector requires 'topic' in the WITH clause") - })?; - - let sql_format = format.clone().ok_or_else(|| { - plan_datafusion_err!( - "Kafka connector requires 'format' in the WITH clause (e.g. format = 'json')" - ) - })?; - let proto_format = sql_format_to_proto(&sql_format)?; - - let rate_limit = options - .pull_opt_u64(opt::KAFKA_RATE_LIMIT_MESSAGES_PER_SECOND)? - .map(|v| v.clamp(1, u32::MAX as u64) as u32) - .unwrap_or(0); - - let value_subject = options.pull_opt_str(opt::KAFKA_VALUE_SUBJECT)?; - - let auth = Some(KafkaAuthConfig { - auth: Some( - protocol::function_stream_graph::kafka_auth_config::Auth::None(KafkaAuthNone {}), - ), - }); - - let _ = options.pull_opt_str(opt::TYPE)?; - let _ = options.pull_opt_str(opt::CONNECTOR)?; - - match role { - TableRole::Ingestion => { - let offset_mode = match options - .pull_opt_str(opt::KAFKA_SCAN_STARTUP_MODE)? 
- .as_deref() - { - Some(s) if s == kafka_with_value::SCAN_LATEST => { - KafkaOffsetMode::KafkaOffsetLatest as i32 - } - Some(s) if s == kafka_with_value::SCAN_EARLIEST => { - KafkaOffsetMode::KafkaOffsetEarliest as i32 - } - Some(s) - if s == kafka_with_value::SCAN_GROUP_OFFSETS - || s == kafka_with_value::SCAN_GROUP => - { - KafkaOffsetMode::KafkaOffsetGroup as i32 - } - None => KafkaOffsetMode::KafkaOffsetGroup as i32, - Some(other) => { - return plan_err!( - "invalid scan.startup.mode '{other}'; expected latest, earliest, or group-offsets" - ); - } - }; - - let read_mode = match options.pull_opt_str(opt::KAFKA_ISOLATION_LEVEL)?.as_deref() { - Some(s) if s == kafka_with_value::ISOLATION_READ_COMMITTED => { - KafkaReadMode::KafkaReadCommitted as i32 - } - Some(s) if s == kafka_with_value::ISOLATION_READ_UNCOMMITTED => { - KafkaReadMode::KafkaReadUncommitted as i32 - } - None => KafkaReadMode::KafkaReadDefault as i32, - Some(other) => { - return plan_err!("invalid isolation.level '{other}'"); - } - }; - - let group_id = match options.pull_opt_str(opt::KAFKA_GROUP_ID)? { - Some(s) => Some(s), - None => options.pull_opt_str(opt::KAFKA_GROUP_ID_LEGACY)?, - }; - let group_id_prefix = options.pull_opt_str(opt::KAFKA_GROUP_ID_PREFIX)?; - - let mut client_configs = options.drain_remaining_string_values()?; - // Streaming job-level options are parsed by planner/coordinator, not Kafka client. - client_configs.remove(STREAMING_JOB_OPTION_CHECKPOINT_INTERVAL); - client_configs.remove(STREAMING_JOB_OPTION_PARALLELISM); - - Ok(ProtoConfig::KafkaSource(KafkaSourceConfig { - topic, - bootstrap_servers, - group_id, - group_id_prefix, - offset_mode, - read_mode, - auth, - client_configs, - format: Some(proto_format), - bad_data_policy: sql_bad_data_to_proto(&bad_data), - rate_limit_msgs_per_sec: rate_limit, - value_subject, - })) - } - TableRole::Egress => { - let commit_mode = match options - .pull_opt_str(opt::KAFKA_SINK_COMMIT_MODE)? 
- .as_deref() - { - Some(s) - if s == kafka_with_value::SINK_COMMIT_EXACTLY_ONCE_HYPHEN - || s == kafka_with_value::SINK_COMMIT_EXACTLY_ONCE_UNDERSCORE => - { - KafkaSinkCommitMode::KafkaSinkExactlyOnce as i32 - } - None => KafkaSinkCommitMode::KafkaSinkAtLeastOnce as i32, - Some(s) - if s == kafka_with_value::SINK_COMMIT_AT_LEAST_ONCE_HYPHEN - || s == kafka_with_value::SINK_COMMIT_AT_LEAST_ONCE_UNDERSCORE => - { - KafkaSinkCommitMode::KafkaSinkAtLeastOnce as i32 - } - Some(other) => { - return plan_err!("invalid sink.commit.mode '{other}'"); - } - }; - let key_field = match options.pull_opt_str(opt::KAFKA_SINK_KEY_FIELD)? { - Some(s) => Some(s), - None => options.pull_opt_str(opt::KAFKA_KEY_FIELD_LEGACY)?, - }; - let timestamp_field = match options.pull_opt_str(opt::KAFKA_SINK_TIMESTAMP_FIELD)? { - Some(s) => Some(s), - None => options.pull_opt_str(opt::KAFKA_TIMESTAMP_FIELD_LEGACY)?, - }; - - let mut client_configs = options.drain_remaining_string_values()?; - // Streaming job-level options are parsed by planner/coordinator, not Kafka client. - client_configs.remove(STREAMING_JOB_OPTION_CHECKPOINT_INTERVAL); - client_configs.remove(STREAMING_JOB_OPTION_PARALLELISM); - - Ok(ProtoConfig::KafkaSink(KafkaSinkConfig { - topic, - bootstrap_servers, - commit_mode, - key_field, - timestamp_field, - auth, - client_configs, - format: Some(proto_format), - value_subject, - })) - } - TableRole::Reference => { - plan_err!("Kafka connector cannot be used as a lookup table in this path") - } - } -} diff --git a/src/sql/schema/mod.rs b/src/sql/schema/mod.rs index be3cdda4..e11e4808 100644 --- a/src/sql/schema/mod.rs +++ b/src/sql/schema/mod.rs @@ -10,30 +10,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-pub mod catalog_ddl; +pub mod catalog; pub mod column_descriptor; pub mod connection_type; -pub mod connector_config; pub mod data_encoding_format; -pub mod kafka_operator_config; -pub mod schema_context; +pub mod introspection; pub mod schema_provider; -pub mod source_table; pub mod table; -pub mod table_execution_unit; pub mod table_role; pub mod temporal_pipeline_config; pub mod utils; -pub use catalog_ddl::{ +pub use catalog::{ExternalTable, LookupTable, SinkTable, SourceTable}; +pub use column_descriptor::ColumnDescriptor; +pub use introspection::{ catalog_table_row_detail, schema_columns_one_line, show_create_catalog_table, }; -pub use column_descriptor::ColumnDescriptor; -pub use connection_type::ConnectionType; -pub use connector_config::ConnectorConfig; -pub use source_table::SourceTable; - -/// Back-compat alias for [`SourceTable`]. -pub type ConnectorTable = SourceTable; pub use schema_provider::{ObjectName, StreamPlanningContext, StreamSchemaProvider, StreamTable}; -pub use table::Table; +pub use table::CatalogEntity; diff --git a/src/sql/schema/schema_context.rs b/src/sql/schema/schema_context.rs deleted file mode 100644 index 851bf6af..00000000 --- a/src/sql/schema/schema_context.rs +++ /dev/null @@ -1,37 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use datafusion::arrow::datatypes::{DataType, Schema}; -use datafusion::common::{DFSchema, Result}; -use datafusion::logical_expr::Expr; -use datafusion_expr::ExprSchemable; - -pub trait SchemaContext { - fn resolve_expression(&self, expr: &Expr, schema: &Schema) -> Result; - fn extract_datatype(&self, expr: &Expr, schema: &Schema) -> Result; -} - -/// [`SchemaContext`] backed by a [`DFSchema`] built from the physical Arrow schema. -pub struct DfSchemaContext; - -impl SchemaContext for DfSchemaContext { - fn resolve_expression(&self, expr: &Expr, schema: &Schema) -> Result { - let df = DFSchema::try_from(schema.clone())?; - let _ = expr.get_type(&df)?; - Ok(expr.clone()) - } - - fn extract_datatype(&self, expr: &Expr, schema: &Schema) -> Result { - let df = DFSchema::try_from(schema.clone())?; - expr.get_type(&df) - } -} diff --git a/src/sql/schema/schema_provider.rs b/src/sql/schema/schema_provider.rs index 26fd43e8..15cd58ee 100644 --- a/src/sql/schema/schema_provider.rs +++ b/src/sql/schema/schema_provider.rs @@ -14,7 +14,7 @@ use std::collections::{BTreeMap, HashMap, HashSet}; use std::sync::Arc; use datafusion::arrow::datatypes::{self as datatypes, DataType, Field, Schema}; -use datafusion::common::{DataFusionError, Result}; +use datafusion::common::{DataFusionError, Result as DataFusionResult}; use datafusion::datasource::{DefaultTableSource, TableProvider, TableType}; use datafusion::execution::{FunctionRegistry, SessionStateDefaults}; use datafusion::logical_expr::expr_rewriter::FunctionRewrite; @@ -23,11 +23,13 @@ use datafusion::logical_expr::{AggregateUDF, Expr, ScalarUDF, TableSource, Windo use datafusion::optimizer::Analyzer; use datafusion::sql::TableReference; use datafusion::sql::planner::ContextProvider; +use thiserror::Error; +use tracing::{debug, error, info}; use unicase::UniCase; use crate::sql::common::constants::{planning_placeholder_udf, window_fn}; use crate::sql::logical_node::logical::{DylibUdfConfig, LogicalProgram}; -use 
crate::sql::schema::table::Table as CatalogTable; +use crate::sql::schema::table::CatalogEntity; use crate::sql::schema::utils::window_arrow_struct; use crate::sql::types::{PlanningOptions, PlanningPlaceholderUdf, SqlConfig}; @@ -38,6 +40,25 @@ fn object_name(s: impl Into) -> ObjectName { UniCase::new(s.into()) } +#[derive(Error, Debug)] +pub enum PlanningError { + #[error("Catalog table not found: {0}")] + TableNotFound(String), + #[error("Planning init failed: {0}")] + InitError(String), + #[error("Engine error: {0}")] + Engine(#[from] DataFusionError), +} + +impl From for DataFusionError { + fn from(err: PlanningError) -> Self { + match err { + PlanningError::Engine(inner) => inner, + other => DataFusionError::Plan(other.to_string()), + } + } +} + #[derive(Clone, Debug)] pub enum StreamTable { Source { @@ -46,7 +67,6 @@ pub enum StreamTable { schema: Arc, event_time_field: Option, watermark_field: Option, - /// Persisted `WITH` options for `SHOW CREATE TABLE`. with_options: BTreeMap, }, Sink { @@ -98,7 +118,7 @@ impl TableProvider for LogicalBatchInput { _projection: Option<&Vec>, _filters: &[Expr], _limit: Option, - ) -> Result> { + ) -> DataFusionResult> { Ok(Arc::new(crate::sql::physical::FsMemExec::new( self.table_name.clone(), Arc::clone(&self.schema), @@ -117,7 +137,7 @@ pub struct FunctionCatalog { #[derive(Clone, Default)] pub struct TableCatalog { pub streams: HashMap>, - pub catalogs: HashMap>, + pub catalogs: HashMap>, pub source_defs: HashMap, } @@ -132,7 +152,6 @@ pub struct StreamPlanningContext { pub sql_config: SqlConfig, } -/// Back-compat name for [`StreamPlanningContext`]. pub type StreamSchemaProvider = StreamPlanningContext; impl StreamPlanningContext { @@ -150,20 +169,25 @@ impl StreamPlanningContext { self.sql_config.key_by_parallelism } - /// Same registration order as the historical `StreamSchemaProvider::new` (placeholders, then DataFusion defaults). 
+ pub fn try_new(config: SqlConfig) -> Result { + info!("Initializing StreamPlanningContext"); + let mut builder = StreamPlanningContextBuilder::default(); + builder + .with_streaming_extensions()? + .with_default_functions()?; + let mut ctx = builder.build(); + ctx.sql_config = config; + Ok(ctx) + } + pub fn new() -> Self { - let mut ctx = Self::builder() - .with_streaming_extensions() - .expect("streaming extensions") - .with_default_functions() - .expect("default functions") - .build(); - ctx.sql_config = crate::sql::planning_runtime::sql_planning_snapshot(); - ctx + let config = crate::sql::planning_runtime::sql_planning_snapshot(); + Self::try_new(config).expect("StreamPlanningContext bootstrap") } pub fn register_stream_table(&mut self, table: StreamTable) { let key = object_name(table.name().to_string()); + debug!(table = %key, "register stream table"); self.tables.streams.insert(key, Arc::new(table)); } @@ -174,12 +198,13 @@ impl StreamPlanningContext { .cloned() } - pub fn register_catalog_table(&mut self, table: CatalogTable) { + pub fn register_catalog_table(&mut self, table: CatalogEntity) { let key = object_name(table.name().to_string()); + debug!(table = %key, "register catalog table"); self.tables.catalogs.insert(key, Arc::new(table)); } - pub fn get_catalog_table(&self, table_name: impl AsRef) -> Option<&CatalogTable> { + pub fn get_catalog_table(&self, table_name: impl AsRef) -> Option<&CatalogEntity> { self.tables .catalogs .get(&object_name(table_name.as_ref().to_string())) @@ -189,7 +214,7 @@ impl StreamPlanningContext { pub fn get_catalog_table_mut( &mut self, table_name: impl AsRef, - ) -> Option<&mut CatalogTable> { + ) -> Option<&mut CatalogEntity> { self.tables .catalogs .get_mut(&object_name(table_name.as_ref().to_string())) @@ -221,8 +246,7 @@ impl StreamPlanningContext { self.register_stream_table(table); } - /// Alias for [`Self::register_catalog_table`]. 
- pub fn insert_catalog_table(&mut self, table: CatalogTable) { + pub fn insert_catalog_table(&mut self, table: CatalogEntity) { self.register_catalog_table(table); } @@ -254,12 +278,15 @@ impl StreamPlanningContext { } impl ContextProvider for StreamPlanningContext { - fn get_table_source(&self, name: TableReference) -> Result> { - let table = self - .get_stream_table(name.table()) - .ok_or_else(|| DataFusionError::Plan(format!("Table {} not found", name)))?; - - Ok(Self::create_table_source(name.to_string(), table.schema())) + fn get_table_source(&self, name: TableReference) -> DataFusionResult> { + let name_str = name.table(); + match self.get_stream_table(name_str) { + Some(table) => Ok(Self::create_table_source(name.to_string(), table.schema())), + None => { + error!(table = %name_str, "stream table lookup failed"); + Err(DataFusionError::Plan(format!("Table {} not found", name))) + } + } } fn get_function_meta(&self, name: &str) -> Option> { @@ -304,7 +331,7 @@ impl FunctionRegistry for StreamPlanningContext { self.functions.scalars.keys().cloned().collect() } - fn udf(&self, name: &str) -> Result> { + fn udf(&self, name: &str) -> DataFusionResult> { self.functions .scalars .get(name) @@ -312,7 +339,7 @@ impl FunctionRegistry for StreamPlanningContext { .ok_or_else(|| DataFusionError::Plan(format!("No UDF with name {name}"))) } - fn udaf(&self, name: &str) -> Result> { + fn udaf(&self, name: &str) -> DataFusionResult> { self.functions .aggregates .get(name) @@ -320,7 +347,7 @@ impl FunctionRegistry for StreamPlanningContext { .ok_or_else(|| DataFusionError::Plan(format!("No UDAF with name {name}"))) } - fn udwf(&self, name: &str) -> Result> { + fn udwf(&self, name: &str) -> DataFusionResult> { self.functions .windows .get(name) @@ -331,27 +358,33 @@ impl FunctionRegistry for StreamPlanningContext { fn register_function_rewrite( &mut self, rewrite: Arc, - ) -> Result<()> { + ) -> DataFusionResult<()> { self.analyzer.add_function_rewrite(rewrite); Ok(()) } - fn 
register_udf(&mut self, udf: Arc) -> Result>> { + fn register_udf(&mut self, udf: Arc) -> DataFusionResult>> { Ok(self.functions.scalars.insert(udf.name().to_string(), udf)) } - fn register_udaf(&mut self, udaf: Arc) -> Result>> { + fn register_udaf( + &mut self, + udaf: Arc, + ) -> DataFusionResult>> { Ok(self .functions .aggregates .insert(udaf.name().to_string(), udaf)) } - fn register_udwf(&mut self, udwf: Arc) -> Result>> { + fn register_udwf(&mut self, udwf: Arc) -> DataFusionResult>> { Ok(self.functions.windows.insert(udwf.name().to_string(), udwf)) } - fn register_expr_planner(&mut self, expr_planner: Arc) -> Result<()> { + fn register_expr_planner( + &mut self, + expr_planner: Arc, + ) -> DataFusionResult<()> { self.functions.planners.push(expr_planner); Ok(()) } @@ -371,7 +404,7 @@ impl StreamPlanningContextBuilder { Self::default() } - pub fn with_default_functions(mut self) -> Result { + pub fn with_default_functions(&mut self) -> Result<&mut Self, PlanningError> { for p in SessionStateDefaults::default_scalar_functions() { self.context.register_udf(p)?; } @@ -387,7 +420,7 @@ impl StreamPlanningContextBuilder { Ok(self) } - pub fn with_streaming_extensions(mut self) -> Result { + pub fn with_streaming_extensions(&mut self) -> Result<&mut Self, PlanningError> { let extensions = vec![ PlanningPlaceholderUdf::new_with_return( window_fn::HOP, diff --git a/src/sql/schema/source_table.rs b/src/sql/schema/source_table.rs deleted file mode 100644 index 03eda9ac..00000000 --- a/src/sql/schema/source_table.rs +++ /dev/null @@ -1,592 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::{BTreeMap, HashMap}; -use std::sync::Arc; -use std::time::Duration; - -use datafusion::arrow::datatypes::{DataType, Field, FieldRef, Schema}; -use datafusion::common::{Column, DFSchema, Result, plan_datafusion_err, plan_err}; -use datafusion::error::DataFusionError; -use datafusion::logical_expr::Expr; -use datafusion::sql::TableReference; -use datafusion::sql::planner::{PlannerContext, SqlToRel}; -use datafusion::sql::sqlparser::ast; -use datafusion_expr::ExprSchemable; -use protocol::function_stream_graph::ConnectorOp; -use tracing::warn; - -use super::StreamSchemaProvider; -use super::column_descriptor::ColumnDescriptor; -use super::connector_config::ConnectorConfig; -use super::data_encoding_format::DataEncodingFormat; -use super::schema_context::SchemaContext; -use super::table_execution_unit::{EngineDescriptor, SyncMode, TableExecutionUnit}; -use super::table_role::{ - TableRole, apply_adapter_specific_rules, deduce_role, serialize_backend_params, - validate_adapter_availability, -}; -use super::temporal_pipeline_config::{ - TemporalPipelineConfig, TemporalSpec, resolve_temporal_logic, -}; -use crate::multifield_partial_ord; -use crate::sql::api::ConnectionProfile; -use crate::sql::common::connector_options::ConnectorOptions; -use crate::sql::common::constants::{connection_table_role, connector_type, sql_field}; -use crate::sql::common::with_option_keys as opt; -use crate::sql::common::{BadData, Format, Framing, FsSchema, JsonCompression, JsonFormat}; -use crate::sql::schema::ConnectionType; -use 
crate::sql::schema::kafka_operator_config::build_kafka_proto_config; -use crate::sql::schema::table::SqlSource; -use crate::sql::types::ProcessingMode; - -/// Connector-backed catalog table (adapter / source-sink model). -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct SourceTable { - pub registry_id: Option, - pub adapter_type: String, - pub table_identifier: String, - pub role: TableRole, - pub schema_specs: Vec, - /// Strongly-typed connector runtime configuration — replaces the legacy `opaque_config: String`. - pub connector_config: ConnectorConfig, - pub temporal_config: TemporalPipelineConfig, - pub key_constraints: Vec, - pub payload_format: Option, - /// Wire [`Format`] when built from SQL `WITH` (updating mode, `ConnectionSchema`). - pub connection_format: Option, - pub description: String, - pub partition_exprs: Arc>>, - pub lookup_cache_max_bytes: Option, - pub lookup_cache_ttl: Option, - pub inferred_fields: Option>, - /// Original `WITH` options for catalog persistence / `SHOW CREATE TABLE`. 
- pub catalog_with_options: BTreeMap, -} - -multifield_partial_ord!( - SourceTable, - registry_id, - adapter_type, - table_identifier, - role, - description, - key_constraints, - connection_format, - catalog_with_options -); - -impl SourceTable { - #[inline] - pub fn name(&self) -> &str { - self.table_identifier.as_str() - } - - pub fn new( - table_identifier: impl Into, - connector: impl Into, - connection_type: ConnectionType, - ) -> Self { - Self { - registry_id: None, - adapter_type: connector.into(), - table_identifier: table_identifier.into(), - role: connection_type.into(), - schema_specs: Vec::new(), - connector_config: ConnectorConfig::Generic(HashMap::new()), - temporal_config: TemporalPipelineConfig::default(), - key_constraints: Vec::new(), - payload_format: None, - connection_format: None, - description: String::new(), - partition_exprs: Arc::new(None), - lookup_cache_max_bytes: None, - lookup_cache_ttl: None, - inferred_fields: None, - catalog_with_options: BTreeMap::new(), - } - } - - #[inline] - pub fn connector(&self) -> &str { - self.adapter_type.as_str() - } - - #[inline] - pub fn connection_type(&self) -> ConnectionType { - self.role.into() - } - - pub fn event_time_field(&self) -> Option<&str> { - self.temporal_config.event_column.as_deref() - } - - pub fn watermark_field(&self) -> Option<&str> { - self.temporal_config.watermark_strategy_column.as_deref() - } - - /// Watermark column name safe to persist for [`StreamTable::Source`]. Omits the computed - /// [`sql_field::COMPUTED_WATERMARK`] column: stream catalog only stores Arrow physical fields, - /// so `__watermark` cannot be resolved when the table is planned from the catalog. 
- pub fn stream_catalog_watermark_field(&self) -> Option { - self.temporal_config - .watermark_strategy_column - .as_deref() - .filter(|w| *w != sql_field::COMPUTED_WATERMARK) - .map(str::to_string) - } - - #[inline] - pub fn catalog_with_options(&self) -> &BTreeMap { - &self.catalog_with_options - } - - pub fn idle_time(&self) -> Option { - self.temporal_config.liveness_timeout - } - - pub fn initialize_from_params( - id: &str, - adapter: &str, - raw_columns: Vec, - pk_list: Vec, - time_meta: Option, - options: &mut HashMap, - _schema_ctx: &dyn SchemaContext, - ) -> Result { - validate_adapter_availability(adapter)?; - - let catalog_with_options: BTreeMap = options - .iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - - let encoding = DataEncodingFormat::extract_from_map(options)?; - - let mut refined_columns = apply_adapter_specific_rules(adapter, raw_columns); - refined_columns = encoding.apply_envelope(refined_columns)?; - - let temporal_settings = resolve_temporal_logic(&refined_columns, time_meta)?; - let _finalized_config = serialize_backend_params(adapter, options)?; - let role = deduce_role(options)?; - - if role == TableRole::Ingestion && encoding.supports_delta_updates() && pk_list.is_empty() { - return plan_err!("CDC source requires at least one primary key"); - } - - Ok(Self { - registry_id: None, - adapter_type: adapter.to_string(), - table_identifier: id.to_string(), - role, - schema_specs: refined_columns, - connector_config: ConnectorConfig::Generic( - catalog_with_options.clone().into_iter().collect(), - ), - temporal_config: temporal_settings, - key_constraints: pk_list, - payload_format: Some(encoding), - connection_format: None, - description: String::new(), - partition_exprs: Arc::new(None), - lookup_cache_max_bytes: None, - lookup_cache_ttl: None, - inferred_fields: None, - catalog_with_options, - }) - } - - pub fn produce_physical_schema(&self) -> Schema { - Schema::new( - self.schema_specs - .iter() - .filter(|c| 
!c.is_computed()) - .map(|c| c.arrow_field().clone()) - .collect::>(), - ) - } - - #[inline] - pub fn physical_schema(&self) -> Schema { - self.produce_physical_schema() - } - - pub fn convert_to_execution_unit(&self) -> Result { - if self.role == TableRole::Egress { - return plan_err!("Target [{}] is write-only", self.table_identifier); - } - - if self.is_cdc_enabled() && self.schema_specs.iter().any(|c| c.is_computed()) { - return plan_err!("CDC cannot be mixed with computed columns natively"); - } - - let mode = if self.is_cdc_enabled() { - SyncMode::Incremental - } else { - SyncMode::AppendOnly - }; - - Ok(TableExecutionUnit { - label: self.table_identifier.clone(), - engine_meta: EngineDescriptor { - engine_type: self.adapter_type.clone(), - raw_payload: String::new(), - }, - sync_mode: mode, - temporal_offset: self.temporal_config.clone(), - }) - } - - #[inline] - pub fn to_execution_unit(&self) -> Result { - self.convert_to_execution_unit() - } - - fn is_cdc_enabled(&self) -> bool { - self.payload_format - .as_ref() - .is_some_and(|f| f.supports_delta_updates()) - } - - #[allow(clippy::too_many_arguments)] - pub fn from_options( - table_identifier: &str, - connector_name: &str, - temporary: bool, - fields: Vec, - primary_keys: Vec, - watermark: Option<(String, Option)>, - options: &mut ConnectorOptions, - connection_profile: Option<&ConnectionProfile>, - schema_provider: &StreamSchemaProvider, - connection_type_override: Option, - description: String, - ) -> Result { - let _ = connection_profile; - - let catalog_with_options = options.snapshot_for_catalog(); - - if let Some(c) = options.pull_opt_str(opt::CONNECTOR)? 
- && c != connector_name - { - return plan_err!( - "WITH option `connector` is '{c}' but table uses connector '{connector_name}'" - ); - } - - validate_adapter_availability(connector_name)?; - - let mut columns = fields; - columns = apply_adapter_specific_rules(connector_name, columns); - - let format = Format::from_opts(options) - .map_err(|e| DataFusionError::Plan(format!("invalid format: '{e}'")))?; - - if let Some(Format::Json(JsonFormat { compression, .. })) = &format - && !matches!(compression, JsonCompression::Uncompressed) - && connector_name != connector_type::FILESYSTEM - { - return plan_err!("'json.compression' is only supported for the filesystem connector"); - } - - let _framing = Framing::from_opts(options) - .map_err(|e| DataFusionError::Plan(format!("invalid framing: '{e}'")))?; - - if temporary - && let Some(t) = options.insert_str(opt::TYPE, connection_table_role::LOOKUP)? - && t != connection_table_role::LOOKUP - { - return plan_err!( - "Cannot have a temporary table with type '{t}'; temporary tables must be type 'lookup'" - ); - } - - let payload_format = format - .as_ref() - .map(DataEncodingFormat::from_connection_format); - let encoding = payload_format.unwrap_or(DataEncodingFormat::Raw); - columns = encoding.apply_envelope(columns)?; - - let bad_data = BadData::from_opts(options) - .map_err(|e| DataFusionError::Plan(format!("Invalid bad_data: '{e}'")))?; - - let role = if let Some(t) = connection_type_override { - t.into() - } else { - match options.pull_opt_str(opt::TYPE)?.as_deref() { - None | Some(connection_table_role::SOURCE) => TableRole::Ingestion, - Some(connection_table_role::SINK) => TableRole::Egress, - Some(connection_table_role::LOOKUP) => TableRole::Reference, - Some(other) => { - return plan_err!("invalid connection type '{other}' in WITH options"); - } - } - }; - - let mut table = SourceTable { - registry_id: None, - adapter_type: connector_name.to_string(), - table_identifier: table_identifier.to_string(), - role, - 
schema_specs: columns, - connector_config: ConnectorConfig::Generic(HashMap::new()), - temporal_config: TemporalPipelineConfig::default(), - key_constraints: Vec::new(), - payload_format, - connection_format: format.clone(), - description, - partition_exprs: Arc::new(None), - lookup_cache_max_bytes: None, - lookup_cache_ttl: None, - inferred_fields: None, - catalog_with_options, - }; - - if let Some(event_time_field) = options.pull_opt_field(opt::EVENT_TIME_FIELD)? { - warn!("`event_time_field` WITH option is deprecated; use WATERMARK FOR syntax"); - table.temporal_config.event_column = Some(event_time_field); - } - - if let Some(watermark_field) = options.pull_opt_field(opt::WATERMARK_FIELD)? { - warn!("`watermark_field` WITH option is deprecated; use WATERMARK FOR syntax"); - table.temporal_config.watermark_strategy_column = Some(watermark_field); - } - - if let Some((time_field, watermark_expr)) = watermark { - let field = table - .schema_specs - .iter() - .find(|c| c.arrow_field().name().as_str() == time_field.as_str()) - .ok_or_else(|| { - plan_datafusion_err!( - "WATERMARK FOR field `{}` does not exist in table", - time_field - ) - })?; - - if !matches!( - field.arrow_field().data_type(), - DataType::Timestamp(_, None) - ) { - return plan_err!( - "WATERMARK FOR field `{time_field}` has type {}, but expected TIMESTAMP", - field.arrow_field().data_type() - ); - } - - for col in table.schema_specs.iter_mut() { - if col.arrow_field().name().as_str() == time_field.as_str() { - col.set_nullable(false); - break; - } - } - - let table_ref = TableReference::bare(table.table_identifier.as_str()); - let df_schema = - DFSchema::try_from_qualified_schema(table_ref, &table.produce_physical_schema())?; - - table.temporal_config.event_column = Some(time_field.clone()); - - if let Some(expr) = watermark_expr { - let logical_expr = plan_generating_expr(&expr, &df_schema, schema_provider) - .map_err(|e| { - DataFusionError::Plan(format!("could not plan watermark expression: 
{e}")) - })?; - - let (data_type, _nullable) = logical_expr.data_type_and_nullable(&df_schema)?; - if !matches!(data_type, DataType::Timestamp(_, _)) { - return plan_err!( - "the type of the WATERMARK FOR expression must be TIMESTAMP, but was {data_type}" - ); - } - - table.schema_specs.push(ColumnDescriptor::new_computed( - Field::new( - sql_field::COMPUTED_WATERMARK, - logical_expr.get_type(&df_schema)?, - false, - ), - logical_expr, - )); - table.temporal_config.watermark_strategy_column = - Some(sql_field::COMPUTED_WATERMARK.to_string()); - } else { - table.temporal_config.watermark_strategy_column = Some(time_field); - } - } - - let idle_from_micros = options - .pull_opt_i64(opt::IDLE_MICROS)? - .filter(|t| *t > 0) - .map(|t| Duration::from_micros(t as u64)); - let idle_from_duration = options.pull_opt_duration(opt::IDLE_TIME)?; - table.temporal_config.liveness_timeout = idle_from_micros.or(idle_from_duration); - - table.lookup_cache_max_bytes = options.pull_opt_u64(opt::LOOKUP_CACHE_MAX_BYTES)?; - - table.lookup_cache_ttl = options.pull_opt_duration(opt::LOOKUP_CACHE_TTL)?; - - if connector_name.eq_ignore_ascii_case(connector_type::KAFKA) { - let proto_cfg = build_kafka_proto_config(options, role, &format, bad_data)?; - table.connector_config = match proto_cfg { - protocol::function_stream_graph::connector_op::Config::KafkaSource(cfg) => { - ConnectorConfig::KafkaSource(cfg) - } - protocol::function_stream_graph::connector_op::Config::KafkaSink(cfg) => { - ConnectorConfig::KafkaSink(cfg) - } - protocol::function_stream_graph::connector_op::Config::Generic(g) => { - ConnectorConfig::Generic(g.properties) - } - }; - } else { - let extra_opts = options.drain_remaining_string_values()?; - table.connector_config = ConnectorConfig::Generic(extra_opts); - } - - if role == TableRole::Ingestion - && encoding.supports_delta_updates() - && primary_keys.is_empty() - { - return plan_err!("Debezium source must have at least one PRIMARY KEY field"); - } - - 
table.key_constraints = primary_keys; - - Ok(table) - } - - pub fn has_virtual_fields(&self) -> bool { - self.schema_specs.iter().any(|c| c.is_computed()) - } - - pub fn is_updating(&self) -> bool { - self.connection_format - .as_ref() - .is_some_and(|f| f.is_updating()) - || self.payload_format == Some(DataEncodingFormat::DebeziumJson) - } - - pub fn connector_op(&self) -> ConnectorOp { - let physical = self.produce_physical_schema(); - let fields: Vec = physical - .fields() - .iter() - .map(|f| f.as_ref().clone()) - .collect(); - let fs_schema = FsSchema::from_fields(fields); - - ConnectorOp { - connector: self.adapter_type.clone(), - fs_schema: Some(fs_schema.into()), - name: self.table_identifier.clone(), - description: self.description.clone(), - config: Some(self.connector_config.to_proto_config()), - } - } - - pub fn processing_mode(&self) -> ProcessingMode { - if self.is_updating() { - ProcessingMode::Update - } else { - ProcessingMode::Append - } - } - - pub fn timestamp_override(&self) -> Result> { - if let Some(field_name) = self.temporal_config.event_column.clone() { - if self.is_updating() { - return plan_err!("can't use event_time_field with update mode"); - } - let _field = self.get_time_column(&field_name)?; - Ok(Some(Expr::Column(Column::from_name(field_name.as_str())))) - } else { - Ok(None) - } - } - - fn get_time_column(&self, field_name: &str) -> Result<&ColumnDescriptor> { - self.schema_specs - .iter() - .find(|c| { - c.arrow_field().name() == field_name - && matches!(c.arrow_field().data_type(), DataType::Timestamp(..)) - }) - .ok_or_else(|| { - DataFusionError::Plan(format!("field {field_name} not found or not a timestamp")) - }) - } - - pub fn watermark_column(&self) -> Result> { - if let Some(field_name) = self.temporal_config.watermark_strategy_column.clone() { - let _field = self.get_time_column(&field_name)?; - Ok(Some(Expr::Column(Column::from_name(field_name.as_str())))) - } else { - Ok(None) - } - } - - pub fn as_sql_source(&self) -> 
Result { - match self.role { - TableRole::Ingestion => {} - TableRole::Egress | TableRole::Reference => { - return plan_err!("cannot read from sink"); - } - }; - - if self.is_updating() && self.has_virtual_fields() { - return plan_err!("can't read from a source with virtual fields and update mode."); - } - - let timestamp_override = self.timestamp_override()?; - let watermark_column = self.watermark_column()?; - - let source = SqlSource { - id: self.registry_id, - struct_def: self - .schema_specs - .iter() - .filter(|c| !c.is_computed()) - .map(|c| Arc::new(c.arrow_field().clone())) - .collect(), - config: self.connector_op(), - processing_mode: self.processing_mode(), - idle_time: self.temporal_config.liveness_timeout, - }; - - Ok(SourceOperator { - name: self.table_identifier.clone(), - source, - timestamp_override, - watermark_column, - }) - } -} - -/// Plan a SQL scalar expression against a table-qualified schema (e.g. watermark `AS` clause). -fn plan_generating_expr( - ast: &ast::Expr, - df_schema: &DFSchema, - schema_provider: &StreamSchemaProvider, -) -> Result { - let planner = SqlToRel::new(schema_provider); - let mut ctx = PlannerContext::new(); - planner.sql_to_expr(ast.clone(), df_schema, &mut ctx) -} - -#[derive(Debug, Clone)] -pub struct SourceOperator { - pub name: String, - pub source: SqlSource, - pub timestamp_override: Option, - pub watermark_column: Option, -} diff --git a/src/sql/schema/table.rs b/src/sql/schema/table.rs index efa0c59f..6c001d9c 100644 --- a/src/sql/schema/table.rs +++ b/src/sql/schema/table.rs @@ -10,11 +10,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use super::source_table::SourceTable; use crate::sql::analysis::rewrite_plan; use crate::sql::logical_node::remote_table::RemoteTableBoundaryNode; use crate::sql::logical_planner::optimizers::produce_optimized_plan; use crate::sql::schema::StreamSchemaProvider; +use crate::sql::schema::catalog::ExternalTable; use crate::sql::types::{ProcessingMode, QualifiedField}; use datafusion::arrow::datatypes::FieldRef; use datafusion::common::{Result, plan_err}; @@ -24,23 +24,22 @@ use protocol::function_stream_graph::ConnectorOp; use std::sync::Arc; use std::time::Duration; -/// Represents all table types in the FunctionStream SQL catalog. -#[allow(clippy::enum_variant_names)] #[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum Table { - /// A lookup table backed by an external connector. - LookupTable(SourceTable), - /// A source/sink table backed by an external connector. - ConnectorTable(SourceTable), - /// A table defined by a query (CREATE VIEW / CREATE TABLE AS SELECT). - TableFromQuery { +pub enum CatalogEntity { + /// Both payload variants are boxed so the enum is not padded to the largest field. + ExternalConnector(Box), + ComputedTable { name: String, - logical_plan: LogicalPlan, + logical_plan: Box, }, } -impl Table { - /// Try to construct a Table from a CREATE TABLE or CREATE VIEW statement. 
+impl CatalogEntity { + #[inline] + pub fn external(table: ExternalTable) -> Self { + Self::ExternalConnector(Box::new(table)) + } + pub fn try_from_statement( statement: &Statement, schema_provider: &StreamSchemaProvider, @@ -69,11 +68,11 @@ impl Table { resolved_schema: schema, requires_materialization: true, }; - Ok(Some(Table::TableFromQuery { + Ok(Some(CatalogEntity::ComputedTable { name: name.to_string(), - logical_plan: LogicalPlan::Extension(Extension { + logical_plan: Box::new(LogicalPlan::Extension(Extension { node: Arc::new(remote), - }), + })), })) } _ => Ok(None), @@ -82,36 +81,25 @@ impl Table { pub fn name(&self) -> &str { match self { - Table::TableFromQuery { name, .. } => name.as_str(), - Table::ConnectorTable(c) | Table::LookupTable(c) => c.name(), + CatalogEntity::ComputedTable { name, .. } => name.as_str(), + CatalogEntity::ExternalConnector(e) => e.name(), } } pub fn get_fields(&self) -> Vec { match self { - Table::ConnectorTable(SourceTable { - schema_specs, - inferred_fields, - .. - }) - | Table::LookupTable(SourceTable { - schema_specs, - inferred_fields, - .. - }) => inferred_fields.clone().unwrap_or_else(|| { - schema_specs - .iter() - .map(|c| Arc::new(c.arrow_field().clone())) - .collect() - }), - Table::TableFromQuery { logical_plan, .. } => { + CatalogEntity::ExternalConnector(e) => e.effective_fields(), + CatalogEntity::ComputedTable { logical_plan, .. } => { logical_plan.schema().fields().iter().cloned().collect() } } } pub fn set_inferred_fields(&mut self, fields: Vec) -> Result<()> { - let Table::ConnectorTable(t) = self else { + let CatalogEntity::ExternalConnector(ext) = self else { + return Ok(()); + }; + let ExternalTable::Source(t) = ext.as_mut() else { return Ok(()); }; @@ -139,14 +127,27 @@ impl Table { pub fn connector_op(&self) -> Result { match self { - Table::ConnectorTable(c) | Table::LookupTable(c) => Ok(c.connector_op()), - Table::TableFromQuery { .. 
} => plan_err!("can't write to a query-defined table"), + CatalogEntity::ExternalConnector(e) => Ok(e.connector_op()), + CatalogEntity::ComputedTable { .. } => { + plan_err!("can't write to a query-defined table") + } } } pub fn partition_exprs(&self) -> Option<&Vec> { + let CatalogEntity::ExternalConnector(ext) = self else { + return None; + }; + let ExternalTable::Sink(s) = ext.as_ref() else { + return None; + }; + (*s.partition_exprs).as_ref() + } + + #[inline] + pub fn as_external(&self) -> Option<&ExternalTable> { match self { - Table::ConnectorTable(c) => (*c.partition_exprs).as_ref(), + CatalogEntity::ExternalConnector(e) => Some(e.as_ref()), _ => None, } } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 823425d2..ec32bdfa 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -10,6 +10,44 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::sync::Arc; + +use anyhow::Context; + pub mod state_backend; pub mod stream_catalog; pub mod task; + +/// Install the process-global [`stream_catalog::CatalogManager`] from configuration. +/// In-memory when `config.stream_catalog.persist` is `false`, otherwise a durable +/// [`stream_catalog::RocksDbMetaStore`] (default path: `{data_dir}/catalog.db`). 
+pub fn initialize_stream_catalog(config: &crate::config::GlobalConfig) -> anyhow::Result<()> { + use stream_catalog::{CatalogManager, InMemoryMetaStore, MetaStore, RocksDbMetaStore}; + + let store: Arc = if !config.stream_catalog.persist { + Arc::new(InMemoryMetaStore::new()) + } else { + let path = config + .stream_catalog + .db_path + .as_ref() + .map(|p| crate::config::resolve_path(p)) + .unwrap_or_else(|| crate::config::get_data_dir().join("catalog.db")); + + std::fs::create_dir_all(&path).with_context(|| { + format!( + "Failed to create stream catalog RocksDB directory {}", + path.display() + ) + })?; + + Arc::new(RocksDbMetaStore::open(&path).with_context(|| { + format!( + "Failed to open stream catalog RocksDB at {}", + path.display() + ) + })?) + }; + + CatalogManager::init_global(store).context("Stream catalog (CatalogManager) global init failed") +} diff --git a/src/storage/stream_catalog/manager.rs b/src/storage/stream_catalog/manager.rs index 3c9d561e..fa810e8d 100644 --- a/src/storage/stream_catalog/manager.rs +++ b/src/storage/stream_catalog/manager.rs @@ -10,24 +10,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::path::Path; +use std::collections::BTreeMap; use std::sync::{Arc, OnceLock}; -use anyhow::{Context, anyhow, bail}; +use anyhow::{anyhow, bail}; use datafusion::common::{Result as DFResult, internal_err, plan_err}; use prost::Message; use protocol::function_stream_graph::FsProgram; use protocol::storage::{self as pb, table_definition}; use tracing::{debug, info, warn}; -use crate::runtime::streaming::operators::source::kafka as kafka_snap; use unicase::UniCase; use crate::sql::common::constants::sql_field; +use crate::sql::connector::config::ConnectorConfig; +use crate::sql::schema::catalog::{ExternalTable, LookupTable, SourceTable}; use crate::sql::schema::column_descriptor::ColumnDescriptor; -use crate::sql::schema::connection_type::ConnectionType; -use crate::sql::schema::source_table::SourceTable; -use crate::sql::schema::table::Table as CatalogTable; +use crate::sql::schema::table::CatalogEntity; +use crate::sql::schema::table_role::TableRole; +use crate::sql::schema::temporal_pipeline_config::TemporalPipelineConfig; use crate::sql::schema::{StreamPlanningContext, StreamTable}; use super::codec::CatalogCodec; @@ -36,91 +37,16 @@ use super::meta_store::MetaStore; const CATALOG_KEY_PREFIX: &str = "catalog:stream_table:"; const STREAMING_JOB_KEY_PREFIX: &str = "streaming_job:"; -/// One persisted streaming job row from catalog (program + checkpoint metadata + Kafka offsets). +/// One persisted streaming job row from catalog (program + checkpoint metadata). 
#[derive(Debug, Clone)] pub struct StoredStreamingJob { pub table_name: String, pub program: FsProgram, pub checkpoint_interval_ms: u64, pub latest_checkpoint_epoch: u64, - pub kafka_source_checkpoints: Vec, -} - -fn parse_kafka_offset_snapshot_filename(name: &str) -> Option<(u32, u32)> { - const PREFIX: &str = "kafka_source_offsets_pipe"; - const SUFFIX: &str = ".bin"; - if !name.starts_with(PREFIX) || !name.ends_with(SUFFIX) { - return None; - } - let mid = name.strip_prefix(PREFIX)?.strip_suffix(SUFFIX)?; - let (pipe, sub_part) = mid.split_once("_sub")?; - Some((pipe.parse().ok()?, sub_part.parse().ok()?)) -} - -/// Removes on-disk staging snapshots once their payload is committed into catalog (same epoch). -fn cleanup_kafka_offset_snapshots_for_epoch(job_dir: &Path, epoch: u64) { - let Ok(rd) = std::fs::read_dir(job_dir) else { - return; - }; - for ent in rd.flatten() { - let path = ent.path(); - let name = ent.file_name().to_string_lossy().into_owned(); - if parse_kafka_offset_snapshot_filename(&name).is_none() { - continue; - } - let Ok(bytes) = std::fs::read(&path) else { - continue; - }; - let Ok(saved) = kafka_snap::decode_kafka_offset_snapshot(&bytes) else { - continue; - }; - if saved.epoch == epoch && std::fs::remove_file(&path).is_err() { - debug!(path = %path.display(), "Could not remove staged Kafka offset snapshot (non-fatal)"); - } - } -} - -/// Writes catalog-stored Kafka checkpoints back to the job state dir before `submit_job` resumes sources. 
-pub fn materialize_kafka_source_checkpoints_from_catalog( - job_dir: &Path, - checkpoints: &[pb::KafkaSourceSubtaskCheckpoint], -) -> DFResult<()> { - if checkpoints.is_empty() { - return Ok(()); - } - std::fs::create_dir_all(job_dir).map_err(|e| { - datafusion::common::DataFusionError::Execution(format!( - "create job state dir {}: {e}", - job_dir.display() - )) - })?; - for c in checkpoints { - let saved = kafka_snap::KafkaSourceSavedOffsets { - epoch: c.checkpoint_epoch, - partitions: c - .partitions - .iter() - .map(|p| kafka_snap::KafkaState { - partition: p.partition, - offset: p.offset, - }) - .collect(), - }; - let path = kafka_snap::kafka_snapshot_path(job_dir, c.pipeline_id, c.subtask_index); - let bytes = kafka_snap::encode_kafka_offset_snapshot(&saved).map_err(|e| { - datafusion::common::DataFusionError::Execution(format!( - "encode kafka snapshot for {}: {e}", - path.display() - )) - })?; - std::fs::write(&path, &bytes).map_err(|e| { - datafusion::common::DataFusionError::Execution(format!( - "write kafka snapshot {}: {e}", - path.display() - )) - })?; - } - Ok(()) + /// Source-type-agnostic per-subtask checkpoint entries. Each entry is a + /// [`pb::SourceCheckpointInfo`] oneof envelope — the catalog does not inspect the payload. + pub source_checkpoints: Vec, } pub struct CatalogManager { @@ -188,7 +114,7 @@ impl CatalogManager { comment: comment.to_string(), checkpoint_interval_ms, latest_checkpoint_epoch: 0, - kafka_source_checkpoints: vec![], + source_checkpoints: vec![], }; let payload = def.encode_to_vec(); let key = Self::build_streaming_job_key(table_name); @@ -207,16 +133,14 @@ impl CatalogManager { /// Persist the globally-completed checkpoint epoch after all operators ACK. /// Only advances forward; stale epochs are silently ignored. /// - /// `kafka_source_checkpoints` is assembled by the job coordinator from source pipeline checkpoint - /// ACKs (in-memory); it is stored next to `latest_checkpoint_epoch` in the catalog. 
- /// - /// `job_state_dir` is only used to remove legacy on-disk staging snapshots for this epoch, if present. + /// `source_checkpoints` is the source-agnostic list assembled by the job coordinator via + /// [`CheckpointAggregatorRegistry::aggregate_all`]; it is stored atomically next to + /// `latest_checkpoint_epoch` via [`MetaStore::write_batch`]. pub fn commit_job_checkpoint( &self, table_name: &str, epoch: u64, - job_state_dir: &Path, - kafka_source_checkpoints: Vec, + source_checkpoints: Vec, ) -> DFResult<()> { let key = Self::build_streaming_job_key(table_name); @@ -237,22 +161,21 @@ impl CatalogManager { if epoch > def.latest_checkpoint_epoch { def.latest_checkpoint_epoch = epoch; - def.kafka_source_checkpoints = kafka_source_checkpoints; - let new_payload = def.encode_to_vec(); - self.store.put(&key, new_payload)?; + def.source_checkpoints = source_checkpoints; + self.store + .write_batch(vec![(key, Some(def.encode_to_vec()))])?; debug!( table = %table_name, epoch = epoch, - kafka_subtasks = def.kafka_source_checkpoints.len(), - "Checkpoint metadata committed to Catalog" + source_subtasks = def.source_checkpoints.len(), + "Checkpoint metadata committed to Catalog (write_batch)" ); - cleanup_kafka_offset_snapshots_for_epoch(job_state_dir, epoch); } Ok(()) } - /// Load all persisted streaming jobs (including Kafka offset checkpoints for restore). + /// Load all persisted streaming jobs (including source checkpoint data for restore). 
pub fn load_streaming_job_definitions(&self) -> DFResult> { let records = self.store.scan_prefix(STREAMING_JOB_KEY_PREFIX)?; let mut out = Vec::with_capacity(records.len()); @@ -284,7 +207,7 @@ impl CatalogManager { program, checkpoint_interval_ms: def.checkpoint_interval_ms, latest_checkpoint_epoch: def.latest_checkpoint_epoch, - kafka_source_checkpoints: def.kafka_source_checkpoints, + source_checkpoints: def.source_checkpoints, }); } Ok(out) @@ -294,7 +217,7 @@ impl CatalogManager { // Catalog table persistence (CREATE TABLE / DROP TABLE) // ======================================================================== - pub fn add_catalog_table(&self, table: CatalogTable) -> DFResult<()> { + pub fn add_catalog_table(&self, table: CatalogEntity) -> DFResult<()> { let proto_def = self.encode_catalog_table(&table)?; let payload = proto_def.encode_to_vec(); let key = Self::build_store_key(table.name()); @@ -332,36 +255,44 @@ impl CatalogManager { ctx.tables.catalogs = catalogs.clone(); for (name, table) in catalogs { - let source = match table.as_ref() { - CatalogTable::ConnectorTable(s) | CatalogTable::LookupTable(s) => s, - CatalogTable::TableFromQuery { .. } => continue, + let stream = match table.as_ref() { + CatalogEntity::ExternalConnector(b) => match b.as_ref() { + ExternalTable::Source(s) => Some(StreamTable::Source { + name: s.name().to_string(), + connector: s.connector().to_string(), + schema: Arc::new(s.produce_physical_schema()), + event_time_field: s.event_time_field().map(str::to_string), + watermark_field: s.stream_catalog_watermark_field(), + with_options: s.catalog_with_options().clone(), + }), + ExternalTable::Lookup(l) => Some(StreamTable::Source { + name: l.name().to_string(), + connector: l.connector().to_string(), + schema: Arc::new(l.produce_physical_schema()), + event_time_field: None, + watermark_field: None, + with_options: l.catalog_with_options().clone(), + }), + ExternalTable::Sink(_) => None, + }, + CatalogEntity::ComputedTable { .. 
} => None, }; - - let schema = Arc::new(source.produce_physical_schema()); - ctx.tables.streams.insert( - name, - Arc::new(StreamTable::Source { - name: source.name().to_string(), - connector: source.connector().to_string(), - schema, - event_time_field: source.event_time_field().map(str::to_string), - watermark_field: source.stream_catalog_watermark_field(), - with_options: source.catalog_with_options().clone(), - }), - ); + if let Some(st) = stream { + ctx.tables.streams.insert(name, Arc::new(st)); + } } ctx } /// All persisted catalog tables, sorted by table name. - pub fn list_catalog_tables(&self) -> DFResult>> { - let mut out: Vec> = + pub fn list_catalog_tables(&self) -> DFResult>> { + let mut out: Vec> = self.load_catalog_tables_map()?.into_values().collect(); out.sort_by(|a, b| a.name().cmp(b.name())); Ok(out) } - pub fn get_catalog_table(&self, name: &str) -> DFResult>> { + pub fn get_catalog_table(&self, name: &str) -> DFResult>> { let key = UniCase::new(name.to_string()); Ok(self.load_catalog_tables_map()?.get(&key).cloned()) } @@ -376,17 +307,40 @@ impl CatalogManager { watermark_field, with_options, } => { - let mut source = SourceTable::new(name, connector, ConnectionType::Source); - source.schema_specs = schema + let schema_specs: Vec = schema .fields() .iter() .map(|f| ColumnDescriptor::new_physical((**f).clone())) .collect(); - source.inferred_fields = Some(schema.fields().iter().cloned().collect()); - source.temporal_config.event_column = event_time_field; - source.temporal_config.watermark_strategy_column = watermark_field; - source.catalog_with_options = with_options; - self.add_catalog_table(CatalogTable::ConnectorTable(source)) + let inferred_fields = Some(schema.fields().iter().cloned().collect()); + let physical_schema = schema.as_ref().clone(); + + let connector_config = build_connector_config_for_role( + &connector, + TableRole::Ingestion, + &with_options, + &physical_schema, + )?; + + let source = SourceTable { + table_identifier: name, 
+ adapter_type: connector, + schema_specs, + connector_config, + temporal_config: TemporalPipelineConfig { + event_column: event_time_field, + watermark_strategy_column: watermark_field, + liveness_timeout: None, + }, + key_constraints: Vec::new(), + payload_format: None, + connection_format: None, + description: String::new(), + catalog_with_options: with_options.into_iter().collect(), + registry_id: None, + inferred_fields, + }; + self.add_catalog_table(CatalogEntity::external(ExternalTable::Source(source))) } StreamTable::Sink { name, .. } => plan_err!( "Persisting streaming sink '{name}' in stream catalog is no longer supported" @@ -406,19 +360,7 @@ impl CatalogManager { self.list_catalog_tables() .unwrap_or_default() .into_iter() - .filter_map(|t| match t.as_ref() { - CatalogTable::ConnectorTable(s) | CatalogTable::LookupTable(s) => { - Some(Arc::new(StreamTable::Source { - name: s.name().to_string(), - connector: s.connector().to_string(), - schema: Arc::new(s.produce_physical_schema()), - event_time_field: s.event_time_field().map(str::to_string), - watermark_field: s.stream_catalog_watermark_field(), - with_options: s.catalog_with_options().clone(), - })) - } - CatalogTable::TableFromQuery { .. } => None, - }) + .filter_map(|t| external_to_stream_table(t.as_ref()).map(Arc::new)) .collect() } @@ -426,44 +368,54 @@ impl CatalogManager { self.get_catalog_table(name) .ok() .flatten() - .and_then(|t| match t.as_ref() { - CatalogTable::ConnectorTable(s) | CatalogTable::LookupTable(s) => { - Some(Arc::new(StreamTable::Source { - name: s.name().to_string(), - connector: s.connector().to_string(), - schema: Arc::new(s.produce_physical_schema()), - event_time_field: s.event_time_field().map(str::to_string), - watermark_field: s.stream_catalog_watermark_field(), - with_options: s.catalog_with_options().clone(), - })) - } - CatalogTable::TableFromQuery { .. 
} => None, - }) + .and_then(|t| external_to_stream_table(t.as_ref()).map(Arc::new)) } - fn encode_catalog_table(&self, table: &CatalogTable) -> DFResult { + fn encode_catalog_table(&self, table: &CatalogEntity) -> DFResult { let table_type = match table { - CatalogTable::ConnectorTable(source) | CatalogTable::LookupTable(source) => { - let mut opts = source.catalog_with_options().clone(); - opts.entry("connector".to_string()) - .or_insert_with(|| source.connector().to_string()); - let catalog_row = pb::CatalogSourceTable { - arrow_schema_ipc: CatalogCodec::encode_schema(&Arc::new( - source.produce_physical_schema(), - ))?, - event_time_field: source.event_time_field().map(str::to_string), - watermark_field: source.stream_catalog_watermark_field(), - with_options: opts.into_iter().collect(), - connector: source.connector().to_string(), - description: source.description.clone(), - }; - if matches!(table, CatalogTable::LookupTable(_)) { - table_definition::TableType::LookupTable(catalog_row) - } else { + CatalogEntity::ExternalConnector(b) => match b.as_ref() { + ExternalTable::Source(source) => { + let mut opts: std::collections::HashMap = + source.catalog_with_options.clone().into_iter().collect(); + opts.entry("connector".to_string()) + .or_insert_with(|| source.connector().to_string()); + let catalog_row = pb::CatalogSourceTable { + arrow_schema_ipc: CatalogCodec::encode_schema(&Arc::new( + source.produce_physical_schema(), + ))?, + event_time_field: source.event_time_field().map(str::to_string), + watermark_field: source.stream_catalog_watermark_field(), + with_options: opts, + connector: source.connector().to_string(), + description: source.description.clone(), + }; table_definition::TableType::ConnectorTable(catalog_row) } - } - CatalogTable::TableFromQuery { name, .. 
} => { + ExternalTable::Lookup(lookup) => { + let mut opts: std::collections::HashMap = + lookup.catalog_with_options.clone().into_iter().collect(); + opts.entry("connector".to_string()) + .or_insert_with(|| lookup.connector().to_string()); + let catalog_row = pb::CatalogSourceTable { + arrow_schema_ipc: CatalogCodec::encode_schema(&Arc::new( + lookup.produce_physical_schema(), + ))?, + event_time_field: None, + watermark_field: None, + with_options: opts, + connector: lookup.connector().to_string(), + description: lookup.description.clone(), + }; + table_definition::TableType::LookupTable(catalog_row) + } + ExternalTable::Sink(sink) => { + return plan_err!( + "Persisting SINK table '{}' in stream catalog is not supported", + sink.name() + ); + } + }, + CatalogEntity::ComputedTable { name, .. } => { return plan_err!( "Persisting query-defined table '{}' is not supported by stream catalog storage", name @@ -483,7 +435,7 @@ impl CatalogManager { table_name: String, source_row: pb::CatalogSourceTable, as_lookup: bool, - ) -> DFResult { + ) -> DFResult { let connector = if source_row.connector.is_empty() { source_row .with_options @@ -493,71 +445,75 @@ impl CatalogManager { } else { source_row.connector.clone() }; - let mut source = SourceTable::new( - table_name, - connector, - if as_lookup { - ConnectionType::Lookup - } else { - ConnectionType::Source - }, - ); + let schema = CatalogCodec::decode_schema(&source_row.arrow_schema_ipc)?; - source.schema_specs = schema + let schema_specs: Vec = schema .fields() .iter() .map(|f| ColumnDescriptor::new_physical((**f).clone())) .collect(); - source.inferred_fields = Some(schema.fields().iter().cloned().collect()); - source.temporal_config.event_column = source_row.event_time_field; - source.temporal_config.watermark_strategy_column = source_row - .watermark_field - .filter(|w| w != sql_field::COMPUTED_WATERMARK); - source.catalog_with_options = source_row.with_options.into_iter().collect(); - source.description = 
source_row.description; - - // Rebuild strongly-typed ConnectorConfig from persisted WITH options. - if source.connector().eq_ignore_ascii_case("kafka") { - use crate::sql::schema::ConnectorConfig; - use crate::sql::schema::kafka_operator_config::build_kafka_proto_config_from_string_map; - let opts_map: std::collections::HashMap = source - .catalog_with_options - .iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - let physical = source.produce_physical_schema(); - if let Ok(proto_cfg) = build_kafka_proto_config_from_string_map(opts_map, &physical) { - source.connector_config = match proto_cfg { - protocol::function_stream_graph::connector_op::Config::KafkaSource(cfg) => { - ConnectorConfig::KafkaSource(cfg) - } - protocol::function_stream_graph::connector_op::Config::KafkaSink(cfg) => { - ConnectorConfig::KafkaSink(cfg) - } - protocol::function_stream_graph::connector_op::Config::Generic(g) => { - ConnectorConfig::Generic(g.properties) - } - }; - } + let inferred_fields = Some(schema.fields().iter().cloned().collect()); + let physical_schema = schema.as_ref().clone(); + let catalog_with_options: BTreeMap = + source_row.with_options.clone().into_iter().collect(); + + let role = if as_lookup { + TableRole::Reference } else { - use crate::sql::schema::ConnectorConfig; - source.connector_config = ConnectorConfig::Generic( - source - .catalog_with_options - .iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(), - ); - } + TableRole::Ingestion + }; + let connector_config = build_connector_config_for_role( + &connector, + role, + &source_row.with_options, + &physical_schema, + )?; if as_lookup { - Ok(CatalogTable::LookupTable(source)) + Ok(CatalogEntity::external(ExternalTable::Lookup( + LookupTable { + table_identifier: table_name, + adapter_type: connector, + schema_specs, + connector_config, + key_constraints: Vec::new(), + lookup_cache_max_bytes: None, + lookup_cache_ttl: None, + connection_format: None, + description: source_row.description, + 
catalog_with_options, + registry_id: None, + inferred_fields, + }, + ))) } else { - Ok(CatalogTable::ConnectorTable(source)) + let watermark_field = source_row + .watermark_field + .filter(|w| w != sql_field::COMPUTED_WATERMARK); + Ok(CatalogEntity::external(ExternalTable::Source( + SourceTable { + table_identifier: table_name, + adapter_type: connector, + schema_specs, + connector_config, + temporal_config: TemporalPipelineConfig { + event_column: source_row.event_time_field, + watermark_strategy_column: watermark_field, + liveness_timeout: None, + }, + key_constraints: Vec::new(), + payload_format: None, + connection_format: None, + description: source_row.description, + catalog_with_options, + registry_id: None, + inferred_fields, + }, + ))) } } - fn decode_catalog_table(&self, proto_def: pb::TableDefinition) -> DFResult { + fn decode_catalog_table(&self, proto_def: pb::TableDefinition) -> DFResult { let Some(table_type) = proto_def.table_type else { return internal_err!( "Corrupted catalog row: missing table_type for {}", @@ -577,7 +533,7 @@ impl CatalogManager { fn load_catalog_tables_map( &self, - ) -> DFResult>> + ) -> DFResult>> { let mut out = std::collections::HashMap::new(); let records = self.store.scan_prefix(CATALOG_KEY_PREFIX)?; @@ -611,6 +567,52 @@ impl CatalogManager { } } +fn build_connector_config_for_role( + connector: &str, + role: TableRole, + with_options: &M, + physical_schema: &datafusion::arrow::datatypes::Schema, +) -> DFResult +where + for<'a> &'a M: IntoIterator, +{ + let flat: std::collections::HashMap = with_options + .into_iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + crate::sql::connector::factory::build_connector_config_from_catalog( + connector, + role, + flat, + physical_schema, + ) +} + +fn external_to_stream_table(table: &CatalogEntity) -> Option { + match table { + CatalogEntity::ExternalConnector(b) => match b.as_ref() { + ExternalTable::Source(s) => Some(StreamTable::Source { + name: s.name().to_string(), + 
connector: s.connector().to_string(), + schema: Arc::new(s.produce_physical_schema()), + event_time_field: s.event_time_field().map(str::to_string), + watermark_field: s.stream_catalog_watermark_field(), + with_options: s.catalog_with_options().clone(), + }), + ExternalTable::Lookup(l) => Some(StreamTable::Source { + name: l.name().to_string(), + connector: l.connector().to_string(), + schema: Arc::new(l.produce_physical_schema()), + event_time_field: None, + watermark_field: None, + with_options: l.catalog_with_options().clone(), + }), + ExternalTable::Sink(_) => None, + }, + CatalogEntity::ComputedTable { .. } => None, + } +} + pub fn restore_global_catalog_from_store() { let Some(mgr) = CatalogManager::try_global() else { return; @@ -677,22 +679,11 @@ pub fn restore_streaming_jobs_from_store() { program, checkpoint_interval_ms: interval_ms, latest_checkpoint_epoch: latest_epoch, - kafka_source_checkpoints, + source_checkpoints, } = job; let jm = job_manager.clone(); let name = table_name.clone(); - let job_dir = jm.job_state_directory(&table_name); - if let Err(e) = - materialize_kafka_source_checkpoints_from_catalog(&job_dir, &kafka_source_checkpoints) - { - warn!( - table = %table_name, - error = %e, - "Failed to materialize Kafka checkpoints from catalog before job restore" - ); - } - let custom_interval = if interval_ms > 0 { Some(interval_ms) } else { @@ -704,7 +695,13 @@ pub fn restore_streaming_jobs_from_store() { None }; - match rt.block_on(jm.submit_job(name.clone(), program, custom_interval, recovery_epoch)) { + match rt.block_on(jm.submit_job( + name.clone(), + program, + custom_interval, + recovery_epoch, + source_checkpoints, + )) { Ok(job_id) => { info!( table = %table_name, job_id = %job_id, @@ -727,36 +724,6 @@ pub fn restore_streaming_jobs_from_store() { ); } -pub fn initialize_stream_catalog(config: &crate::config::GlobalConfig) -> anyhow::Result<()> { - if !config.stream_catalog.persist { - return CatalogManager::init_global_in_memory() - 
.context("Stream catalog (CatalogManager) in-memory init failed"); - } - - let path = config - .stream_catalog - .db_path - .as_ref() - .map(|p| crate::config::resolve_path(p)) - .unwrap_or_else(|| crate::config::get_data_dir().join("stream_catalog")); - - std::fs::create_dir_all(&path).with_context(|| { - format!( - "Failed to create stream catalog directory {}", - path.display() - ) - })?; - - let store = std::sync::Arc::new(super::RocksDbMetaStore::open(&path).with_context(|| { - format!( - "Failed to open stream catalog RocksDB at {}", - path.display() - ) - })?); - - CatalogManager::init_global(store).context("Stream catalog (CatalogManager) init failed") -} - #[allow(clippy::unwrap_or_default)] pub fn planning_schema_provider() -> StreamPlanningContext { CatalogManager::try_global() @@ -770,10 +737,11 @@ mod tests { use datafusion::arrow::datatypes::{DataType, Field}; + use crate::sql::connector::config::ConnectorConfig; + use crate::sql::schema::catalog::{ExternalTable, SourceTable}; use crate::sql::schema::column_descriptor::ColumnDescriptor; - use crate::sql::schema::connection_type::ConnectionType; - use crate::sql::schema::source_table::SourceTable; - use crate::sql::schema::table::Table as CatalogTable; + use crate::sql::schema::table::CatalogEntity; + use crate::sql::schema::temporal_pipeline_config::TemporalPipelineConfig; use crate::storage::stream_catalog::InMemoryMetaStore; use super::CatalogManager; @@ -782,34 +750,34 @@ mod tests { CatalogManager::new(Arc::new(InMemoryMetaStore::new())) } - #[test] - fn add_table_roundtrip_snapshot() { - let mgr = create_test_manager(); - let mut source = SourceTable::new("t1", "kafka", ConnectionType::Source); - source.schema_specs = vec![ColumnDescriptor::new_physical(Field::new( - "a", - DataType::Int32, - false, - ))]; - source.temporal_config.event_column = Some("ts".into()); - let table = CatalogTable::ConnectorTable(source); - - mgr.add_catalog_table(table).unwrap(); - - let got = 
mgr.get_catalog_table("t1").unwrap().expect("table present"); - assert_eq!(got.name(), "t1"); + fn make_test_source(name: &str) -> SourceTable { + SourceTable { + table_identifier: name.to_string(), + adapter_type: "kafka".to_string(), + schema_specs: vec![ColumnDescriptor::new_physical(Field::new( + "a", + DataType::Int32, + false, + ))], + connector_config: ConnectorConfig::KafkaSource( + protocol::function_stream_graph::KafkaSourceConfig::default(), + ), + temporal_config: TemporalPipelineConfig::default(), + key_constraints: Vec::new(), + payload_format: None, + connection_format: None, + description: String::new(), + catalog_with_options: std::collections::BTreeMap::new(), + registry_id: None, + inferred_fields: None, + } } #[test] fn drop_table_if_exists() { let mgr = create_test_manager(); - let mut source = SourceTable::new("t_drop", "kafka", ConnectionType::Source); - source.schema_specs = vec![ColumnDescriptor::new_physical(Field::new( - "a", - DataType::Int32, - false, - ))]; - mgr.add_catalog_table(CatalogTable::ConnectorTable(source)) + let source = make_test_source("t_drop"); + mgr.add_catalog_table(CatalogEntity::external(ExternalTable::Source(source))) .unwrap(); mgr.drop_catalog_table("t_drop", false).unwrap(); diff --git a/src/storage/stream_catalog/meta_store.rs b/src/storage/stream_catalog/meta_store.rs index 6f61b3f7..741273cf 100644 --- a/src/storage/stream_catalog/meta_store.rs +++ b/src/storage/stream_catalog/meta_store.rs @@ -23,6 +23,19 @@ pub trait MetaStore: Send + Sync { fn get(&self, key: &str) -> Result>>; fn delete(&self, key: &str) -> Result<()>; fn scan_prefix(&self, prefix: &str) -> Result)>>; + + /// Atomic apply of many puts (`Some(value)`) and/or deletes (`None`). + /// Default implementation is sequential; backends should override with a + /// single transaction / `WriteBatch` when available. 
+ fn write_batch(&self, batch: Vec<(String, Option>)>) -> Result<()> { + for (k, v) in batch { + match v { + Some(val) => self.put(&k, val)?, + None => self.delete(&k)?, + } + } + Ok(()) + } } /// In-process KV store for single-node deployments and tests. @@ -67,4 +80,19 @@ impl MetaStore for InMemoryMetaStore { .map(|(k, v)| (k.clone(), v.clone())) .collect()) } + + fn write_batch(&self, batch: Vec<(String, Option>)>) -> Result<()> { + let mut db = self.db.write(); + for (k, v) in batch { + match v { + Some(val) => { + db.insert(k, val); + } + None => { + db.remove(&k); + } + } + } + Ok(()) + } } diff --git a/src/storage/stream_catalog/mod.rs b/src/storage/stream_catalog/mod.rs index ef176c40..08a987dd 100644 --- a/src/storage/stream_catalog/mod.rs +++ b/src/storage/stream_catalog/mod.rs @@ -17,10 +17,11 @@ mod manager; mod meta_store; mod rocksdb_meta_store; +pub use super::initialize_stream_catalog; + #[allow(unused_imports)] pub use manager::{ - CatalogManager, StoredStreamingJob, initialize_stream_catalog, - materialize_kafka_source_checkpoints_from_catalog, restore_global_catalog_from_store, + CatalogManager, StoredStreamingJob, restore_global_catalog_from_store, restore_streaming_jobs_from_store, }; pub use meta_store::{InMemoryMetaStore, MetaStore}; diff --git a/src/storage/stream_catalog/rocksdb_meta_store.rs b/src/storage/stream_catalog/rocksdb_meta_store.rs index 5315454f..1537f278 100644 --- a/src/storage/stream_catalog/rocksdb_meta_store.rs +++ b/src/storage/stream_catalog/rocksdb_meta_store.rs @@ -16,7 +16,7 @@ use std::sync::Arc; use anyhow::Context; use datafusion::common::Result; -use rocksdb::{DB, Direction, IteratorMode, Options}; +use rocksdb::{DB, Direction, IteratorMode, Options, WriteBatch}; use super::MetaStore; @@ -84,6 +84,24 @@ impl MetaStore for RocksDbMetaStore { } Ok(out) } + + fn write_batch(&self, batch: Vec<(String, Option>)>) -> Result<()> { + if batch.is_empty() { + return Ok(()); + } + let mut wb = WriteBatch::default(); + for 
(k, v) in batch { + match v { + Some(val) => wb.put(k.as_bytes(), val.as_slice()), + None => wb.delete(k.as_bytes()), + } + } + self.db.write(wb).map_err(|e| { + datafusion::common::DataFusionError::Execution(format!( + "stream catalog store write_batch: {e}" + )) + }) + } } #[cfg(test)] @@ -120,4 +138,27 @@ mod tests { let _ = std::fs::remove_dir_all(&dir); } + + #[test] + fn write_batch_put_and_delete() { + let dir: PathBuf = + std::env::temp_dir().join(format!("fs_stream_catalog_wb_{}", Uuid::new_v4())); + let _ = std::fs::remove_dir_all(&dir); + + let store = RocksDbMetaStore::open(&dir).expect("open"); + store + .write_batch(vec![ + ("k1".to_string(), Some(vec![1])), + ("k2".to_string(), Some(vec![2])), + ]) + .unwrap(); + assert_eq!(store.get("k1").unwrap(), Some(vec![1])); + assert_eq!(store.get("k2").unwrap(), Some(vec![2])); + + store.write_batch(vec![("k1".to_string(), None)]).unwrap(); + assert!(store.get("k1").unwrap().is_none()); + assert_eq!(store.get("k2").unwrap(), Some(vec![2])); + + let _ = std::fs::remove_dir_all(&dir); + } } diff --git a/tests/integration/Makefile b/tests/integration/Makefile index f16d640b..990436ac 100644 --- a/tests/integration/Makefile +++ b/tests/integration/Makefile @@ -15,7 +15,7 @@ # ----------------------------------------------------------------------- # Usage: # make test — Setup env + run pytest (PYTEST_ARGS="-k xxx") -# make clean — Remove .venv and test output +# make clean — Remove test output # # Prerequisites: # The FunctionStream binary must already be built (make build / make build-lite @@ -24,7 +24,7 @@ PROJECT_ROOT := $(shell git -C $(CURDIR) rev-parse --show-toplevel) PYTHON_ROOT := $(PROJECT_ROOT)/python -VENV := $(CURDIR)/.venv +VENV := $(PROJECT_ROOT)/.venv PIP := $(VENV)/bin/pip PY := $(VENV)/bin/python @@ -41,7 +41,7 @@ help: @echo "Integration Test Targets:" @echo "" @echo " test Setup Python env + run pytest (PYTEST_ARGS=...)" - @echo " clean Remove .venv and target/tests output" + @echo " clean 
Remove target/tests output" install: requirements.txt $(PYTHON_ROOT)/functionstream-api/pyproject.toml $(PYTHON_ROOT)/functionstream-client/pyproject.toml $(call log,ENV,Setting up Python virtual environment) @@ -58,7 +58,6 @@ test: install clean: $(call log,CLEAN,Removing test artifacts) - @rm -rf $(VENV) @rm -rf $(CURDIR)/target @rm -rf $(CURDIR)/install $(call success,Clean complete) diff --git a/tests/integration/README.md b/tests/integration/README.md index 23096720..55995055 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -1,88 +1,25 @@ # Integration Tests -## Prerequisites +This directory contains the end-to-end (E2E) integration test suite for FunctionStream. -| Dependency | Version | Purpose | -|------------|----------|-------------------------------------------------| -| Python | >= 3.9 | Test framework runtime | -| Rust | stable | Build the FunctionStream binary | -| Docker | >= 20.10 | Run a Kafka broker for streaming integration tests | +## 📋 Prerequisites -> **Docker is required.** The test framework automatically pulls and manages -> an `apache/kafka:3.7.0` container in KRaft mode to provide a real Kafka -> broker for tests that involve Kafka input/output. Tests will fail if the -> Docker daemon is not running. 
+Ensure the following dependencies are available in your `PATH` before running the suite: -## Quick Start +| Dependency | Version | Note | +|------------|----------|----------------------------------------------| +| Python | `>= 3.9` | Test framework runtime | +| Rust | `stable` | For compiling the `function-stream` binary | +| Docker | `>= 20.10`| Required for containerized infrastructure (Kafka, MinIO) | -```bash -# From the project root -make build # Build the release binary (with --features python) -make integration-test +*Note: Infrastructure containers (e.g., `apache/kafka:3.7.0` in KRaft mode, `minio/minio`) are automatically provisioned and torn down by the test framework via the Docker daemon.* -# Or run directly from this directory -cd tests/integration -make test -``` +## 🚀 Quick Start -## Directory Layout - -``` -tests/integration/ -├── Makefile # test / clean targets -├── requirements.txt # Python dependencies (pytest, grpcio, docker, ...) -├── pytest.ini # Pytest configuration -├── framework/ # Reusable test infrastructure -│ ├── instance.py # FunctionStreamInstance facade -│ ├── workspace.py # Per-test directory management -│ ├── config.py # Server config generation -│ ├── process.py # OS process lifecycle (start/stop/kill) -│ ├── utils.py # Port allocation, readiness probes -│ └── kafka_manager.py # Docker-managed Kafka broker (KRaft mode) -├── test/ # Test suites -│ ├── wasm/ # WASM function tests -│ │ └── python_sdk/ # Python SDK integration tests -│ └── streaming/ # Streaming engine tests (future) -└── target/ # Test output (git-ignored) - ├── .shared_cache/ # Shared WASM compilation cache across tests - └── ////logs/ -``` - -## Test Output - -Each test gets an isolated server instance with its own log directory: - -``` -target/wasm/python_sdk/TestFunctionLifecycle/test_full_lifecycle_transitions/20260416_221655/ - logs/ - app.log # FunctionStream application log - stdout.log # Server stdout - stderr.log # Server stderr -``` - -Only `logs/` is 
retained after tests complete; `conf/` and `data/` are -automatically cleaned up. - -## Python Dependencies - -All Python packages are listed in `requirements.txt` and installed -automatically by `make test`. Key dependencies: - -- `pytest` — test runner -- `grpcio` / `protobuf` — gRPC client communication -- `docker` — Docker SDK for managing the Kafka container -- `confluent-kafka` — Kafka admin client for topic management -- `functionstream-api` / `functionstream-client` — local editable installs - -## Running Specific Tests +The test suite requires a fresh release build of the engine with the `python` feature enabled. ```bash -# Single test -make test PYTEST_ARGS="-k test_full_lifecycle_transitions" - -# Single file -make test PYTEST_ARGS="test/wasm/python_sdk/test_lifecycle.py" - -# Verbose with live log -make test PYTEST_ARGS="-v --log-cli-level=DEBUG" +# From the project root +make build +make integration-test ``` diff --git a/tests/integration/framework/__init__.py b/tests/integration/framework/__init__.py index b735753a..b8c08d67 100644 --- a/tests/integration/framework/__init__.py +++ b/tests/integration/framework/__init__.py @@ -12,11 +12,14 @@ from .instance import FunctionStreamInstance -__all__ = ["FunctionStreamInstance", "KafkaDockerManager"] +__all__ = ["FunctionStreamInstance", "KafkaDockerManager", "MinioDockerManager"] def __getattr__(name: str): if name == "KafkaDockerManager": from .kafka_manager import KafkaDockerManager return KafkaDockerManager + if name == "MinioDockerManager": + from .minio_manager import MinioDockerManager + return MinioDockerManager raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/tests/integration/framework/minio_manager.py b/tests/integration/framework/minio_manager.py new file mode 100644 index 00000000..d10abf3e --- /dev/null +++ b/tests/integration/framework/minio_manager.py @@ -0,0 +1,170 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use 
this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Docker-managed MinIO service for integration tests. +""" + +from __future__ import annotations + +import logging +import time +from dataclasses import dataclass +from typing import Any, Dict, Optional + +import boto3 +import docker +import requests +from docker.errors import APIError, DockerException, NotFound + +from .utils import find_free_port + +logger = logging.getLogger(__name__) + + +class MinioDockerManagerError(Exception): + """Base exception for MinioDockerManager errors.""" + + +@dataclass(frozen=True) +class MinioConfig: + image: str = "minio/minio:RELEASE.2024-01-16T16-07-38Z" + container_name: str = "fs-integration-minio" + root_user: str = "minioadmin" + root_password: str = "minioadmin" + api_host: str = "127.0.0.1" + api_port: int = 9000 + console_port: int = 9001 + readiness_timeout_sec: int = 60 + + @property + def endpoint_url(self) -> str: + return f"http://{self.api_host}:{self.api_port}" + + +class MinioDockerManager: + def __init__( + self, + config: Optional[MinioConfig] = None, + docker_client: Optional[docker.DockerClient] = None, + ) -> None: + if config is None: + config = MinioConfig(api_port=find_free_port(), console_port=find_free_port()) + self.config = config + self._docker_client = docker_client + + @property + def docker_client(self) -> docker.DockerClient: + if self._docker_client is None: + try: + self._docker_client = docker.from_env() + except DockerException as e: + raise MinioDockerManagerError(f"Failed to connect to Docker daemon: {e}") from e + return 
self._docker_client + + @property + def s3_client(self): + return boto3.client( + "s3", + endpoint_url=self.config.endpoint_url, + aws_access_key_id=self.config.root_user, + aws_secret_access_key=self.config.root_password, + region_name="us-east-1", + ) + + def __enter__(self) -> "MinioDockerManager": + self.setup_minio() + return self + + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: + self.teardown_minio() + + def setup_minio(self) -> None: + self._ensure_image() + self._ensure_container() + self._wait_for_readiness() + logger.info("MinIO ready at %s", self.config.endpoint_url) + + def teardown_minio(self) -> None: + try: + container = self.docker_client.containers.get(self.config.container_name) + container.stop(timeout=5) + except NotFound: + logger.debug("MinIO container not found during teardown.") + except APIError as exc: + logger.warning("Docker API error while stopping MinIO: %s", exc) + + def create_bucket_if_not_exists(self, bucket: str) -> None: + client = self.s3_client + buckets = [b["Name"] for b in client.list_buckets().get("Buckets", [])] + if bucket not in buckets: + client.create_bucket(Bucket=bucket) + + def clear_bucket(self, bucket: str) -> None: + client = self.s3_client + token = None + while True: + kwargs: Dict[str, Any] = {"Bucket": bucket} + if token: + kwargs["ContinuationToken"] = token + resp = client.list_objects_v2(**kwargs) + contents = resp.get("Contents", []) + if contents: + for item in contents: + client.delete_object(Bucket=bucket, Key=item["Key"]) + if not resp.get("IsTruncated"): + break + token = resp.get("NextContinuationToken") + + def _ensure_image(self) -> None: + try: + self.docker_client.images.get(self.config.image) + except NotFound: + logger.info("Pulling MinIO image '%s'...", self.config.image) + self.docker_client.images.pull(self.config.image) + + def _ensure_container(self) -> None: + try: + container = self.docker_client.containers.get(self.config.container_name) + if container.status 
!= "running": + container.start() + return + except NotFound: + pass + + self.docker_client.containers.run( + image=self.config.image, + name=self.config.container_name, + command='server /data --console-address ":9001"', + ports={"9000/tcp": self.config.api_port, "9001/tcp": self.config.console_port}, + environment={ + "MINIO_ROOT_USER": self.config.root_user, + "MINIO_ROOT_PASSWORD": self.config.root_password, + }, + detach=True, + remove=True, + ) + + def _wait_for_readiness(self) -> None: + deadline = time.time() + self.config.readiness_timeout_sec + url = f"{self.config.endpoint_url}/minio/health/ready" + while time.time() < deadline: + try: + r = requests.get(url, timeout=1.5) + if r.status_code == 200: + return + except requests.RequestException: + pass + time.sleep(1.0) + raise MinioDockerManagerError( + f"MinIO not ready within {self.config.readiness_timeout_sec}s ({url})" + ) diff --git a/tests/integration/requirements.txt b/tests/integration/requirements.txt index d1e597c3..ac613dd9 100644 --- a/tests/integration/requirements.txt +++ b/tests/integration/requirements.txt @@ -19,6 +19,11 @@ protobuf>=4.25.0 # Docker + Kafka management for integration tests docker>=7.0 confluent-kafka>=2.3.0 +boto3>=1.34.0 +requests>=2.31.0 +pyarrow>=17.0.0 +lance>=1.1.0 +python-dateutil>=2.9.0 # FunctionStream Python packages (local editable installs) -e ../../python/functionstream-api diff --git a/tests/integration/test/wasm/python_sdk/conftest.py b/tests/integration/test/wasm/python_sdk/conftest.py index e0acd26e..c83a4763 100644 --- a/tests/integration/test/wasm/python_sdk/conftest.py +++ b/tests/integration/test/wasm/python_sdk/conftest.py @@ -10,19 +10,27 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import concurrent.futures import logging +import os import re import sys from pathlib import Path -from typing import Generator, List +from typing import Any, Generator, Set _CURRENT_DIR = Path(__file__).resolve().parent -_INTEGRATION_ROOT = Path(__file__).resolve().parents[3] +_INTEGRATION_ROOT = _CURRENT_DIR.parents[2] -if str(_INTEGRATION_ROOT) not in sys.path: - sys.path.insert(0, str(_INTEGRATION_ROOT)) -if str(_CURRENT_DIR) not in sys.path: - sys.path.insert(0, str(_CURRENT_DIR)) + +def _inject_path_safely(target_path: Path) -> None: + path_str = str(target_path) + if path_str not in sys.path: + sys.path.insert(0, path_str) + os.environ["PYTHONPATH"] = f"{path_str}{os.pathsep}{os.environ.get('PYTHONPATH', '')}" + + +_inject_path_safely(_INTEGRATION_ROOT) +_inject_path_safely(_CURRENT_DIR) import pytest from framework import FunctionStreamInstance, KafkaDockerManager @@ -33,41 +41,32 @@ @pytest.fixture(scope="session") def kafka() -> Generator[KafkaDockerManager, None, None]: - """ - Session-scoped Kafka broker manager. - Leverages Context Manager for guaranteed teardown. - """ with KafkaDockerManager() as mgr: yield mgr - try: - mgr.clear_all_topics() - except Exception as e: - logger.warning("Failed to clear topics during Kafka teardown: %s", e) @pytest.fixture(scope="session") def kafka_topics(kafka: KafkaDockerManager) -> str: - """ - Pre-creates standard topics and returns the bootstrap server address. 
- """ kafka.create_topics_if_not_exist(["in", "out", "events", "counts"]) return kafka.config.bootstrap_servers +@pytest.fixture(scope="session") +def minio() -> Generator[Any, None, None]: + try: + from framework import MinioDockerManager + except ModuleNotFoundError as exc: + pytest.skip(f"MinIO tests require optional dependency: {exc}") + with MinioDockerManager() as mgr: + yield mgr + + def _sanitize_segment(segment: str) -> str: clean = re.sub(r"[^\w\-]+", "_", segment).strip("_") return clean or "unknown" def _nodeid_to_workspace_path(nodeid: str) -> str: - """ - Convert pytest nodeid into a readable nested path under target/. - - Example: - test/wasm/python_sdk/test_data_flow.py::TestDataFlow::test_single_word_counting - -> - test/wasm/python_sdk/test_data_flow/TestDataFlow/test_single_word_counting - """ parts = nodeid.split("::") file_part = Path(parts[0]).with_suffix("") file_segments = [_sanitize_segment(seg) for seg in file_part.parts] @@ -77,10 +76,6 @@ def _nodeid_to_workspace_path(nodeid: str) -> str: @pytest.fixture def fs_server(request: pytest.FixtureRequest) -> Generator[FunctionStreamInstance, None, None]: - """ - Function-scoped FunctionStream instance. - Uses Context Manager to ensure SIGKILL and workspace cleanup. - """ test_name = _nodeid_to_workspace_path(request.node.nodeid) with FunctionStreamInstance(test_name=test_name) as instance: yield instance @@ -88,30 +83,54 @@ def fs_server(request: pytest.FixtureRequest) -> Generator[FunctionStreamInstanc @pytest.fixture def fs_client(fs_server: FunctionStreamInstance) -> Generator[FsClient, None, None]: - """ - Function-scoped FsClient connected to the isolated fs_server. - """ with fs_server.get_client() as client: yield client -@pytest.fixture -def function_registry(fs_client: FsClient) -> Generator[List[str], None, None]: - """ - RAII-style registry for FunctionStream tasks. - Ensures absolute teardown of functions to prevent state leakage. 
- """ - registered_names: List[str] = [] +class FunctionTracker: + def __init__(self, client: FsClient): + self._client = client + self._registered: Set[str] = set() - yield registered_names + def __contains__(self, name: str) -> bool: + return name in self._registered - for name in registered_names: + def append(self, name: str) -> None: + self._registered.add(name) + + def extend(self, names) -> None: + for name in names: + self._registered.add(name) + + def remove(self, name: str) -> None: + self._registered.discard(name) + + def register(self, name: str) -> None: + self._registered.add(name) + + def _teardown_single_function(self, name: str) -> None: try: - fs_client.stop_function(name) + self._client.stop_function(name) except Exception as e: - logger.debug("Failed to stop function '%s' during cleanup: %s", name, e) + logger.debug("Ignored stop error for '%s': %s", name, e) try: - fs_client.drop_function(name) + self._client.drop_function(name) except Exception as e: - logger.error("Failed to drop function '%s' during cleanup: %s", name, e) \ No newline at end of file + logger.debug("Ignored drop error for '%s': %s", name, e) + + def teardown_all(self) -> None: + if not self._registered: + return + + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: + list(executor.map(self._teardown_single_function, self._registered)) + + self._registered.clear() + + +@pytest.fixture +def function_registry(fs_client: FsClient) -> Generator[FunctionTracker, None, None]: + tracker = FunctionTracker(fs_client) + yield tracker + tracker.teardown_all() \ No newline at end of file diff --git a/tests/integration/test/wasm/python_sdk/test_streaming_sql_sinks.py b/tests/integration/test/wasm/python_sdk/test_streaming_sql_sinks.py new file mode 100644 index 00000000..a31d8956 --- /dev/null +++ b/tests/integration/test/wasm/python_sdk/test_streaming_sql_sinks.py @@ -0,0 +1,420 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use 
this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import datetime as dt +import io +import json +import shutil +import tempfile +import time +import uuid +from pathlib import Path +from typing import Any, Dict, List + +import lance +import pyarrow as pa +import pyarrow.parquet as pq +import pytest +from dateutil import parser as dt_parser + +from .test_data_flow import produce_messages + + +def _uid(prefix: str) -> str: + return f"{prefix}_{uuid.uuid4().hex[:8]}" + + +def _bucket_name(prefix: str) -> str: + return f"{prefix}-{uuid.uuid4().hex[:12]}".lower() + + +def _sql_ok(fs_server: Any, sql: str) -> Any: + resp = fs_server.execute_sql(sql) + assert resp.status_code == 200, f"SQL failed: {sql}\nstatus={resp.status_code}\nmsg={resp.message}" + return resp + + +def _sql_drop_streaming_table(fs_server: Any, table_name: str) -> None: + resp = fs_server.execute_sql(f"DROP STREAMING TABLE {table_name};") + if resp.status_code == 200: + return + msg = (resp.message or "").lower() + if "not found" in msg or "does not exist" in msg: + return + raise AssertionError( + f"SQL failed: DROP STREAMING TABLE {table_name};\n" + f"status={resp.status_code}\nmsg={resp.message}" + ) + + +def _create_source(fs_server: Any, source_name: str, in_topic: str, bootstrap: str) -> None: + _sql_ok( + fs_server, + f""" + CREATE TABLE {source_name} ( + id BIGINT, + value VARCHAR, + ts TIMESTAMP NOT NULL, + WATERMARK FOR ts AS ts - INTERVAL '1' SECOND + ) WITH ( + 'connector' = 'kafka', + 'topic' = '{in_topic}', + 'format' = 'json', + 'scan.startup.mode' = 'earliest', + 
'bootstrap.servers' = '{bootstrap}' + ); + """, + ) + + +def _create_sink_streaming_table( + fs_server: Any, + stream_name: str, + connector: str, + format_name: str, + with_extra: Dict[str, str], + source_name: str, + select_expr: str = "id, value, ts", +) -> None: + with_pairs = { + "connector": connector, + "type": "sink", + "format": format_name, + "checkpoint.interval.ms": "1000", + **with_extra, + } + with_sql = ",\n".join([f"'{k}' = '{v}'" for k, v in with_pairs.items()]) + _sql_ok( + fs_server, + f""" + CREATE STREAMING TABLE {stream_name} WITH ( + {with_sql} + ) AS + SELECT {select_expr} FROM {source_name}; + """, + ) + + +def _publish_rows(kafka_bootstrap: str, topic: str, rows: List[Dict[str, Any]]) -> None: + produce_messages(kafka_bootstrap, topic, [json.dumps(r) for r in rows]) + + +def _sample_rows() -> List[Dict[str, Any]]: + now = dt.datetime.now(dt.timezone.utc) + return [ + {"id": 1, "value": "alpha", "ts": (now - dt.timedelta(seconds=2)).isoformat()}, + {"id": 2, "value": "beta", "ts": (now - dt.timedelta(seconds=1)).isoformat()}, + {"id": 3, "value": "gamma", "ts": now.isoformat()}, + ] + + +def _parse_timestamp(val: Any) -> float: + if isinstance(val, dt.datetime): + # Parquet/Iceberg may return naive datetimes that are in UTC without a tz marker; + # treat them as UTC to avoid local-timezone skew (e.g. UTC+8 adds 28800 s). 
+ if val.tzinfo is None: + val = val.replace(tzinfo=dt.timezone.utc) + return val.timestamp() + if isinstance(val, str): + parsed = dt_parser.isoparse(val) + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=dt.timezone.utc) + return parsed.timestamp() + raise TypeError(f"Unknown timestamp type: {type(val)}") + + +def _assert_data_integrity(actual_rows: List[Dict[str, Any]], expected_rows: List[Dict[str, Any]]) -> None: + assert len(actual_rows) >= len(expected_rows), f"Expected at least {len(expected_rows)} rows, got {len(actual_rows)}" + + actual_mapped = {str(r["id"]): r for r in actual_rows} + for expected in expected_rows: + exp_id = str(expected["id"]) + assert exp_id in actual_mapped, f"Data Loss: Missing row with id {exp_id}" + + actual = actual_mapped[exp_id] + if "value" in expected: + assert str(actual.get("value")) == str(expected["value"]), f"Data Corruption: Value mismatch for id {exp_id}" + + assert "ts" in actual, f"Data Corruption: Missing timestamp column in output for id {exp_id}" + + expected_ts = _parse_timestamp(expected["ts"]) + actual_ts = _parse_timestamp(actual["ts"]) + assert abs(actual_ts - expected_ts) < 1.0, f"Timestamp drift too large for id {exp_id}: expected {expected_ts}, got {actual_ts}" + + +def _wait_and_verify_local_csv(dir_path: Path, expected_count: int, timeout_s: float = 30.0) -> List[Dict[str, Any]]: + deadline = time.time() + timeout_s + while time.time() < deadline: + files = list(dir_path.glob("*.csv")) + if files: + merged: List[Dict[str, Any]] = [] + for f in files: + try: + with f.open("r", encoding="utf-8") as fp: + merged.extend(list(csv.DictReader(fp))) + except Exception: + pass + if len(merged) >= expected_count: + return merged + time.sleep(1.0) + raise TimeoutError(f"Failed to verify {expected_count} rows in local CSV at {dir_path}") + + +def _wait_and_verify_local_parquet(dir_path: Path, expected_count: int, timeout_s: float = 30.0) -> List[Dict[str, Any]]: + deadline = time.time() + timeout_s + 
while time.time() < deadline: + files = list(dir_path.glob("**/*.parquet")) + if files: + try: + tables = [pq.read_table(f) for f in files] + if tables: + combined = pa.concat_tables(tables) + if combined.num_rows >= expected_count: + return combined.to_pylist() + except Exception: + pass + time.sleep(1.0) + raise TimeoutError(f"Failed to verify {expected_count} rows in local Parquet at {dir_path}") + + +def _wait_and_verify_local_lance(dir_path: Path, expected_count: int, timeout_s: float = 30.0) -> List[Dict[str, Any]]: + deadline = time.time() + timeout_s + while time.time() < deadline: + try: + ds = lance.dataset(dir_path.as_posix()) + if ds.count_rows() >= expected_count: + return ds.to_table().to_pylist() + except Exception: + pass + time.sleep(1.0) + raise TimeoutError(f"Failed to verify {expected_count} rows in local Lance dataset at {dir_path}") + + +def _wait_and_verify_s3_parquet(minio: Any, bucket: str, prefix: str, expected_count: int, timeout_s: float = 30.0) -> List[Dict[str, Any]]: + deadline = time.time() + timeout_s + client = minio.s3_client + while time.time() < deadline: + resp = client.list_objects_v2(Bucket=bucket, Prefix=prefix) + keys = [obj["Key"] for obj in resp.get("Contents", []) if obj["Key"].endswith(".parquet")] + if keys: + try: + tables = [] + for k in keys: + body = client.get_object(Bucket=bucket, Key=k)["Body"].read() + tables.append(pq.read_table(io.BytesIO(body))) + if tables: + combined = pa.concat_tables(tables) + if combined.num_rows >= expected_count: + return combined.to_pylist() + except Exception: + pass + time.sleep(1.0) + raise TimeoutError(f"Failed to verify {expected_count} rows in S3 Parquet at s3://{bucket}/{prefix}") + + +def _wait_and_verify_s3_lance(minio: Any, bucket: str, prefix: str, expected_count: int, timeout_s: float = 35.0) -> List[Dict[str, Any]]: + uri = f"s3://{bucket}/{prefix}" + deadline = time.time() + timeout_s + storage_options = { + "endpoint": minio.config.endpoint_url, + "access_key_id": 
minio.config.root_user, + "secret_access_key": minio.config.root_password, + "region": "us-east-1", + "allow_http": "true", + } + while time.time() < deadline: + try: + ds = lance.dataset(uri, storage_options=storage_options) + if ds.count_rows() >= expected_count: + return ds.to_table().to_pylist() + except Exception: + pass + time.sleep(1.0) + raise TimeoutError(f"Failed to verify {expected_count} rows in S3 Lance dataset at {uri}") + + +class TestStreamingSqlSinks: + def test_filesystem_csv_sink(self, fs_server: Any, kafka: Any, kafka_topics: str) -> None: + source_name = _uid("src") + stream_name = _uid("st_fs_csv") + in_topic = _uid("topic_in") + kafka.create_topics_if_not_exist([in_topic]) + _create_source(fs_server, source_name, in_topic, kafka_topics) + + temp_dir = Path(tempfile.mkdtemp(prefix="fs_sink_csv_")) + try: + _create_sink_streaming_table( + fs_server, + stream_name, + connector="filesystem", + format_name="csv", + with_extra={"path": temp_dir.as_posix()}, + source_name=source_name, + ) + expected_data = _sample_rows() + _publish_rows(kafka_topics, in_topic, expected_data) + actual_data = _wait_and_verify_local_csv(temp_dir, len(expected_data)) + _assert_data_integrity(actual_data, expected_data) + finally: + _sql_drop_streaming_table(fs_server, stream_name) + shutil.rmtree(temp_dir, ignore_errors=True) + + def test_s3_parquet_sink(self, fs_server: Any, kafka: Any, kafka_topics: str, minio: Any) -> None: + source_name = _uid("src") + stream_name = _uid("st_s3_pq") + in_topic = _uid("topic_in") + kafka.create_topics_if_not_exist([in_topic]) + _create_source(fs_server, source_name, in_topic, kafka_topics) + + bucket = _bucket_name("sink-bucket") + prefix = f"sinks/{stream_name}" + minio.create_bucket_if_not_exists(bucket) + try: + _create_sink_streaming_table( + fs_server, + stream_name, + connector="s3", + format_name="parquet", + with_extra={ + "path": prefix, + "s3.bucket": bucket, + "s3.region": "us-east-1", + "s3.endpoint": 
minio.config.endpoint_url, + "s3.access_key_id": minio.config.root_user, + "s3.secret_access_key": minio.config.root_password, + }, + source_name=source_name, + ) + expected_data = _sample_rows() + _publish_rows(kafka_topics, in_topic, expected_data) + actual_data = _wait_and_verify_s3_parquet(minio, bucket, prefix, len(expected_data)) + _assert_data_integrity(actual_data, expected_data) + finally: + _sql_drop_streaming_table(fs_server, stream_name) + minio.clear_bucket(bucket) + + def test_delta_parquet_sink(self, fs_server: Any, kafka: Any, kafka_topics: str) -> None: + source_name = _uid("src") + stream_name = _uid("st_delta_pq") + in_topic = _uid("topic_in") + kafka.create_topics_if_not_exist([in_topic]) + _create_source(fs_server, source_name, in_topic, kafka_topics) + + temp_dir = Path(tempfile.mkdtemp(prefix="delta_sink_pq_")) + try: + _create_sink_streaming_table( + fs_server, + stream_name, + connector="delta", + format_name="parquet", + with_extra={"path": temp_dir.as_posix()}, + source_name=source_name, + ) + expected_data = _sample_rows() + _publish_rows(kafka_topics, in_topic, expected_data) + actual_data = _wait_and_verify_local_parquet(temp_dir, len(expected_data)) + _assert_data_integrity(actual_data, expected_data) + finally: + _sql_drop_streaming_table(fs_server, stream_name) + shutil.rmtree(temp_dir, ignore_errors=True) + + def test_iceberg_parquet_sink(self, fs_server: Any, kafka: Any, kafka_topics: str) -> None: + source_name = _uid("src") + stream_name = _uid("st_iceberg_pq") + in_topic = _uid("topic_in") + kafka.create_topics_if_not_exist([in_topic]) + _create_source(fs_server, source_name, in_topic, kafka_topics) + + temp_dir = Path(tempfile.mkdtemp(prefix="iceberg_sink_pq_")) + try: + _create_sink_streaming_table( + fs_server, + stream_name, + connector="iceberg", + format_name="parquet", + with_extra={"path": temp_dir.as_posix()}, + source_name=source_name, + ) + expected_data = _sample_rows() + _publish_rows(kafka_topics, in_topic, 
expected_data) + actual_data = _wait_and_verify_local_parquet(temp_dir, len(expected_data)) + _assert_data_integrity(actual_data, expected_data) + finally: + _sql_drop_streaming_table(fs_server, stream_name) + shutil.rmtree(temp_dir, ignore_errors=True) + + def test_lancedb_lance_sink(self, fs_server: Any, kafka: Any, kafka_topics: str) -> None: + source_name = _uid("src") + stream_name = _uid("st_lancedb") + in_topic = _uid("topic_in") + kafka.create_topics_if_not_exist([in_topic]) + _create_source(fs_server, source_name, in_topic, kafka_topics) + + temp_dir = Path(tempfile.mkdtemp(prefix="lancedb_sink_")) + try: + _create_sink_streaming_table( + fs_server, + stream_name, + connector="lanceDB", + format_name="lance", + with_extra={"path": temp_dir.as_posix()}, + source_name=source_name, + select_expr="id, ts", + ) + expected_data = _sample_rows() + _publish_rows(kafka_topics, in_topic, expected_data) + actual_data = _wait_and_verify_local_lance(temp_dir, len(expected_data)) + expected_truncated = [{"id": r["id"], "ts": r["ts"]} for r in expected_data] + _assert_data_integrity(actual_data, expected_truncated) + finally: + _sql_drop_streaming_table(fs_server, stream_name) + shutil.rmtree(temp_dir, ignore_errors=True) + + def test_lancedb_lance_sink_to_s3( + self, fs_server: Any, kafka: Any, kafka_topics: str, minio: Any + ) -> None: + source_name = _uid("src") + stream_name = _uid("st_lancedb_s3") + in_topic = _uid("topic_in") + kafka.create_topics_if_not_exist([in_topic]) + _create_source(fs_server, source_name, in_topic, kafka_topics) + + bucket = _bucket_name("lance-bucket") + prefix = f"sinks/{stream_name}" + minio.create_bucket_if_not_exists(bucket) + s3_path = f"s3://{bucket}/{prefix}" + try: + _create_sink_streaming_table( + fs_server, + stream_name, + connector="lanceDB", + format_name="lance", + with_extra={ + "path": s3_path, + "s3.bucket": bucket, + "s3.region": "us-east-1", + "s3.endpoint": minio.config.endpoint_url, + "s3.access_key_id": 
minio.config.root_user, + "s3.secret_access_key": minio.config.root_password, + }, + source_name=source_name, + select_expr="id, ts", + ) + expected_data = _sample_rows() + _publish_rows(kafka_topics, in_topic, expected_data) + actual_data = _wait_and_verify_s3_lance(minio, bucket, prefix, len(expected_data)) + expected_truncated = [{"id": r["id"], "ts": r["ts"]} for r in expected_data] + _assert_data_integrity(actual_data, expected_truncated) + finally: + _sql_drop_streaming_table(fs_server, stream_name) + minio.clear_bucket(bucket)