diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml
index bd842ade2817..bdf29edb9beb 100644
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -22,8 +22,6 @@ on:
branches:
- dev
paths-ignore:
- - 'docs/**'
- - '**/*.md'
- 'seatunnel-ui/**'
concurrency:
diff --git a/config/seatunnel.yaml b/config/seatunnel.yaml
index 7c689a328d3d..5961c839238b 100644
--- a/config/seatunnel.yaml
+++ b/config/seatunnel.yaml
@@ -27,8 +27,6 @@ seatunnel:
checkpoint:
interval: 10000
timeout: 60000
- max-concurrent: 1
- tolerable-failure: 2
storage:
type: hdfs
max-retained: 3
diff --git a/docs/en/connector-v2/formats/kafka-compatible-kafkaconnect-json.md b/docs/en/connector-v2/formats/kafka-compatible-kafkaconnect-json.md
new file mode 100644
index 000000000000..7de8a9e838b2
--- /dev/null
+++ b/docs/en/connector-v2/formats/kafka-compatible-kafkaconnect-json.md
@@ -0,0 +1,47 @@
+# Kafka source compatible kafka-connect-json
+
+The SeaTunnel Kafka connector supports parsing data extracted through the Kafka Connect source, especially data extracted by Kafka Connect JDBC and Kafka Connect Debezium.
+
+## How To Use
+
+### Kafka to MySQL
+
+```hocon
+env {
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ Kafka {
+ bootstrap.servers = "localhost:9092"
+ topic = "jdbc_source_record"
+ result_table_name = "kafka_table"
+ start_mode = earliest
+ schema = {
+ fields {
+ id = "int"
+ name = "string"
+ description = "string"
+ weight = "string"
+ }
+ },
+ format = COMPATIBLE_KAFKA_CONNECT_JSON
+ }
+}
+
+
+sink {
+ Jdbc {
+ driver = com.mysql.cj.jdbc.Driver
+ url = "jdbc:mysql://localhost:3306/seatunnel"
+ user = st_user
+ password = seatunnel
+ generate_sink_sql = true
+ database = seatunnel
+ table = jdbc_sink
+ primary_keys = ["id"]
+ }
+}
+```
+
diff --git a/docs/en/connector-v2/sink/Redis.md b/docs/en/connector-v2/sink/Redis.md
index fcface7da22a..7d2ef237e1ce 100644
--- a/docs/en/connector-v2/sink/Redis.md
+++ b/docs/en/connector-v2/sink/Redis.md
@@ -23,6 +23,7 @@ Used to write data to Redis.
| mode | string | no | single |
| nodes | list | yes when mode=cluster | - |
| format | string | no | json |
+| expire | long | no | -1 |
| common-options | | no | - |
### host [string]
@@ -120,6 +121,10 @@ Connector will generate data as the following and write it to redis:
```
+### expire [long]
+
+Set the expiration time of the Redis key, in seconds. The default value is -1, which means keys do not expire automatically.
+
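+For reference, a minimal sink sketch using this option might look like the following (host, port, key and data_type are placeholder values for illustration); with `expire = 86400` the written keys expire after one day:
+
+```hocon
+sink {
+  Redis {
+    host = localhost
+    port = 6379
+    key = age
+    data_type = key
+    expire = 86400
+  }
+}
+```
+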
### common options
Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details
diff --git a/docs/en/connector-v2/sink/S3File.md b/docs/en/connector-v2/sink/S3File.md
index 7841afdf04e3..4bb670ae38c8 100644
--- a/docs/en/connector-v2/sink/S3File.md
+++ b/docs/en/connector-v2/sink/S3File.md
@@ -1,24 +1,17 @@
# S3File
-> S3 file sink connector
+> S3 File Sink Connector
-## Description
-
-Output data to aws s3 file system.
-
-:::tip
+## Support Those Engines
-If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x.
+> Spark
+> Flink
+> SeaTunnel Zeta
-If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this.
-
-To use this connector you need put hadoop-aws-3.1.4.jar and aws-java-sdk-bundle-1.11.271.jar in ${SEATUNNEL_HOME}/lib dir.
-
-:::
-
-## Key features
+## Key Features
- [x] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [cdc](../../concept/connector-v2-features.md)
By default, we use 2PC commit to ensure `exactly-once`
@@ -30,59 +23,100 @@ By default, we use 2PC commit to ensure `exactly-once`
- [x] json
- [x] excel
-## Options
-
-| name | type | required | default value | remarks |
-|----------------------------------|---------|----------|-------------------------------------------------------|--------------------------------------------------------------------------------------------------------|
-| path | string | yes | - | |
-| bucket | string | yes | - | |
-| fs.s3a.endpoint | string | yes | - | |
-| fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider | |
-| access_key | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider |
-| access_secret | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider |
-| custom_filename | boolean | no | false | Whether you need custom the filename |
-| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true |
-| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true |
-| file_format_type | string | no | "csv" | |
-| field_delimiter | string | no | '\001' | Only used when file_format_type is text |
-| row_delimiter | string | no | "\n" | Only used when file_format_type is text |
-| have_partition | boolean | no | false | Whether you need processing partitions. |
-| partition_by | array | no | - | Only used then have_partition is true |
-| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true |
-| is_partition_field_write_in_file | boolean | no | false | Only used then have_partition is true |
-| sink_columns | array | no | | When this parameter is empty, all fields are sink columns |
-| is_enable_transaction | boolean | no | true | |
-| batch_size | int | no | 1000000 | |
-| compress_codec | string | no | none | |
-| common-options | object | no | - | |
-| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. |
-| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. |
-
-### path [string]
-
-The target dir path is required.
-
-### bucket [string]
-
-The bucket address of s3 file system, for example: `s3n://seatunnel-test`, if you use `s3a` protocol, this parameter should be `s3a://seatunnel-test`.
-
-### fs.s3a.endpoint [string]
-
-fs s3a endpoint
-
-### fs.s3a.aws.credentials.provider [string]
-
-The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now.
-
-More information about the credential provider you can see [Hadoop AWS Document](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Simple_name.2Fsecret_credentials_with_SimpleAWSCredentialsProvider.2A)
-
-### access_key [string]
-
-The access key of s3 file system. If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)
+## Description
-### access_secret [string]
+Output data to aws s3 file system.
-The access secret of s3 file system. If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)
+## Supported DataSource Info
+
+| Datasource | Supported Versions |
+|------------|--------------------|
+| S3 | current |
+
+## Database Dependency
+
+> If you use Spark/Flink, in order to use this connector, you must ensure your Spark/Flink cluster has already integrated Hadoop. The tested Hadoop version is 2.x.
+>
+> If you use SeaTunnel Engine, the Hadoop jar is automatically integrated when you download and install SeaTunnel Engine. You can check the jar packages under `${SEATUNNEL_HOME}/lib` to confirm this.
+> To use this connector you need to put `hadoop-aws-3.1.4.jar` and `aws-java-sdk-bundle-1.11.271.jar` in the `${SEATUNNEL_HOME}/lib` dir.
+
+## Data Type Mapping
+
+If you write to the `csv` or `text` file type, all columns will be written as strings.
+
+### Orc File Type
+
+| SeaTunnel Data type | Orc Data type |
+|----------------------|-----------------------|
+| STRING | STRING |
+| BOOLEAN | BOOLEAN |
+| TINYINT | BYTE |
+| SMALLINT | SHORT |
+| INT | INT |
+| BIGINT | LONG |
+| FLOAT | FLOAT |
+| DOUBLE | DOUBLE |
+| DECIMAL | DECIMAL |
+| BYTES | BINARY |
+| DATE | DATE |
+| TIME<br/>TIMESTAMP   | TIMESTAMP             |
+| ROW | STRUCT |
+| NULL | UNSUPPORTED DATA TYPE |
+| ARRAY | LIST |
+| Map | Map |
+
+### Parquet File Type
+
+| SeaTunnel Data type | Parquet Data type |
+|----------------------|-----------------------|
+| STRING | STRING |
+| BOOLEAN | BOOLEAN |
+| TINYINT | INT_8 |
+| SMALLINT | INT_16 |
+| INT | INT32 |
+| BIGINT | INT64 |
+| FLOAT | FLOAT |
+| DOUBLE | DOUBLE |
+| DECIMAL | DECIMAL |
+| BYTES | BINARY |
+| DATE | DATE |
+| TIME<br/>TIMESTAMP   | TIMESTAMP_MILLIS      |
+| ROW | GroupType |
+| NULL | UNSUPPORTED DATA TYPE |
+| ARRAY | LIST |
+| Map | Map |
+
+## Sink Options
+
+| name | type | required | default value | Description |
+|----------------------------------|---------|----------|-------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| path | string | yes | - | |
+| bucket | string | yes | - | |
+| fs.s3a.endpoint | string | yes | - | |
+| fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider | The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now. |
+| access_key | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider |
+| access_secret | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider |
+| custom_filename | boolean | no | false | Whether you need custom the filename |
+| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true |
+| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true |
+| file_format_type | string | no | "csv" | |
+| field_delimiter                  | string  | no       | '\001'                                                 | Only used when file_format_type is text                                                                                                                                 |
+| row_delimiter                    | string  | no       | "\n"                                                   | Only used when file_format_type is text                                                                                                                                 |
+| have_partition | boolean | no | false | Whether you need processing partitions. |
+| partition_by | array | no | - | Only used when have_partition is true |
+| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used when have_partition is true |
+| is_partition_field_write_in_file | boolean | no | false | Only used when have_partition is true |
+| sink_columns | array | no | | When this parameter is empty, all fields are sink columns |
+| is_enable_transaction | boolean | no | true | |
+| batch_size | int | no | 1000000 | |
+| compress_codec | string | no | none | |
+| common-options | object | no | - | |
+| max_rows_in_memory               | int     | no       | -                                                      | Only used when file_format_type is excel.                                                                                                                               |
+| sheet_name                       | string  | no       | Sheet${Random number}                                  | Only used when file_format_type is excel.                                                                                                                               |
+| hadoop_s3_properties             | map     | no       |                                                        | If you need to add other options, you can add them here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)          |
### hadoop_s3_properties [map]
@@ -208,6 +242,83 @@ Writer the sheet of the workbook
## Example
+### Simple:
+
+> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to the S3File sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). A file will be created under the target S3 dir and all of the data will be written into it.
+> Before running this job, you need to create the S3 path /seatunnel/text. If you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. Then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job.
+
+```hocon
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ # This is an example source plugin **only for test and demonstrate the feature source plugin**
+ FakeSource {
+ parallelism = 1
+ result_table_name = "fake"
+ row.num = 16
+ schema = {
+ fields {
+ c_map = "map>"
+ c_array = "array"
+ name = string
+ c_boolean = boolean
+ age = tinyint
+ c_smallint = smallint
+ c_int = int
+ c_bigint = bigint
+ c_float = float
+ c_double = double
+ c_decimal = "decimal(16, 1)"
+ c_null = "null"
+ c_bytes = bytes
+ c_date = date
+ c_timestamp = timestamp
+ }
+ }
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of source plugins,
+ # please go to https://seatunnel.apache.org/docs/category/source-v2
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/category/transform-v2
+}
+
+sink {
+ S3File {
+ bucket = "s3a://seatunnel-test"
+ tmp_path = "/tmp/seatunnel"
+ path="/seatunnel/text"
+ fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn"
+ fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider"
+ file_format_type = "text"
+ field_delimiter = "\t"
+ row_delimiter = "\n"
+ have_partition = true
+ partition_by = ["age"]
+ partition_dir_expression = "${k0}=${v0}"
+ is_partition_field_write_in_file = true
+ custom_filename = true
+ file_name_expression = "${transactionId}_${now}"
+ filename_time_format = "yyyy.MM.dd"
+ sink_columns = ["name","age"]
+ is_enable_transaction=true
+ hadoop_s3_properties {
+ "fs.s3a.buffer.dir" = "/data/st_test/s3a"
+ "fs.s3a.fast.upload.buffer" = "disk"
+ }
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of sink plugins,
+ # please go to https://seatunnel.apache.org/docs/category/sink-v2
+}
+```
+
For text file format with `have_partition` and `custom_filename` and `sink_columns` and `com.amazonaws.auth.InstanceProfileCredentialsProvider`
```hocon
diff --git a/docs/en/connector-v2/source/MyHours.md b/docs/en/connector-v2/source/MyHours.md
index ec3a93553364..f90d42ab1cba 100644
--- a/docs/en/connector-v2/source/MyHours.md
+++ b/docs/en/connector-v2/source/MyHours.md
@@ -2,11 +2,13 @@
> My Hours source connector
-## Description
+## Support Those Engines
-Used to read data from My Hours.
+> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features
+## Key Features
- [x] [batch](../../concept/connector-v2-features.md)
- [ ] [stream](../../concept/connector-v2-features.md)
@@ -15,71 +17,103 @@ Used to read data from My Hours.
- [ ] [parallelism](../../concept/connector-v2-features.md)
- [ ] [support user-defined split](../../concept/connector-v2-features.md)
-## Options
-
-| name | type | required | default value |
-|-----------------------------|---------|----------|---------------|
-| url | String | Yes | - |
-| email | String | Yes | - |
-| password | String | Yes | - |
-| method | String | No | get |
-| schema | Config | No | - |
-| schema.fields | Config | No | - |
-| format | String | No | json |
-| params | Map | No | - |
-| body | String | No | - |
-| json_field | Config | No | - |
-| content_json | String | No | - |
-| poll_interval_ms | int | No | - |
-| retry | int | No | - |
-| retry_backoff_multiplier_ms | int | No | 100 |
-| retry_backoff_max_ms | int | No | 10000 |
-| enable_multi_lines | boolean | No | false |
-| common-options | config | No | - |
-
-### url [String]
-
-http request url
-
-### email [String]
-
-email for login
-
-### password [String]
-
-password for login
-
-### method [String]
-
-http request method, only supports GET, POST method
-
-### params [Map]
-
-http params
-
-### body [String]
-
-http body
-
-### poll_interval_ms [int]
+## Description
-request http api interval(millis) in stream mode
+Used to read data from My Hours.
-### retry [int]
-The max retry times if request http return to `IOException`
-### retry_backoff_multiplier_ms [int]
+## Supported DataSource Info
+
+In order to use the My Hours connector, the following dependencies are required.
+They can be downloaded via install-plugin.sh or from the Maven central repository.
+
+| Datasource | Supported Versions | Dependency |
+|------------|--------------------|---------------------------------------------------------------------------------------------|
+| My Hours | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2) |
+
+## Source Options
+
+| Name | Type | Required | Default | Description |
+|-----------------------------|---------|----------|---------|--------------------------------------------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | Http request url. |
+| email | String | Yes | - | My hours login email address. |
+| password | String | Yes | - | My hours login password. |
+| schema | Config | No | - | Http and seatunnel data structure mapping |
+| schema.fields | Config | No | - | The schema fields of upstream data |
+| json_field                  | Config  | No       | -       | This parameter helps you configure the schema, so it must be used together with the schema option.                                     |
+| content_json                | String  | No       | -       | This parameter can get some json data. If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`.  |
+| format                      | String  | No       | json    | The format of upstream data, now only supports `json` and `text`, default `json`.                                                      |
+| method | String | No | get | Http request method, only supports GET, POST method. |
+| headers | Map | No | - | Http headers. |
+| params | Map | No | - | Http params. |
+| body | String | No | - | Http body. |
+| poll_interval_ms | Int | No | - | Request http api interval(millis) in stream mode. |
+| retry | Int | No | - | The max retry times if request http return to `IOException`. |
+| retry_backoff_multiplier_ms | Int | No | 100 | The retry-backoff times(millis) multiplier if request http failed. |
+| retry_backoff_max_ms | Int | No | 10000 | The maximum retry-backoff times(millis) if request http failed |
+| enable_multi_lines | Boolean | No | false | |
+| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
+
+## How to Create a My Hours Data Synchronization Job
-The retry-backoff times(millis) multiplier if request http failed
+```hocon
+env {
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
-### retry_backoff_max_ms [int]
+source {
+ MyHours {
+ url = "https://api2.myhours.com/api/Projects/getAll"
+ email = "seatunnel@test.com"
+ password = "seatunnel"
+ schema {
+ fields {
+ name = string
+ archived = boolean
+ dateArchived = string
+ dateCreated = string
+ clientName = string
+ budgetAlertPercent = string
+ budgetType = int
+ totalTimeLogged = double
+ budgetValue = double
+ totalAmount = double
+ totalExpense = double
+ laborCost = double
+ totalCost = double
+ billableTimeLogged = double
+ totalBillableAmount = double
+ billable = boolean
+ roundType = int
+ roundInterval = int
+ budgetSpentPercentage = double
+ budgetTarget = int
+ budgetPeriodType = string
+ budgetSpent = string
+ id = string
+ }
+ }
+ }
+}
-The maximum retry-backoff times(millis) if request http failed
+# Console printing of the read data
+sink {
+ Console {
+ parallelism = 1
+ }
+}
+```
-### format [String]
+## Parameter Interpretation
-the format of upstream data, now only support `json` `text`, default `json`.
+### format
when you assign format is `json`, you should also assign schema option, for example:
@@ -98,11 +132,11 @@ you should assign schema as the following:
```hocon
schema {
- fields {
- code = int
- data = string
- success = boolean
- }
+ fields {
+ code = int
+ data = string
+ success = boolean
+ }
}
```
@@ -131,13 +165,7 @@ connector will generate data as the following:
|----------------------------------------------------------|
| {"code": 200, "data": "get success", "success": true} |
-### schema [Config]
-
-#### fields [Config]
-
-the schema fields of upstream data
-
-### content_json [String]
+### content_json
This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`.
@@ -212,14 +240,14 @@ Here is an example:
- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json)
- See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf).
-### json_field [Config]
+### json_field
This parameter helps you configure the schema,so this parameter must be used with schema.
If your data looks something like this:
```json
-{
+{
"store": {
"book": [
{
@@ -273,47 +301,6 @@ source {
- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json)
- See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf).
-### common options
-
-Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details
-
-## Example
-
-```hocon
-MyHours{
- url = "https://api2.myhours.com/api/Projects/getAll"
- email = "seatunnel@test.com"
- password = "seatunnel"
- schema {
- fields {
- name = string
- archived = boolean
- dateArchived = string
- dateCreated = string
- clientName = string
- budgetAlertPercent = string
- budgetType = int
- totalTimeLogged = double
- budgetValue = double
- totalAmount = double
- totalExpense = double
- laborCost = double
- totalCost = double
- billableTimeLogged = double
- totalBillableAmount = double
- billable = boolean
- roundType = int
- roundInterval = int
- budgetSpentPercentage = double
- budgetTarget = int
- budgetPeriodType = string
- budgetSpent = string
- id = string
- }
- }
-}
-```
-
## Changelog
### next version
diff --git a/docs/en/connector-v2/source/S3File.md b/docs/en/connector-v2/source/S3File.md
index f7ad1cc8bd0f..54124a370382 100644
--- a/docs/en/connector-v2/source/S3File.md
+++ b/docs/en/connector-v2/source/S3File.md
@@ -1,22 +1,14 @@
# S3File
-> S3 file source connector
+> S3 File Source Connector
-## Description
-
-Read data from aws s3 file system.
-
-:::tip
-
-If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x.
-
-If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this.
+## Support Those Engines
-To use this connector you need put hadoop-aws-3.1.4.jar and aws-java-sdk-bundle-1.11.271.jar in ${SEATUNNEL_HOME}/lib dir.
+> Spark
+> Flink
+> SeaTunnel Zeta
-:::
-
-## Key features
+## Key Features
- [x] [batch](../../concept/connector-v2-features.md)
- [ ] [stream](../../concept/connector-v2-features.md)
@@ -35,104 +27,31 @@ Read all the data in a split in a pollNext call. What splits are read will be sa
- [x] json
- [x] excel
-## Options
-
-| name | type | required | default value |
-|---------------------------------|---------|----------|-------------------------------------------------------|
-| path | string | yes | - |
-| file_format_type | string | yes | - |
-| bucket | string | yes | - |
-| fs.s3a.endpoint | string | yes | - |
-| fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider |
-| read_columns | list | no | - |
-| access_key | string | no | - |
-| access_secret | string | no | - |
-| hadoop_s3_properties | map | no | - |
-| delimiter | string | no | \001 |
-| parse_partition_from_path | boolean | no | true |
-| date_format | string | no | yyyy-MM-dd |
-| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
-| time_format | string | no | HH:mm:ss |
-| skip_header_row_number | long | no | 0 |
-| schema | config | no | - |
-| common-options | | no | - |
-| sheet_name | string | no | - |
-| file_filter_pattern | string | no | - |
-
-### path [string]
-
-The source file path.
-
-### fs.s3a.endpoint [string]
-
-fs s3a endpoint
-
-### fs.s3a.aws.credentials.provider [string]
-
-The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now.
-
-More information about the credential provider you can see [Hadoop AWS Document](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Simple_name.2Fsecret_credentials_with_SimpleAWSCredentialsProvider.2A)
-
-### delimiter [string]
-
-Field delimiter, used to tell connector how to slice and dice fields when reading text files
-
-default `\001`, the same as hive's default delimiter
-
-### parse_partition_from_path [boolean]
-
-Control whether parse the partition keys and values from file path
-
-For example if you read a file from path `s3n://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`
-
-Every record data from file will be added these two fields:
-
-| name | age |
-|---------------|-----|
-| tyrantlucifer | 26 |
-
-Tips: **Do not define partition fields in schema option**
-
-### date_format [string]
-
-Date type format, used to tell connector how to convert string to date, supported as the following formats:
-
-`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`
-
-default `yyyy-MM-dd`
-
-### datetime_format [string]
-
-Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:
-
-`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss`
-
-default `yyyy-MM-dd HH:mm:ss`
-
-### time_format [string]
-
-Time type format, used to tell connector how to convert string to time, supported as the following formats:
-
-`HH:mm:ss` `HH:mm:ss.SSS`
-
-default `HH:mm:ss`
+## Description
-### skip_header_row_number [long]
+Read data from aws s3 file system.
-Skip the first few lines, but only for the txt and csv.
+## Supported DataSource Info
-For example, set like following:
+| Datasource | Supported versions |
+|------------|--------------------|
+| S3 | current |
-`skip_header_row_number = 2`
+## Dependency
-then SeaTunnel will skip the first 2 lines from source files
+> If you use Spark/Flink, in order to use this connector, you must ensure your Spark/Flink cluster has already integrated Hadoop. The tested Hadoop version is 2.x.
+>
+> If you use SeaTunnel Zeta, the Hadoop jar is automatically integrated when you download and install SeaTunnel Zeta. You can check the jar packages under ${SEATUNNEL_HOME}/lib to confirm this.
+> To use this connector you need to put hadoop-aws-3.1.4.jar and aws-java-sdk-bundle-1.11.271.jar in the ${SEATUNNEL_HOME}/lib dir.
-### file_format_type [string]
+## Data Type Mapping
-File type, supported as the following file types:
+Data type mapping is related to the type of file being read. We support the following file types:
`text` `csv` `parquet` `orc` `json` `excel`
+### JSON File Type
+
If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want.
For example:
@@ -174,7 +93,7 @@ connector will generate data as the following:
|------|-------------|---------|
| 200 | get success | true |
-If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically.
+### Text Or CSV File Type
If you assign file type to `text` `csv`, you can choose to specify the schema information or not.
@@ -215,61 +134,102 @@ connector will generate data as the following:
|---------------|-----|--------|
| tyrantlucifer | 26 | male |
-### bucket [string]
-
-The bucket address of s3 file system, for example: `s3n://seatunnel-test`, if you use `s3a` protocol, this parameter should be `s3a://seatunnel-test`.
-
-### access_key [string]
-
-The access key of s3 file system. If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)
-
-### access_secret [string]
-
-The access secret of s3 file system. If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)
+### Orc File Type
-### hadoop_s3_properties [map]
-
-If you need to add a other option, you could add it here and refer to this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)
-
-```
-hadoop_s3_properties {
- "xxx" = "xxx"
- }
-```
-
-### schema [config]
-
-#### fields [Config]
-
-The schema of upstream data.
-
-### read_columns [list]
-
-The read column list of the data source, user can use it to implement field projection.
-
-The file type supported column projection as the following shown:
-
-- text
-- json
-- csv
-- orc
-- parquet
-- excel
+If you assign the file type to `parquet` or `orc`, the schema option is not required; the connector can find the schema of the upstream data automatically.
-**Tips: If the user wants to use this feature when reading `text` `json` `csv` files, the schema option must be configured**
+| Orc Data type | SeaTunnel Data type |
+|----------------------------------|----------------------------------------------------------------|
+| BOOLEAN | BOOLEAN |
+| INT | INT |
+| BYTE | BYTE |
+| SHORT | SHORT |
+| LONG | LONG |
+| FLOAT | FLOAT |
+| DOUBLE | DOUBLE |
+| BINARY | BINARY |
+| STRING<br/>VARCHAR<br/>CHAR      | STRING                                                          |
+| DATE | LOCAL_DATE_TYPE |
+| TIMESTAMP | LOCAL_DATE_TIME_TYPE |
+| DECIMAL | DECIMAL |
+| LIST(STRING) | STRING_ARRAY_TYPE |
+| LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE |
+| LIST(TINYINT) | BYTE_ARRAY_TYPE |
+| LIST(SMALLINT) | SHORT_ARRAY_TYPE |
+| LIST(INT) | INT_ARRAY_TYPE |
+| LIST(BIGINT) | LONG_ARRAY_TYPE |
+| LIST(FLOAT) | FLOAT_ARRAY_TYPE |
+| LIST(DOUBLE) | DOUBLE_ARRAY_TYPE |
+| Map | MapType, This type of K and V will transform to SeaTunnel type |
+| STRUCT | SeaTunnelRowType |
+
+### Parquet File Type
-### common options
+If you assign the file type to `parquet` or `orc`, the schema option is not required; the connector can find the schema of the upstream data automatically.
-Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details.
+| Parquet Data type    | SeaTunnel Data type                                             |
+|----------------------|----------------------------------------------------------------|
+| INT_8 | BYTE |
+| INT_16 | SHORT |
+| DATE | DATE |
+| TIMESTAMP_MILLIS | TIMESTAMP |
+| INT64 | LONG |
+| INT96 | TIMESTAMP |
+| BINARY | BYTES |
+| FLOAT | FLOAT |
+| DOUBLE | DOUBLE |
+| BOOLEAN | BOOLEAN |
+| FIXED_LEN_BYTE_ARRAY | TIMESTAMP<br/>DECIMAL                                           |
+| DECIMAL | DECIMAL |
+| LIST(STRING) | STRING_ARRAY_TYPE |
+| LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE |
+| LIST(TINYINT) | BYTE_ARRAY_TYPE |
+| LIST(SMALLINT) | SHORT_ARRAY_TYPE |
+| LIST(INT) | INT_ARRAY_TYPE |
+| LIST(BIGINT) | LONG_ARRAY_TYPE |
+| LIST(FLOAT) | FLOAT_ARRAY_TYPE |
+| LIST(DOUBLE) | DOUBLE_ARRAY_TYPE |
+| Map | MapType, This type of K and V will transform to SeaTunnel type |
+| STRUCT | SeaTunnelRowType |
-### sheet_name [string]
+## Options
-Reader the sheet of the workbook,Only used when file_format_type is excel.
+| name | type | required | default value | Description |
+|---------------------------------|---------|----------|-------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| path | string | yes | - | The s3 path that needs to be read can have sub paths, but the sub paths need to meet certain format requirements. Specific requirements can be referred to "parse_partition_from_path" option |
+| file_format_type | string | yes | - | File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` |
+| bucket | string | yes | - | The bucket address of s3 file system, for example: `s3n://seatunnel-test`, if you use `s3a` protocol, this parameter should be `s3a://seatunnel-test`. |
+| fs.s3a.endpoint | string | yes | - | fs s3a endpoint |
+| fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider | The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now. More information about the credential provider you can see [Hadoop AWS Document](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Simple_name.2Fsecret_credentials_with_SimpleAWSCredentialsProvider.2A) |
+| read_columns                    | list    | no       | -                                                      | The read column list of the data source; users can use it to implement field projection. The following file types support column projection: `text` `csv` `parquet` `orc` `json` `excel`. If you want to use this feature when reading `text` `json` `csv` files, the "schema" option must be configured.                                                                                                     |
+| access_key | string | no | - | Only used when `fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider ` |
+| access_secret | string | no | - | Only used when `fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider ` |
+| hadoop_s3_properties | map | no | - | If you need to add other option, you could add it here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) |
+| delimiter | string | no | \001 | Field delimiter, used to tell connector how to slice and dice fields when reading text files. Default `\001`, the same as hive's default delimiter. |
+| parse_partition_from_path       | boolean | no       | true                                                   | Control whether to parse the partition keys and values from the file path. For example, if you read a file from path `s3n://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`, every record read from the file will have these two fields added: name="tyrantlucifer", age=26                                                                                                                   |
+| date_format | string | no | yyyy-MM-dd | Date type format, used to tell connector how to convert string to date, supported as the following formats:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`. default `yyyy-MM-dd` |
+| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` |
+| time_format | string | no | HH:mm:ss | Time type format, used to tell connector how to convert string to time, supported as the following formats:`HH:mm:ss` `HH:mm:ss.SSS` |
+| skip_header_row_number | long | no | 0 | Skip the first few lines, but only for the txt and csv. For example, set like following:`skip_header_row_number = 2`. Then SeaTunnel will skip the first 2 lines from source files |
+| schema | config | no | - | The schema of upstream data. |
+| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. |
+| sheet_name                      | string  | no       | -                                                      | Read the sheet of the workbook. Only used when file_format_type is excel.                                                                                                                                                                                                                                                                                                                                      |
## Example
-```hocon
+1. In this example, we read data from the s3 path `s3a://seatunnel-test/seatunnel/text`, and the file type in this path is orc.
+   We use `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` for authentication, so `access_key` and `access_secret` are required.
+   All columns in the file will be read and sent to the sink.
+
+```hocon
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+source {
S3File {
path = "/seatunnel/text"
fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn"
@@ -279,9 +239,21 @@ Reader the sheet of the workbook,Only used when file_format_type is excel.
bucket = "s3a://seatunnel-test"
file_format_type = "orc"
}
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/category/transform-v2
+}
+sink {
+ Console {}
+}
```
+2. Use `InstanceProfileCredentialsProvider` for authentication.
+   The file type in S3 is json, so the schema option needs to be configured.
+
```hocon
S3File {
@@ -300,9 +272,47 @@ Reader the sheet of the workbook,Only used when file_format_type is excel.
```
-### file_filter_pattern [string]
+3. Use `InstanceProfileCredentialsProvider` for authentication.
+   The file type in S3 is json and has five fields (`id`, `name`, `age`, `sex`, `type`), so the schema option needs to be configured.
+   In this job, we only need to send the `id` and `name` columns to the sink.
-Filter pattern, which used for filtering files.
+```hocon
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ S3File {
+ path = "/seatunnel/json"
+ bucket = "s3a://seatunnel-test"
+ fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn"
+ fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider"
+ file_format_type = "json"
+ read_columns = ["id", "name"]
+ schema {
+ fields {
+ id = int
+ name = string
+ age = int
+ sex = int
+ type = string
+ }
+ }
+ }
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/category/transform-v2
+}
+
+sink {
+ Console {}
+}
+```
## Changelog
diff --git a/docs/en/seatunnel-engine/checkpoint-storage.md b/docs/en/seatunnel-engine/checkpoint-storage.md
index a88f301439e4..f2a6487f28d2 100644
--- a/docs/en/seatunnel-engine/checkpoint-storage.md
+++ b/docs/en/seatunnel-engine/checkpoint-storage.md
@@ -59,8 +59,6 @@ seatunnel:
checkpoint:
interval: 6000
timeout: 7000
- max-concurrent: 1
- tolerable-failure: 2
storage:
type: hdfs
max-retained: 3
@@ -94,8 +92,6 @@ seatunnel:
checkpoint:
interval: 6000
timeout: 7000
- max-concurrent: 1
- tolerable-failure: 2
storage:
type: hdfs
max-retained: 3
@@ -119,8 +115,6 @@ seatunnel:
checkpoint:
interval: 6000
timeout: 7000
- max-concurrent: 1
- tolerable-failure: 2
storage:
type: hdfs
max-retained: 3
@@ -152,6 +146,28 @@ seatunnel:
kerberosKeytab: your-kerberos-keytab
```
+If HDFS is in HA mode, you can configure it like this:
+
+```yaml
+seatunnel:
+ engine:
+ checkpoint:
+ storage:
+ type: hdfs
+ max-retained: 3
+ plugin-config:
+ storage.type: hdfs
+ fs.defaultFS: hdfs://usdp-bing
+ seatunnel.hadoop.dfs.nameservices: usdp-bing
+ seatunnel.hadoop.dfs.ha.namenodes.usdp-bing: nn1,nn2
+ seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn1: usdp-bing-nn1:8020
+ seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn2: usdp-bing-nn2:8020
+ seatunnel.hadoop.dfs.client.failover.proxy.provider.usdp-bing: org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
+
+```
+
+If HDFS has some other configs in `hdfs-site.xml` or `core-site.xml`, just set the HDFS configuration with the `seatunnel.hadoop.` prefix.
+
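+For example, to pass through an ordinary HDFS client setting (the property below is only an illustration, not something the checkpoint storage requires):
+
+```yaml
+seatunnel:
+  engine:
+    checkpoint:
+      storage:
+        type: hdfs
+        max-retained: 3
+        plugin-config:
+          storage.type: hdfs
+          fs.defaultFS: hdfs://localhost:9000
+          # forwarded to the Hadoop configuration as dfs.client.use.datanode.hostname
+          seatunnel.hadoop.dfs.client.use.datanode.hostname: "true"
+```
+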
#### LocalFile
```yaml
@@ -160,8 +176,6 @@ seatunnel:
checkpoint:
interval: 6000
timeout: 7000
- max-concurrent: 1
- tolerable-failure: 2
storage:
type: hdfs
max-retained: 3
diff --git a/docs/en/seatunnel-engine/deployment.md b/docs/en/seatunnel-engine/deployment.md
index 1f8692530cdd..18c1a587a2a3 100644
--- a/docs/en/seatunnel-engine/deployment.md
+++ b/docs/en/seatunnel-engine/deployment.md
@@ -75,14 +75,6 @@ The interval between two checkpoints, unit is milliseconds. If the `checkpoint.i
The timeout of a checkpoint. If a checkpoint cannot be completed within the timeout period, a checkpoint failure will be triggered. Therefore, Job will be restored.
-**max-concurrent**
-
-How many checkpoints can be performed simultaneously at most.
-
-**tolerable-failure**
-
-Maximum number of retries after checkpoint failure.
-
Example
```
@@ -95,8 +87,6 @@ seatunnel:
checkpoint:
interval: 300000
timeout: 10000
- max-concurrent: 1
- tolerable-failure: 2
```
**checkpoint storage**
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/serialization/DefaultSerializer.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/serialization/DefaultSerializer.java
index 2100b9529cdc..5fabe2a284a9 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/serialization/DefaultSerializer.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/serialization/DefaultSerializer.java
@@ -35,6 +35,9 @@ public byte[] serialize(T obj) throws IOException {
@Override
public T deserialize(byte[] serialized) throws IOException {
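+ // A null payload is treated as null state instead of being passed to SerializationUtils.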
+ if (serialized == null) {
+ return null;
+ }
return SerializationUtils.deserialize(serialized);
}
}
diff --git a/seatunnel-connectors-v2/connector-kafka/pom.xml b/seatunnel-connectors-v2/connector-kafka/pom.xml
index 0ce4bba6b171..7955ab3f5467 100644
--- a/seatunnel-connectors-v2/connector-kafka/pom.xml
+++ b/seatunnel-connectors-v2/connector-kafka/pom.xml
@@ -31,6 +31,7 @@
3.2.0
+ 1.6.4.Final
@@ -61,6 +62,17 @@
seatunnel-format-compatible-debezium-json${project.version}
+ <dependency>
+ <groupId>org.apache.seatunnel</groupId>
+ <artifactId>seatunnel-format-compatible-connect-json</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.kafka</groupId>
+ <artifactId>connect-json</artifactId>
+ <version>${kafka.client.version}</version>
+ </dependency>
diff --git a/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/config/MessageFormat.java b/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/config/MessageFormat.java
index 1ef29f6322a3..07f9a38ddffe 100644
--- a/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/config/MessageFormat.java
+++ b/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/config/MessageFormat.java
@@ -22,5 +22,6 @@ public enum MessageFormat {
TEXT,
CANAL_JSON,
DEBEZIUM_JSON,
- COMPATIBLE_DEBEZIUM_JSON
+ COMPATIBLE_DEBEZIUM_JSON,
+ COMPATIBLE_KAFKA_CONNECT_JSON
}
diff --git a/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/source/KafkaSource.java b/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/source/KafkaSource.java
index 30878e82a2c4..802d7986a94c 100644
--- a/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/source/KafkaSource.java
+++ b/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/source/KafkaSource.java
@@ -45,6 +45,7 @@
import org.apache.seatunnel.connectors.seatunnel.kafka.config.StartMode;
import org.apache.seatunnel.connectors.seatunnel.kafka.exception.KafkaConnectorException;
import org.apache.seatunnel.connectors.seatunnel.kafka.state.KafkaSourceState;
+import org.apache.seatunnel.format.compatible.kafka.connect.json.CompatibleKafkaConnectDeserializationSchema;
import org.apache.seatunnel.format.json.JsonDeserializationSchema;
import org.apache.seatunnel.format.json.canal.CanalJsonDeserializationSchema;
import org.apache.seatunnel.format.json.debezium.DebeziumJsonDeserializationSchema;
@@ -268,6 +269,11 @@ private void setDeserialization(Config config) {
.setIgnoreParseErrors(true)
.build();
break;
+ case COMPATIBLE_KAFKA_CONNECT_JSON:
+ deserializationSchema =
+ new CompatibleKafkaConnectDeserializationSchema(
+ typeInfo, config, false, false);
+ break;
case DEBEZIUM_JSON:
boolean includeSchema = DEBEZIUM_RECORD_INCLUDE_SCHEMA.defaultValue();
if (config.hasPath(DEBEZIUM_RECORD_INCLUDE_SCHEMA.key())) {
diff --git a/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/source/KafkaSourceReader.java b/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/source/KafkaSourceReader.java
index 226fded2409b..a2d3bae2b4d3 100644
--- a/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/source/KafkaSourceReader.java
+++ b/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/source/KafkaSourceReader.java
@@ -25,6 +25,7 @@
import org.apache.seatunnel.connectors.seatunnel.kafka.config.MessageFormatErrorHandleWay;
import org.apache.seatunnel.connectors.seatunnel.kafka.exception.KafkaConnectorErrorCode;
import org.apache.seatunnel.connectors.seatunnel.kafka.exception.KafkaConnectorException;
+import org.apache.seatunnel.format.compatible.kafka.connect.json.CompatibleKafkaConnectDeserializationSchema;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
@@ -150,9 +151,18 @@ public void pollNext(Collector output) throws Exception {
recordList) {
try {
- deserializationSchema.deserialize(
- record.value(), output);
- } catch (Exception e) {
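+ // CompatibleKafkaConnectDeserializationSchema consumes the whole ConsumerRecord rather than only the value bytes.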
+ if (deserializationSchema
+ instanceof
+ CompatibleKafkaConnectDeserializationSchema) {
+ ((CompatibleKafkaConnectDeserializationSchema)
+ deserializationSchema)
+ .deserialize(
+ record, output);
+ } else {
+ deserializationSchema.deserialize(
+ record.value(), output);
+ }
+ } catch (IOException e) {
if (this.messageFormatErrorHandleWay
== MessageFormatErrorHandleWay
.SKIP) {
diff --git a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisConfig.java b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisConfig.java
index c777d2378273..511cbe4aa993 100644
--- a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisConfig.java
+++ b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisConfig.java
@@ -102,6 +102,12 @@ public enum HashKeyParseMode {
.withDescription(
"hash key parse mode, support all or kv, default value is all");
+ public static final Option<Long> EXPIRE =
+ Options.key("expire")
+ .longType()
+ .defaultValue(-1L)
+ .withDescription("Set redis expiration time.");
+
public enum Format {
JSON,
// TEXT will be supported later
diff --git a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisDataType.java b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisDataType.java
index 64772b5381d3..a315e0cdae0c 100644
--- a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisDataType.java
+++ b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisDataType.java
@@ -30,8 +30,9 @@
public enum RedisDataType {
KEY {
@Override
- public void set(Jedis jedis, String key, String value) {
+ public void set(Jedis jedis, String key, String value, long expire) {
jedis.set(key, value);
+ expire(jedis, key, expire);
}
@Override
@@ -41,9 +42,10 @@ public List get(Jedis jedis, String key) {
},
HASH {
@Override
- public void set(Jedis jedis, String key, String value) {
+ public void set(Jedis jedis, String key, String value, long expire) {
+ Map<String, String> fieldsMap = JsonUtils.toMap(value);
jedis.hset(key, fieldsMap);
+ expire(jedis, key, expire);
}
@Override
@@ -54,8 +56,9 @@ public List get(Jedis jedis, String key) {
},
LIST {
@Override
- public void set(Jedis jedis, String key, String value) {
+ public void set(Jedis jedis, String key, String value, long expire) {
jedis.lpush(key, value);
+ expire(jedis, key, expire);
}
@Override
@@ -65,8 +68,9 @@ public List get(Jedis jedis, String key) {
},
SET {
@Override
- public void set(Jedis jedis, String key, String value) {
+ public void set(Jedis jedis, String key, String value, long expire) {
jedis.sadd(key, value);
+ expire(jedis, key, expire);
}
@Override
@@ -77,8 +81,9 @@ public List get(Jedis jedis, String key) {
},
ZSET {
@Override
- public void set(Jedis jedis, String key, String value) {
+ public void set(Jedis jedis, String key, String value, long expire) {
jedis.zadd(key, 1, value);
+ expire(jedis, key, expire);
}
@Override
@@ -91,7 +96,13 @@ public List get(Jedis jedis, String key) {
return Collections.emptyList();
}
- public void set(Jedis jedis, String key, String value) {
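+ // Apply a TTL only when a positive expire (seconds) is configured; the default -1 leaves keys persistent.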
+ private static void expire(Jedis jedis, String key, long expire) {
+ if (expire > 0) {
+ jedis.expire(key, expire);
+ }
+ }
+
+ public void set(Jedis jedis, String key, String value, long expire) {
// do nothing
}
}
diff --git a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisParameters.java b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisParameters.java
index c8bb879d0f5b..8954b4da2a1f 100644
--- a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisParameters.java
+++ b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisParameters.java
@@ -47,6 +47,7 @@ public class RedisParameters implements Serializable {
private RedisConfig.RedisMode mode;
private RedisConfig.HashKeyParseMode hashKeyParseMode;
private List redisNodes = Collections.emptyList();
+ private long expire = RedisConfig.EXPIRE.defaultValue();
public void buildWithConfig(Config config) {
// set host
@@ -89,6 +90,9 @@ public void buildWithConfig(Config config) {
if (config.hasPath(RedisConfig.KEY_PATTERN.key())) {
this.keysPattern = config.getString(RedisConfig.KEY_PATTERN.key());
}
+ if (config.hasPath(RedisConfig.EXPIRE.key())) {
+ this.expire = config.getLong(RedisConfig.EXPIRE.key());
+ }
// set redis data type
try {
String dataType = config.getString(RedisConfig.DATA_TYPE.key());
diff --git a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkFactory.java b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkFactory.java
index e68a893f79c3..22ae1568740e 100644
--- a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkFactory.java
+++ b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkFactory.java
@@ -41,7 +41,8 @@ public OptionRule optionRule() {
RedisConfig.AUTH,
RedisConfig.USER,
RedisConfig.KEY_PATTERN,
- RedisConfig.FORMAT)
+ RedisConfig.FORMAT,
+ RedisConfig.EXPIRE)
.conditional(RedisConfig.MODE, RedisConfig.RedisMode.CLUSTER, RedisConfig.NODES)
.build();
}
diff --git a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkWriter.java b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkWriter.java
index 657e3aaa5658..80b1449b9d6d 100644
--- a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkWriter.java
+++ b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkWriter.java
@@ -59,7 +59,8 @@ public void write(SeaTunnelRow element) throws IOException {
} else {
key = keyField;
}
- redisDataType.set(jedis, key, data);
+ long expire = redisParameters.getExpire();
+ redisDataType.set(jedis, key, data, expire);
}
@Override
diff --git a/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializer.java b/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializer.java
index 0e6de2f60127..5bf15c533a15 100644
--- a/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializer.java
+++ b/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializer.java
@@ -19,9 +19,10 @@
import org.apache.seatunnel.api.table.type.SeaTunnelRow;
import org.apache.seatunnel.api.table.type.SeaTunnelRowType;
+import org.apache.seatunnel.api.table.type.SqlType;
import org.apache.seatunnel.common.utils.JsonUtils;
-import java.util.HashMap;
+import java.util.LinkedHashMap;
import java.util.Map;
public class StarRocksJsonSerializer extends StarRocksBaseSerializer
@@ -38,10 +39,22 @@ public StarRocksJsonSerializer(SeaTunnelRowType seaTunnelRowType, boolean enable
@Override
public String serialize(SeaTunnelRow row) {
- Map<String, Object> rowMap = new HashMap<>(row.getFields().length);
+ Map<String, Object> rowMap = new LinkedHashMap<>(row.getFields().length);
for (int i = 0; i < row.getFields().length; i++) {
- Object value = convert(seaTunnelRowType.getFieldType(i), row.getField(i));
+ SqlType sqlType = seaTunnelRowType.getFieldType(i).getSqlType();
+ Object value;
+ if (sqlType == SqlType.ARRAY
+ || sqlType == SqlType.MAP
+ || sqlType == SqlType.ROW
+ || sqlType == SqlType.MULTIPLE_ROW) {
+ // If the field type is complex type, we should keep the origin value.
+ // It will be transformed to json string in the next step
+ // JsonUtils.toJsonString(rowMap).
+ value = row.getField(i);
+ } else {
+ value = convert(seaTunnelRowType.getFieldType(i), row.getField(i));
+ }
rowMap.put(seaTunnelRowType.getFieldName(i), value);
}
if (enableUpsertDelete) {
diff --git a/seatunnel-connectors-v2/connector-starrocks/src/test/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializerTest.java b/seatunnel-connectors-v2/connector-starrocks/src/test/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializerTest.java
new file mode 100644
index 000000000000..6e0d9476441d
--- /dev/null
+++ b/seatunnel-connectors-v2/connector-starrocks/src/test/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializerTest.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.connectors.seatunnel.starrocks.serialize;
+
+import org.apache.seatunnel.api.table.type.ArrayType;
+import org.apache.seatunnel.api.table.type.BasicType;
+import org.apache.seatunnel.api.table.type.MapType;
+import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
+import org.apache.seatunnel.api.table.type.SeaTunnelRow;
+import org.apache.seatunnel.api.table.type.SeaTunnelRowType;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import java.util.Collections;
+
+public class StarRocksJsonSerializerTest {
+
+ @Test
+ public void serialize() {
+ String[] fieldNames = {"id", "name", "array", "map"};
+ SeaTunnelDataType<?>[] fieldTypes = {
+ BasicType.LONG_TYPE,
+ BasicType.STRING_TYPE,
+ ArrayType.STRING_ARRAY_TYPE,
+ new MapType<>(BasicType.STRING_TYPE, BasicType.STRING_TYPE)
+ };
+
+ SeaTunnelRowType seaTunnelRowType = new SeaTunnelRowType(fieldNames, fieldTypes);
+ StarRocksJsonSerializer starRocksJsonSerializer =
+ new StarRocksJsonSerializer(seaTunnelRowType, false);
+ Object[] fields = {
+ 1L, "Tom", new String[] {"tag1", "tag2"}, Collections.singletonMap("key1", "value1")
+ };
+ SeaTunnelRow seaTunnelRow = new SeaTunnelRow(fields);
+ String jsonString = starRocksJsonSerializer.serialize(seaTunnelRow);
+ Assertions.assertEquals(
+ "{\"id\":1,\"name\":\"Tom\",\"array\":[\"tag1\",\"tag2\"],\"map\":{\"key1\":\"value1\"}}",
+ jsonString);
+ }
+}
diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/pom.xml b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/pom.xml
index 81cbb7856984..fa2e1930cce4 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/pom.xml
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/pom.xml
@@ -92,6 +92,11 @@
             <artifactId>postgresql</artifactId>
             <scope>test</scope>
         </dependency>
+        <dependency>
+            <groupId>mysql</groupId>
+            <artifactId>mysql-connector-java</artifactId>
+            <scope>test</scope>
+        </dependency>
         <dependency>
             <groupId>org.testcontainers</groupId>
             <artifactId>mysql</artifactId>
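The test-scoped mysql-connector-java dependency above supports the new KafkaConnectToKafkaIT below, which pushes Kafka-Connect-style records through SeaTunnel into MySQL and then reads the sink table back over plain JDBC to assert the results. A hypothetical sketch of that kind of verification; the connection details and table name are illustrative assumptions, not taken from the patch:

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class JdbcSinkCheckSketch {
    public static void main(String[] args) throws Exception {
        // URL, credentials, and table name are illustrative placeholders.
        try (Connection conn =
                        DriverManager.getConnection(
                                "jdbc:mysql://localhost:3306/seatunnel", "st_user", "seatunnel");
                Statement stmt = conn.createStatement();
                ResultSet rs = stmt.executeQuery("SELECT id, name FROM jdbc_sink ORDER BY id")) {
            List<List<Object>> actual = new ArrayList<>();
            while (rs.next()) {
                actual.add(Arrays.asList(rs.getLong("id"), rs.getString("name")));
            }
            // In the IT, this list would be compared against the records
            // originally produced to the Kafka topic.
            System.out.println(actual);
        }
    }
}
```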
diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/KafkaConnectToKafkaIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/KafkaConnectToKafkaIT.java
new file mode 100644
index 000000000000..591049917f8f
--- /dev/null
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/KafkaConnectToKafkaIT.java
@@ -0,0 +1,282 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.e2e.connector.kafka;
+
+import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode;
+import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.ObjectMapper;
+
+import org.apache.seatunnel.connectors.seatunnel.cdc.mysql.testutils.MySqlContainer;
+import org.apache.seatunnel.connectors.seatunnel.cdc.mysql.testutils.MySqlVersion;
+import org.apache.seatunnel.e2e.common.TestResource;
+import org.apache.seatunnel.e2e.common.TestSuiteBase;
+import org.apache.seatunnel.e2e.common.container.ContainerExtendedFactory;
+import org.apache.seatunnel.e2e.common.container.EngineType;
+import org.apache.seatunnel.e2e.common.container.TestContainer;
+import org.apache.seatunnel.e2e.common.junit.DisabledOnContainer;
+import org.apache.seatunnel.e2e.common.junit.TestContainerExtension;
+
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.apache.kafka.clients.producer.ProducerConfig;
+import org.apache.kafka.clients.producer.ProducerRecord;
+import org.apache.kafka.common.serialization.ByteArraySerializer;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.TestTemplate;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.testcontainers.containers.Container;
+import org.testcontainers.containers.KafkaContainer;
+import org.testcontainers.containers.output.Slf4jLogConsumer;
+import org.testcontainers.lifecycle.Startables;
+import org.testcontainers.utility.DockerImageName;
+import org.testcontainers.utility.DockerLoggerFactory;
+
+import com.google.common.collect.Lists;
+import lombok.SneakyThrows;
+import lombok.extern.slf4j.Slf4j;
+
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Properties;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Stream;
+
+import static org.awaitility.Awaitility.given;
+
+@Slf4j
+@DisabledOnContainer(
+ value = {},
+ type = {EngineType.SPARK})
+public class KafkaConnectToKafkaIT extends TestSuiteBase implements TestResource {
+ private static final Logger LOG = LoggerFactory.getLogger(KafkaConnectToKafkaIT.class);
+ private final ObjectMapper objectMapper = new ObjectMapper();
+ // kafka
+ private static final String KAFKA_IMAGE_NAME = "confluentinc/cp-kafka:latest";
+
+ private static final String KAFKA_JDBC_TOPIC = "jdbc_source_record";
+
+ private static final String KAFKA_HOST = "kafka_connect_source_record";
+
+ private static KafkaContainer KAFKA_CONTAINER;
+
+ private KafkaProducer<byte[], byte[]> kafkaProducer;
+
+ // -----------------------------------mysql-----------------------------------------
+ private static MySqlContainer MYSQL_CONTAINER;
+ private static final String MYSQL_DATABASE = "seatunnel";
+ private static final String MYSQL_HOST = "kafka_to_mysql_e2e";
+ private static final int MYSQL_PORT = 3306;
+ private static final String MYSQL_DRIVER_JAR =
+ "https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.0.32/mysql-connector-j-8.0.32.jar";
+
+ @TestContainerExtension
+ private final ContainerExtendedFactory extendedFactory =
+ container -> {
+ Container.ExecResult extraCommands =
+ container.execInContainer(
+ "bash",
+ "-c",
+ "mkdir -p /tmp/seatunnel/plugins/Jdbc/lib && cd /tmp/seatunnel/plugins/Jdbc/lib && curl -O "
+ + MYSQL_DRIVER_JAR);
+ Assertions.assertEquals(0, extraCommands.getExitCode());
+ };
+
+ private static MySqlContainer createMySqlContainer(MySqlVersion version) {
+ MySqlContainer mySqlContainer =
+ new MySqlContainer(version)
+ .withConfigurationOverride("docker/server-gtids/my.cnf")
+ .withSetupSQL("docker/setup.sql")
+ .withNetwork(NETWORK)
+ .withNetworkAliases(MYSQL_HOST)
+ .withDatabaseName("seatunnel")
+ .withUsername("st_user")
+ .withPassword("seatunnel")
+ .withLogConsumer(new Slf4jLogConsumer(LOG));
+ mySqlContainer.setPortBindings(
+ com.google.common.collect.Lists.newArrayList(
+ String.format("%s:%s", MYSQL_PORT, MYSQL_PORT)));
+ return mySqlContainer;
+ }
+
+ private void createKafkaContainer() {
+ KAFKA_CONTAINER =
+ new KafkaContainer(DockerImageName.parse(KAFKA_IMAGE_NAME))
+ .withNetwork(NETWORK)
+ .withNetworkAliases(KAFKA_HOST)
+ .withLogConsumer(
+ new Slf4jLogConsumer(
+ DockerLoggerFactory.getLogger(KAFKA_IMAGE_NAME)));
+ }
+
+ @BeforeAll
+ @Override
+ public void startUp() {
+
+ LOG.info("The first stage: Starting Kafka containers...");
+ createKafkaContainer();
+ Startables.deepStart(Stream.of(KAFKA_CONTAINER)).join();
+ LOG.info("Kafka Containers are started");
+
+ given().ignoreExceptions()
+ .atLeast(100, TimeUnit.MILLISECONDS)
+ .pollInterval(500, TimeUnit.MILLISECONDS)
+ .atMost(2, TimeUnit.MINUTES)
+ .untilAsserted(this::initKafkaProducer);
+
+ LOG.info("The second stage: Starting Mysql containers...");
+ MYSQL_CONTAINER = createMySqlContainer(MySqlVersion.V8_0);
+ Startables.deepStart(Stream.of(MYSQL_CONTAINER)).join();
+ LOG.info("Mysql Containers are started");
+
+ given().ignoreExceptions()
+ .await()
+ .atLeast(100, TimeUnit.MILLISECONDS)
+ .pollInterval(500, TimeUnit.MILLISECONDS)
+ .atMost(2, TimeUnit.MINUTES)
+ .untilAsserted(this::initializeDatabase);
+
+ given().ignoreExceptions()
+ .await()
+ .atLeast(100, TimeUnit.MILLISECONDS)
+ .pollInterval(500, TimeUnit.MILLISECONDS)
+ .atMost(2, TimeUnit.MINUTES)
+ .untilAsserted(this::initializeJdbcTable);
+
+ log.info("Write 3 records to topic " + KAFKA_JDBC_TOPIC);
+ generateConnectJdbcRecord();
+ }
+
+ @TestTemplate
+ public void testJdbcRecordKafkaToMysql(TestContainer container)
+ throws IOException, InterruptedException, SQLException {
+ Container.ExecResult execResult =
+ container.executeJob("/kafkasource_jdbc_record_to_mysql.conf");
+ Assertions.assertEquals(0, execResult.getExitCode(), execResult.getStderr());
+ List