diff --git a/.github/actions/action-sh-checker b/.github/actions/action-sh-checker deleted file mode 160000 index 76ab0b22e1f194..00000000000000 --- a/.github/actions/action-sh-checker +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 76ab0b22e1f194e4a582edc7969df6485c4e9246 diff --git a/.github/actions/clang-format-lint-action b/.github/actions/clang-format-lint-action deleted file mode 160000 index 6adbe14579e5b8..00000000000000 --- a/.github/actions/clang-format-lint-action +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 6adbe14579e5b8e19eb3e31e5ff2479f3bd302c7 diff --git a/.github/actions/clang-tidy-review b/.github/actions/clang-tidy-review deleted file mode 160000 index 2c55ef8cfc9acb..00000000000000 --- a/.github/actions/clang-tidy-review +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2c55ef8cfc9acb3715d433e58aea086dcec9b206 diff --git a/.github/workflows/auto_trigger_teamcity.yml b/.github/workflows/auto_trigger_teamcity.yml index 61be077fed6857..ab3e85fa6b560f 100644 --- a/.github/workflows/auto_trigger_teamcity.yml +++ b/.github/workflows/auto_trigger_teamcity.yml @@ -57,40 +57,32 @@ jobs: echo "latest_commit_id : ${{ env.LATEST_COMMIT }}" set -x - if [[ "${comment_message}" =~ "run" && "${comment_message}" =~ "buildall" && ! "${comment_message}" =~ "Thanks for your contribution" ]]; then - trigger_pipelines="Doris_Doris_FeUt Doris_DorisBeUt_BeUt Doris_DorisCompile_Compile Doris_Performance_Clickbench_ClickbenchNew Doris_ArmPipeline_P0Regression ${trigger_pipelines}" + reg="run (buildall|compile|p0|p1|feut|beut|external|clickbench|pipelinex_p0|arm|tpch)( [1-9]*[0-9]+)*" + comment_trigger_type="$(echo "${comment_message}" | grep -E "${reg}" | awk -F' ' '{print $2}' | sed -n 1p)" + comment_repeat_times="$(echo "${comment_message}" | grep -E "${reg}" | awk -F' ' '{print $3}' | sed -n 1p)" + if [[ "${comment_trigger_type}" == "buildall" ]]; then + trigger_pipelines="Doris_Doris_FeUt Doris_DorisBeUt_BeUt Doris_DorisCompile_Compile Doris_Performance_Clickbench_ClickbenchNew Doris_ArmPipeline_P0Regression" + elif [[ "${comment_trigger_type}" == "compile" ]]; then + trigger_pipelines="Doris_DorisCompile_Compile" + elif [[ "${comment_trigger_type}" == "p0" ]]; then + trigger_pipelines="Doris_DorisRegression_P0Regression" + elif [[ "${comment_trigger_type}" == "p1" ]]; then + trigger_pipelines="Doris_DorisRegression_P1Regression" + elif [[ "${comment_trigger_type}" == "feut" ]]; then + trigger_pipelines="Doris_Doris_FeUt" + elif [[ "${comment_trigger_type}" == "beut" ]]; then + trigger_pipelines="Doris_DorisBeUt_BeUt" + elif [[ "${comment_trigger_type}" == "external" ]]; then + trigger_pipelines="Doris_External_Regression" + elif [[ "${comment_trigger_type}" == "clickbench" ]]; then + trigger_pipelines="Doris_Performance_Clickbench_ClickbenchNew" + elif [[ "${comment_trigger_type}" == "pipelinex_p0" ]]; then + trigger_pipelines="Doris_DorisRegression_P0RegressionPipelineX" + elif [[ "${comment_trigger_type}" == "arm" ]]; then + trigger_pipelines="Doris_ArmPipeline_P0Regression" + elif [[ "${comment_trigger_type}" == "tpch" ]]; then + trigger_pipelines="Tpch_TpchSf100" fi - if [[ "${comment_message}" =~ "run" && "${comment_message}" =~ " p0" && ! "${comment_message}" =~ "Thanks for your contribution" ]]; then - trigger_pipelines="Doris_DorisRegression_P0Regression ${trigger_pipelines}" - fi - if [[ "${comment_message}" =~ "run" && "${comment_message}" =~ " pipelinex_p0" && ! 
"${comment_message}" =~ "Thanks for your contribution" ]]; then - trigger_pipelines="Doris_DorisRegression_P0RegressionPipelineX ${trigger_pipelines}" - fi - if [[ "${comment_message}" =~ "run" && "${comment_message}" =~ "p1" && ! "${comment_message}" =~ "Thanks for your contribution" ]]; then - trigger_pipelines="Doris_DorisRegression_P1Regression ${trigger_pipelines}" - fi - if [[ "${comment_message}" =~ "run" && "${comment_message}" =~ "feut" && ! "${comment_message}" =~ "Thanks for your contribution" ]]; then - trigger_pipelines="Doris_Doris_FeUt ${trigger_pipelines}" - fi - if [[ "${comment_message}" =~ "run" && "${comment_message}" =~ "beut" && ! "${comment_message}" =~ "Thanks for your contribution" ]]; then - trigger_pipelines="Doris_DorisBeUt_BeUt ${trigger_pipelines}" - fi - if [[ "${comment_message}" =~ "run" && "${comment_message}" =~ "compile" && ! "${comment_message}" =~ "Thanks for your contribution" ]]; then - trigger_pipelines="Doris_DorisCompile_Compile ${trigger_pipelines}" - fi - if [[ "${comment_message}" =~ "run" && "${comment_message}" =~ "clickbench" && ! "${comment_message}" =~ "Thanks for your contribution" ]]; then - trigger_pipelines="Doris_Performance_Clickbench_ClickbenchNew ${trigger_pipelines}" - fi - if [[ "${comment_message}" =~ "run" && "${comment_message}" =~ "arm" && ! "${comment_message}" =~ "Thanks for your contribution" ]]; then - trigger_pipelines="Doris_ArmPipeline_P0Regression ${trigger_pipelines}" - fi - if [[ "${comment_message}" =~ "run" && "${comment_message}" =~ "external" && ! "${comment_message}" =~ "Thanks for your contribution" ]]; then - trigger_pipelines="Doris_External_Regression ${trigger_pipelines}" - fi - if [[ "${comment_message}" =~ "run" && "${comment_message}" =~ "just_for_test" && ! "${comment_message}" =~ "Thanks for your contribution" ]]; then - trigger_pipelines="Doris_DorisRegression_ExternalRegression ${trigger_pipelines}" - fi - if [ -z "${trigger_pipelines}" ];then echo "Just a general comment that doesn't match any pipeline rules" fi @@ -133,7 +125,12 @@ jobs: if [ "_""${same_build_sign}" == "_false" ];then sleep 10s echo "there is no running build or queue build, so trigger a new !" 
- execute_command="curl -s -X POST ${teamcity_url}/httpAuth/action.html\?add2Queue\=${pipeline}\&branchName\=pull/${pull_request_num}\&name=env.latest_pr_comment\&value=${encoded_string}\&name=env.latest_commit_id\&value=${latest_commit_id}" + echo "comment_repeat_times: ${comment_repeat_times}" + if [[ -n "${comment_repeat_times}" ]]; then + execute_command="curl -s -X POST ${teamcity_url}/httpAuth/action.html\?add2Queue\=${pipeline}\&branchName\=pull/${pull_request_num}\&name=env.latest_pr_comment\&value=${encoded_string}\&name=env.latest_commit_id\&value=${latest_commit_id}\&name=env.repeat_times\&value=${comment_repeat_times}" + else + execute_command="curl -s -X POST ${teamcity_url}/httpAuth/action.html\?add2Queue\=${pipeline}\&branchName\=pull/${pull_request_num}\&name=env.latest_pr_comment\&value=${encoded_string}\&name=env.latest_commit_id\&value=${latest_commit_id}" + fi echo "${execute_command}" eval "${execute_command}" echo "-----------------------------------------------------------------" diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index f69cb953683895..c804b753a3bbc4 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -31,14 +31,21 @@ jobs: uses: actions/checkout@v3 with: persist-credentials: false - submodules: recursive - name: Checkout ${{ github.ref }} ( ${{ github.event.pull_request.head.sha }} ) if: ${{ github.event_name == 'pull_request_target' }} uses: actions/checkout@v3 with: ref: ${{ github.event.pull_request.head.sha }} - submodules: recursive + + - name: Checkout paths-filter + run: | + rm -rf ./.github/actions/paths-filter + git clone https://github.com/dorny/paths-filter .github/actions/paths-filter + + pushd .github/actions/paths-filter &>/dev/null + git checkout 4512585405083f25c027a35db413c2b3b9006d50 + popd &>/dev/null - name: Paths filter uses: ./.github/actions/paths-filter @@ -49,6 +56,15 @@ jobs: - 'be/src/**' - 'be/test/**' + - name: Checkout clang-format-lint-action + run: | + rm -rf ./.github/actions/clang-format-lint-action + git clone https://github.com/DoozyX/clang-format-lint-action .github/actions/clang-format-lint-action + + pushd .github/actions/clang-format-lint-action &>/dev/null + git checkout 6adbe14579e5b8e19eb3e31e5ff2479f3bd302c7 + popd &>/dev/null + - name: "Format it!" 
if: ${{ steps.filter.outputs.be_changes == 'true' }} uses: ./.github/actions/clang-format-lint-action diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 652aa7f81ea861..9e314bfb6dfc25 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -27,21 +27,22 @@ jobs: - name: Checkout ${{ github.ref }} ( ${{ github.sha }} ) if: ${{ github.event_name != 'pull_request_target' }} uses: actions/checkout@v3 - with: - submodules: recursive - name: Checkout ${{ github.ref }} ( ${{ github.event.pull_request.head.sha }} ) if: ${{ github.event_name == 'pull_request_target' }} uses: actions/checkout@v3 with: ref: ${{ github.event.pull_request.head.sha }} - submodules: recursive - - name: Patch + - name: Checkout action-sh-checker run: | - pushd .github/actions/action-sh-checker >/dev/null + rm -rf ./.github/actions/action-sh-checker + git clone https://github.com/luizm/action-sh-checker .github/actions/action-sh-checker + + pushd .github/actions/action-sh-checker &>/dev/null + git checkout 76ab0b22e1f194e4a582edc7969df6485c4e9246 sed -i 's/\[ "$GITHUB_EVENT_NAME" == "pull_request" \]/\[\[ "$GITHUB_EVENT_NAME" == "pull_request" || "$GITHUB_EVENT_NAME" == "pull_request_target" \]\]/' entrypoint.sh - popd >/dev/null + popd &>/dev/null - name: Run ShellCheck uses: ./.github/actions/action-sh-checker @@ -63,7 +64,15 @@ jobs: uses: actions/checkout@v3 with: ref: ${{ github.event.pull_request.head.sha }} - submodules: recursive + + - name: Checkout paths-filter + run: | + rm -rf ./.github/actions/paths-filter + git clone https://github.com/dorny/paths-filter .github/actions/paths-filter + + pushd .github/actions/paths-filter &>/dev/null + git checkout 4512585405083f25c027a35db413c2b3b9006d50 + popd &>/dev/null - name: Paths Filter uses: ./.github/actions/paths-filter @@ -117,7 +126,6 @@ jobs: uses: actions/checkout@v3 with: ref: ${{ github.event.pull_request.head.sha }} - submodules: recursive - name: Download uses: actions/download-artifact@v3 @@ -125,6 +133,15 @@ jobs: name: compile_commands path: ./be/build_Release + - name: Checkout clang-tidy review + run: | + rm -rf ./.github/actions/clang-tidy-review + git clone https://github.com/ZedThree/clang-tidy-review .github/actions/clang-tidy-review + + pushd .github/actions/clang-tidy-review &>/dev/null + git checkout 2c55ef8cfc9acb3715d433e58aea086dcec9b206 + popd &>/dev/null + - name: Run clang-tidy review uses: ./.github/actions/clang-tidy-review id: review diff --git a/.github/workflows/license-eyes.yml b/.github/workflows/license-eyes.yml index 823ac845b3b0c1..890efb2d9d1196 100644 --- a/.github/workflows/license-eyes.yml +++ b/.github/workflows/license-eyes.yml @@ -30,15 +30,12 @@ jobs: - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" if: ${{ github.event_name != 'pull_request_target' }} uses: actions/checkout@v3 - with: - submodules: recursive - name: Checkout ${{ github.ref }} ( ${{ github.event.pull_request.head.sha }} ) if: ${{ github.event_name == 'pull_request_target' }} uses: actions/checkout@v3 with: ref: ${{ github.event.pull_request.head.sha }} - submodules: recursive - name: Check License uses: apache/skywalking-eyes@v0.2.0 diff --git a/.github/workflows/pr-approve-status.yml b/.github/workflows/pr-approve-status.yml index cda6abc8f1c15b..ccb418ebb72e9d 100644 --- a/.github/workflows/pr-approve-status.yml +++ b/.github/workflows/pr-approve-status.yml @@ -33,7 +33,7 @@ jobs: echo "PR number is not set" exit 1 fi - response=$(curl -s -H "Authorization: token ${{ 
secrets.GITHUB_TOKEN }} " "https://api.github.com/repos/apache/doris/pulls/${pr_num}/reviews") + response=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }} " "https://api.github.com/repos/apache/doris/pulls/${pr_num}/reviews?per_page=100") # shellcheck disable=SC2207 reviewers=($(echo $response | jq -r '.[] | .user.login')) # shellcheck disable=SC2207 diff --git a/.gitmodules b/.gitmodules index 9fe51bfd1d056d..a4e579b1794833 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,9 +4,6 @@ [submodule ".github/actions/get-workflow-origin"] path = .github/actions/get-workflow-origin url = https://github.com/potiuk/get-workflow-origin.git -[submodule ".github/actions/clang-format-lint-action"] - path = .github/actions/clang-format-lint-action - url = https://github.com/DoozyX/clang-format-lint-action.git [submodule ".github/actions/setup-maven"] path = .github/actions/setup-maven url = https://github.com/stCarolas/setup-maven.git @@ -19,12 +16,6 @@ [submodule ".github/actions/ccache-action"] path = .github/actions/ccache-action url = https://github.com/hendrikmuhs/ccache-action -[submodule ".github/actions/action-sh-checker"] - path = .github/actions/action-sh-checker - url = https://github.com/luizm/action-sh-checker -[submodule ".github/actions/clang-tidy-review"] - path = .github/actions/clang-tidy-review - url = https://github.com/ZedThree/clang-tidy-review.git [submodule "be/src/apache-orc"] path = be/src/apache-orc url = https://github.com/apache/doris-thirdparty.git diff --git a/.idea/vcs.xml b/.idea/vcs.xml index cc4ed74dc0854a..0a185d9bdc7c46 100644 --- a/.idea/vcs.xml +++ b/.idea/vcs.xml @@ -6,9 +6,9 @@ The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - + http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -16,19 +16,19 @@ limitations under the License. --> - - - - - - - - - \ No newline at end of file + + + + + + + + + diff --git a/README.md b/README.md index cecece032825c6..d14ad11deb98a5 100644 --- a/README.md +++ b/README.md @@ -35,8 +35,6 @@ Apache Doris is an easy-to-use, high-performance and real-time analytical databa All this makes Apache Doris an ideal tool for scenarios including report analysis, ad-hoc query, unified data warehouse, and data lake query acceleration. On Apache Doris, users can build various applications, such as user behavior analysis, AB test platform, log retrieval analysis, user portrait analysis, and order analysis. -Doris Summit Asia 2023 is coming and warmly invite you to join! Click Now 🔗[doris-summit.org.cn](https://doris-summit.org.cn/?utm_source=website&utm_medium=readme&utm_campaign=2023&utm_id=2023) - 🎉 Version 2.0.2 version released now. The 2.0.2 version has achieved over 10x performance improvements on standard Benchmark, comprehensive enhancement in log analysis and lakehouse scenarios, more efficient and stable data update and write efficiency, support for more comprehensive multi-tenant and resource isolation mechanisms, and take a new step in the direction of resource elasticity and storage computing separation. It has also been added a series of usability features for enterprise users. 
We welcome all users who need the new features of the 2.0 version to deploy and upgrade. Check out the 🔗[Release Notes](https://github.com/apache/doris/issues/25011) here. 🎉 Version 1.2.7 released now! It is a fully evolved release, and all users are encouraged to upgrade to it. Check out the 🔗[Release Notes](https://doris.apache.org/docs/dev/releasenotes/release-1.2.7) here. diff --git a/be/cmake/thirdparty.cmake b/be/cmake/thirdparty.cmake index 5adf108ab61191..4a83fdc92f9187 100644 --- a/be/cmake/thirdparty.cmake +++ b/be/cmake/thirdparty.cmake @@ -145,17 +145,6 @@ endif() add_thirdparty(minizip LIB64) add_thirdparty(simdjson LIB64) add_thirdparty(idn LIB64) -add_thirdparty(opentelemetry_common LIB64) -add_thirdparty(opentelemetry_exporter_zipkin_trace LIB64) -add_thirdparty(opentelemetry_resources LIB64) -add_thirdparty(opentelemetry_version LIB64) -add_thirdparty(opentelemetry_exporter_ostream_span LIB64) -add_thirdparty(opentelemetry_trace LIB64) -add_thirdparty(opentelemetry_http_client_curl LIB64) -add_thirdparty(opentelemetry_exporter_otlp_http LIB64) -add_thirdparty(opentelemetry_exporter_otlp_http_client LIB64) -add_thirdparty(opentelemetry_otlp_recordable LIB64) -add_thirdparty(opentelemetry_proto LIB64) add_thirdparty(xml2 LIB64) add_thirdparty(lzma LIB64) add_thirdparty(gsasl) diff --git a/be/src/agent/cgroup_cpu_ctl.cpp b/be/src/agent/cgroup_cpu_ctl.cpp index 4a9aed2bb0f19c..d16a32b7be5433 100644 --- a/be/src/agent/cgroup_cpu_ctl.cpp +++ b/be/src/agent/cgroup_cpu_ctl.cpp @@ -54,6 +54,19 @@ void CgroupCpuCtl::update_cpu_hard_limit(int cpu_hard_limit) { } } +void CgroupCpuCtl::update_cpu_soft_limit(int cpu_shares) { + if (!_init_succ) { + return; + } + std::lock_guard w_lock(_lock_mutex); + if (_cpu_shares != cpu_shares) { + Status ret = modify_cg_cpu_soft_limit_no_lock(cpu_shares); + if (ret.ok()) { + _cpu_shares = cpu_shares; + } + } +} + Status CgroupCpuCtl::write_cg_sys_file(std::string file_path, int value, std::string msg, bool is_append) { int fd = open(file_path.c_str(), is_append ?
O_RDWR | O_APPEND : O_RDWR); @@ -97,9 +110,11 @@ Status CgroupV1CpuCtl::init() { } } - // quota path + // quota file _cgroup_v1_cpu_tg_quota_file = _cgroup_v1_cpu_tg_path + "/cpu.cfs_quota_us"; - // task path + // cpu.shares file + _cgroup_v1_cpu_tg_shares_file = _cgroup_v1_cpu_tg_path + "/cpu.shares"; + // task file _cgroup_v1_cpu_tg_task_file = _cgroup_v1_cpu_tg_path + "/tasks"; LOG(INFO) << "cgroup v1 cpu path init success" << ", query tg path=" << _cgroup_v1_cpu_tg_path @@ -110,6 +125,11 @@ Status CgroupV1CpuCtl::init() { return Status::OK(); } +Status CgroupV1CpuCtl::modify_cg_cpu_soft_limit_no_lock(int cpu_shares) { + std::string msg = "modify cpu shares to " + std::to_string(cpu_shares); + return CgroupCpuCtl::write_cg_sys_file(_cgroup_v1_cpu_tg_shares_file, cpu_shares, msg, false); +} + Status CgroupV1CpuCtl::modify_cg_cpu_hard_limit_no_lock(int cpu_hard_limit) { int val = _cpu_cfs_period_us * _cpu_core_num * cpu_hard_limit / 100; std::string msg = "modify cpu quota value to " + std::to_string(val); @@ -120,9 +140,14 @@ Status CgroupV1CpuCtl::add_thread_to_cgroup() { if (!_init_succ) { return Status::OK(); } +#if defined(__APPLE__) + // unsupported now + return Status::OK(); +#else int tid = static_cast<int>(syscall(SYS_gettid)); std::string msg = "add thread " + std::to_string(tid) + " to group"; std::lock_guard w_lock(_lock_mutex); return CgroupCpuCtl::write_cg_sys_file(_cgroup_v1_cpu_tg_task_file, tid, msg, true); +#endif } } // namespace doris diff --git a/be/src/agent/cgroup_cpu_ctl.h b/be/src/agent/cgroup_cpu_ctl.h index c3a30660147d63..b98e268da09464 100644 --- a/be/src/agent/cgroup_cpu_ctl.h +++ b/be/src/agent/cgroup_cpu_ctl.h @@ -28,6 +28,12 @@ namespace doris { +// cgroup cpu.cfs_quota_us default value, which means the cpu hard limit is disabled +const static int CPU_HARD_LIMIT_DEFAULT_VALUE = -1; + +// cgroup cpu.shares default value +const static uint64_t CPU_SOFT_LIMIT_DEFAULT_VALUE = 1024; + class CgroupCpuCtl { public: virtual ~CgroupCpuCtl() = default; @@ -35,15 +41,19 @@ class CgroupCpuCtl { virtual Status init(); - virtual Status modify_cg_cpu_hard_limit_no_lock(int cpu_hard_limit) = 0; - virtual Status add_thread_to_cgroup() = 0; void update_cpu_hard_limit(int cpu_hard_limit); + void update_cpu_soft_limit(int cpu_shares); + protected: Status write_cg_sys_file(std::string file_path, int value, std::string msg, bool is_append); + virtual Status modify_cg_cpu_hard_limit_no_lock(int cpu_hard_limit) = 0; + + virtual Status modify_cg_cpu_soft_limit_no_lock(int cpu_shares) = 0; + std::string _doris_cgroup_cpu_path; uint64_t _cpu_core_num = CpuInfo::num_cores(); uint64_t _cpu_cfs_period_us = 100000; @@ -51,6 +61,7 @@ class CgroupCpuCtl { std::shared_mutex _lock_mutex; bool _init_succ = false; uint64_t _tg_id; // workload group id + uint64_t _cpu_shares = 0; }; /* 1 doris home path: /sys/fs/cgroup/cpu/{doris_home}/ 2 doris query path: /sys/fs/cgroup/cpu/{doris_home}/query 3 workload group path: /sys/fs/cgroup/cpu/{doris_home}/query/{workload group id} 4 workload group quota file: /sys/fs/cgroup/cpu/{doris_home}/query/{workload group id}/cpu.cfs_quota_us 5 workload group tasks file: /sys/fs/cgroup/cpu/{doris_home}/query/{workload group id}/tasks 6 workload group quota file: /sys/fs/cgroup/cpu/{doris_home}/query/{workload group id}/cpu.cfs_quota_us - 7 workload group tasks file: + 7 workload group tasks file: /sys/fs/cgroup/cpu/{doris_home}/query/{workload group id}/tasks + + 8 workload group cpu.shares file: + /sys/fs/cgroup/cpu/{doris_home}/query/{workload group id}/cpu.shares */ class CgroupV1CpuCtl : public CgroupCpuCtl { public: CgroupV1CpuCtl(uint64_t tg_id) : CgroupCpuCtl(tg_id) {} Status init() override; Status modify_cg_cpu_hard_limit_no_lock(int cpu_hard_limit) override; + Status modify_cg_cpu_soft_limit_no_lock(int cpu_shares) override; Status add_thread_to_cgroup() override; private: std::string
_cgroup_v1_cpu_query_path; std::string _cgroup_v1_cpu_tg_path; // workload group path std::string _cgroup_v1_cpu_tg_quota_file; + std::string _cgroup_v1_cpu_tg_shares_file; std::string _cgroup_v1_cpu_tg_task_file; }; diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index 6717af3ce4b837..698a71aec16c13 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -1949,6 +1949,10 @@ void StorageMediumMigrateTaskPool::_storage_medium_migrate_worker_thread_callbac EngineStorageMigrationTask engine_task(tablet, dest_store); status = StorageEngine::instance()->execute_task(&engine_task); } + // FE should ignore this error + if (status.is()) { + status = Status::OK(); + } if (!status.ok()) { LOG_WARNING("failed to migrate storage medium") .tag("signature", agent_task_req.signature) @@ -2011,8 +2015,9 @@ Status StorageMediumMigrateTaskPool::_check_migrate_request(const TStorageMedium *dest_store = stores[0]; } if (tablet->data_dir()->path() == (*dest_store)->path()) { - return Status::InternalError("tablet is already on specified path {}", - tablet->data_dir()->path()); + LOG_WARNING("tablet is already on specified path").tag("path", tablet->data_dir()->path()); + return Status::Error("tablet is already on specified path: {}", + tablet->data_dir()->path()); } // check local disk capacity @@ -2021,7 +2026,6 @@ Status StorageMediumMigrateTaskPool::_check_migrate_request(const TStorageMedium return Status::InternalError("reach the capacity limit of path {}, tablet_size={}", (*dest_store)->path(), tablet_size); } - return Status::OK(); } diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index deb99682fed450..a9b0848984b56a 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -319,7 +319,7 @@ DEFINE_Int32(index_page_cache_percentage, "10"); // whether to disable page cache feature in storage DEFINE_Bool(disable_storage_page_cache, "false"); // whether to disable row cache feature in storage -DEFINE_Bool(disable_storage_row_cache, "true"); +DEFINE_mBool(disable_storage_row_cache, "true"); // whether to disable pk page cache feature in storage DEFINE_Bool(disable_pk_storage_page_cache, "false"); @@ -457,7 +457,7 @@ DEFINE_Int32(webserver_num_workers, "48"); // Period to update rate counters and sampling counters in ms. DEFINE_mInt32(periodic_counter_update_period_ms, "500"); -DEFINE_Bool(enable_single_replica_load, "false"); +DEFINE_Bool(enable_single_replica_load, "true"); // Number of download workers for single replica load DEFINE_Int32(single_replica_load_download_num_workers, "64"); @@ -635,8 +635,6 @@ DEFINE_Bool(path_gc_check, "true"); DEFINE_mInt32(path_gc_check_interval_second, "86400"); DEFINE_mInt32(path_gc_check_step, "1000"); DEFINE_mInt32(path_gc_check_step_interval_ms, "10"); -DEFINE_mInt32(path_scan_interval_second, "86400"); -DEFINE_mInt32(path_scan_step_interval_ms, "70"); // The following 2 configs limit the max usage of disk capacity of a data dir. // If both of these 2 thresholds are reached, no more data can be written into that data dir.
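As a reading aid for the CgroupCpuCtl changes above, here is a minimal, hypothetical caller showing how a workload group could apply the new cpu.shares soft limit alongside the existing hard limit. The function name and the doubling policy are illustrative assumptions, not part of this patch.

#include <memory>

#include "agent/cgroup_cpu_ctl.h"

// Hypothetical usage sketch (not from this patch): apply both CPU limits
// for one workload group after the cgroup paths have been initialized.
void apply_cpu_limits_for_group(uint64_t tg_id) {
    auto ctl = std::make_unique<doris::CgroupV1CpuCtl>(tg_id);
    if (!ctl->init().ok()) {
        return; // cgroup path missing or not writable; skip both limits
    }
    // CPU_HARD_LIMIT_DEFAULT_VALUE (-1) leaves cpu.cfs_quota_us disabled.
    ctl->update_cpu_hard_limit(doris::CPU_HARD_LIMIT_DEFAULT_VALUE);
    // cpu.shares is a relative weight; 1024 is the cgroup v1 default, so
    // doubling it gives this group twice the default share under contention.
    ctl->update_cpu_soft_limit(static_cast<int>(2 * doris::CPU_SOFT_LIMIT_DEFAULT_VALUE));
}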
@@ -742,8 +740,6 @@ DEFINE_mDouble(tablet_version_graph_orphan_vertex_ratio, "0.1"); // share delta writers when memtable_on_sink_node = true DEFINE_Bool(share_delta_writers, "true"); -// number of brpc stream per load -DEFINE_Int32(num_streams_per_load, "5"); // timeout for open load stream rpc in ms DEFINE_Int64(open_load_stream_timeout_ms, "500"); @@ -817,27 +813,6 @@ DEFINE_String(function_service_protocol, "h2:grpc"); // use which load balancer to select server to connect DEFINE_String(rpc_load_balancer, "rr"); -// Enable tracing -// If this configuration is enabled, you should also specify the trace_export_url. -DEFINE_Bool(enable_tracing, "false"); - -// Enable opentelemtry collector -DEFINE_Bool(enable_otel_collector, "false"); - -// Current support for exporting traces: -// zipkin: Export traces directly to zipkin, which is used to enable the tracing feature quickly. -// collector: The collector can be used to receive and process traces and support export to a variety of -// third-party systems. -DEFINE_mString(trace_exporter, "zipkin"); -DEFINE_Validator(trace_exporter, [](const std::string& config) -> bool { - return config == "zipkin" || config == "collector"; -}); - -// The endpoint to export spans to. -// export to zipkin like: http://127.0.0.1:9411/api/v2/spans -// export to collector like: http://127.0.0.1:4318/v1/traces -DEFINE_String(trace_export_url, "http://127.0.0.1:9411/api/v2/spans"); - // The maximum buffer/queue size to collect span. After the size is reached, spans are dropped. // An export will be triggered when the number of spans in the queue reaches half of the maximum. DEFINE_Int32(max_span_queue_size, "2048"); @@ -969,6 +944,9 @@ DEFINE_Bool(enable_workload_group_for_scan, "false"); // Will remove after fully test. DEFINE_Bool(enable_index_apply_preds_except_leafnode_of_andnode, "true"); +DEFINE_mBool(enable_flatten_nested_for_variant, "false"); +DEFINE_mDouble(ratio_of_defaults_as_sparse_column, "0.95"); + // block file cache DEFINE_Bool(enable_file_cache, "false"); // format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240}] @@ -1013,8 +991,6 @@ DEFINE_Int32(max_depth_in_bkd_tree, "32"); DEFINE_Bool(inverted_index_compaction_enable, "false"); // use num_broadcast_buffer blocks as buffer to do broadcast DEFINE_Int32(num_broadcast_buffer, "32"); -// semi-structure configs -DEFINE_Bool(enable_parse_multi_dimession_array, "false"); // max depth of expression tree allowed. DEFINE_Int32(max_depth_of_expr_tree, "600"); @@ -1034,6 +1010,8 @@ DEFINE_mInt32(s3_write_buffer_size, "5242880"); // can at most buffer 50MB data. 
And the num of multi part upload task is // s3_write_buffer_whole_size / s3_write_buffer_size DEFINE_mInt32(s3_write_buffer_whole_size, "524288000"); +// The timeout config for S3 buffer allocation +DEFINE_mInt32(s3_writer_buffer_allocation_timeout, "300"); DEFINE_mInt64(file_cache_max_file_reader_cache_size, "1000000"); //disable shrink memory by default @@ -1096,6 +1074,7 @@ DEFINE_String(group_commit_replay_wal_dir, "./wal"); DEFINE_Int32(group_commit_replay_wal_retry_num, "10"); DEFINE_Int32(group_commit_replay_wal_retry_interval_seconds, "5"); DEFINE_Int32(group_commit_sync_wal_batch, "10"); +DEFINE_Bool(wait_internal_group_commit_finish, "false"); // the count of thread to group commit insert DEFINE_Int32(group_commit_insert_threads, "10"); @@ -1110,12 +1089,25 @@ DEFINE_Bool(enable_flush_file_cache_async, "true"); // cgroup DEFINE_String(doris_cgroup_cpu_path, ""); +DEFINE_Bool(enable_cgroup_cpu_soft_limit, "false"); DEFINE_Bool(ignore_always_true_predicate_for_segment, "true"); // Dir of default timezone files DEFINE_String(default_tzfiles_path, "${DORIS_HOME}/zoneinfo"); +// Max size (bytes) of group commit queues, used for mem back pressure, default 64M. +DEFINE_Int32(group_commit_max_queue_size, "67108864"); + +// Max size (bytes) of WAL disk usage, used for disk space back pressure, default 64M. +DEFINE_Int32(wal_max_disk_size, "67108864"); + +// Ingest binlog work pool size, -1 means disabled, 0 means hardware concurrency +DEFINE_Int32(ingest_binlog_work_pool_size, "-1"); + +// Download binlog rate limit, unit is KB/s, 0 means no limit +DEFINE_Int32(download_binlog_rate_limit_kbs, "0"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index 9855ba0ffecbd6..b6e911438ecad9 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -106,11 +106,11 @@ DECLARE_Int32(brpc_num_threads); // Declare a selection strategy for those servers have many ips. // Note that there should at most one ip match this list. -// this is a list in semicolon-delimited format, in CIDR notation, e.g. 10.10.10.0/24 +// This is a list in semicolon-delimited format, in CIDR notation, e.g. 10.10.10.0/24 // If no ip match this rule, will choose one randomly. DECLARE_String(priority_networks); -// performance moderate or or compact, only tcmalloc compile +// performance moderate or compact, only tcmalloc compile DECLARE_String(memory_mode); // process memory limit specified as number of bytes @@ -369,7 +369,7 @@ DECLARE_Int32(index_page_cache_percentage); // TODO delete it. Divided into Data page, Index page, pk index page DECLARE_Bool(disable_storage_page_cache); // whether to disable row cache feature in storage -DECLARE_Bool(disable_storage_row_cache); +DECLARE_mBool(disable_storage_row_cache); // whether to disable pk page cache feature in storage DECLARE_Bool(disable_pk_storage_page_cache); @@ -690,8 +690,6 @@ DECLARE_Bool(path_gc_check); DECLARE_mInt32(path_gc_check_interval_second); DECLARE_mInt32(path_gc_check_step); DECLARE_mInt32(path_gc_check_step_interval_ms); -DECLARE_mInt32(path_scan_interval_second); -DECLARE_mInt32(path_scan_step_interval_ms); // The following 2 configs limit the max usage of disk capacity of a data dir. // If both of these 2 thresholds are reached, no more data can be written into that data dir.
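To make the new back-pressure knobs above concrete, here is a small sketch of an admission check a writer might perform; the helper function and its call site are hypothetical assumptions, only the two config names come from this patch.

#include <cstdint>

#include "common/config.h"

// Hypothetical admission check (not from this patch): stop accepting new
// group commit data once the in-memory queues reach
// group_commit_max_queue_size (64M by default) or the WAL directory
// reaches wal_max_disk_size (also 64M by default).
bool can_accept_group_commit(int64_t queued_bytes, int64_t wal_bytes_on_disk) {
    return queued_bytes < doris::config::group_commit_max_queue_size &&
           wal_bytes_on_disk < doris::config::wal_max_disk_size;
}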
@@ -799,8 +797,6 @@ DECLARE_mDouble(tablet_version_graph_orphan_vertex_ratio); // share delta writers when memtable_on_sink_node = true DECLARE_Bool(share_delta_writers); -// number of brpc stream per load -DECLARE_Int32(num_streams_per_load); // timeout for open load stream rpc in ms DECLARE_Int64(open_load_stream_timeout_ms); @@ -875,24 +871,6 @@ DECLARE_String(function_service_protocol); // use which load balancer to select server to connect DECLARE_String(rpc_load_balancer); -// Enable tracing -// If this configuration is enabled, you should also specify the trace_export_url. -DECLARE_Bool(enable_tracing); - -// Enable opentelemtry collector -DECLARE_Bool(enable_otel_collector); - -// Current support for exporting traces: -// zipkin: Export traces directly to zipkin, which is used to enable the tracing feature quickly. -// collector: The collector can be used to receive and process traces and support export to a variety of -// third-party systems. -DECLARE_mString(trace_exporter); - -// The endpoint to export spans to. -// export to zipkin like: http://127.0.0.1:9411/api/v2/spans -// export to collector like: http://127.0.0.1:4318/v1/traces -DECLARE_String(trace_export_url); - // The maximum buffer/queue size to collect span. After the size is reached, spans are dropped. // An export will be triggered when the number of spans in the queue reaches half of the maximum. DECLARE_Int32(max_span_queue_size); @@ -1055,8 +1033,6 @@ DECLARE_Int32(max_depth_in_bkd_tree); DECLARE_Bool(inverted_index_compaction_enable); // use num_broadcast_buffer blocks as buffer to do broadcast DECLARE_Int32(num_broadcast_buffer); -// semi-structure configs -DECLARE_Bool(enable_parse_multi_dimession_array); // max depth of expression tree allowed. DECLARE_Int32(max_depth_of_expr_tree); @@ -1076,6 +1052,8 @@ DECLARE_mInt32(s3_write_buffer_size); // can at most buffer 50MB data. And the num of multi part upload task is // s3_write_buffer_whole_size / s3_write_buffer_size DECLARE_mInt32(s3_write_buffer_whole_size); +// The timeout config for S3 buffer allocation +DECLARE_mInt32(s3_writer_buffer_allocation_timeout); // the max number of cached file handle for block segment DECLARE_mInt64(file_cache_max_file_reader_cache_size); //enable shrink memory @@ -1132,6 +1110,12 @@ DECLARE_mInt64(lookup_connection_cache_bytes_limit); // level of compression when using LZ4_HC, whose default value is LZ4HC_CLEVEL_DEFAULT DECLARE_mInt64(LZ4_HC_compression_level); +// Whether to flatten nested arrays in variant column +// Notice: TEST ONLY +DECLARE_mBool(enable_flatten_nested_for_variant); +// Threshold of a column as sparse column +// Notice: TEST ONLY +DECLARE_mDouble(ratio_of_defaults_as_sparse_column); DECLARE_mBool(enable_merge_on_write_correctness_check); @@ -1159,6 +1143,7 @@ DECLARE_String(group_commit_replay_wal_dir); DECLARE_Int32(group_commit_replay_wal_retry_num); DECLARE_Int32(group_commit_replay_wal_retry_interval_seconds); DECLARE_Int32(group_commit_sync_wal_batch); +DECLARE_Bool(wait_internal_group_commit_finish); // This config can be set to limit thread number in group commit insert thread pool.
DECLARE_mInt32(group_commit_insert_threads); @@ -1177,6 +1162,8 @@ DECLARE_mBool(exit_on_exception); // cgroup DECLARE_String(doris_cgroup_cpu_path); +DECLARE_Bool(enable_cgroup_cpu_soft_limit); + // This config controls whether the s3 file writer would flush cache asynchronously DECLARE_Bool(enable_flush_file_cache_async); @@ -1186,6 +1173,18 @@ DECLARE_Bool(ignore_always_true_predicate_for_segment); // Dir of default timezone files DECLARE_String(default_tzfiles_path); +// Max size (bytes) of group commit queues, used for mem back pressure. +DECLARE_Int32(group_commit_max_queue_size); + +// Max size (bytes) of WAL disk usage, used for disk space back pressure. +DECLARE_Int32(wal_max_disk_size); + +// Ingest binlog work pool size +DECLARE_Int32(ingest_binlog_work_pool_size); + +// Download binlog rate limit, unit is KB/s +DECLARE_Int32(download_binlog_rate_limit_kbs); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/common/exception.cpp b/be/src/common/exception.cpp index 9da69bc67827f3..37e357ec32f89c 100644 --- a/be/src/common/exception.cpp +++ b/be/src/common/exception.cpp @@ -21,7 +21,7 @@ #include "util/stack_util.h" namespace doris { -Exception::Exception(int code, const std::string_view msg) { +Exception::Exception(int code, const std::string_view& msg) { _code = code; _err_msg = std::make_unique(); _err_msg->_msg = msg; @@ -31,7 +31,7 @@ Exception::Exception(int code, const std::string_view msg) { } } -Exception::Exception(const Exception& nested, int code, const std::string_view msg) { +Exception::Exception(const Exception& nested, int code, const std::string_view& msg) { _code = code; _err_msg = std::make_unique(); _err_msg->_msg = msg; diff --git a/be/src/common/exception.h b/be/src/common/exception.h index 325bb67aa6b55b..299c0da043735e 100644 --- a/be/src/common/exception.h +++ b/be/src/common/exception.h @@ -37,14 +37,14 @@ inline thread_local int enable_thread_catch_bad_alloc = 0; class Exception : public std::exception { public: Exception() : _code(ErrorCode::OK) {} - Exception(int code, const std::string_view msg); + Exception(int code, const std::string_view& msg); // add nested exception as first param, or the template may not find // the correct method for ...args - Exception(const Exception& nested, int code, const std::string_view msg); + Exception(const Exception& nested, int code, const std::string_view& msg); // Format message with fmt::format, like the logging functions. template <typename... Args> - Exception(int code, const std::string_view fmt, Args&&...
args) : Exception(code, fmt::format(fmt, std::forward<Args>(args)...)) {} int code() const { return _code; } @@ -96,9 +96,8 @@ inline const std::string& Exception::to_string() const { return Status::MemoryLimitExceeded(fmt::format( \ "PreCatch error code:{}, {}, __FILE__:{}, __LINE__:{}, __FUNCTION__:{}", \ e.code(), e.to_string(), __FILE__, __LINE__, __PRETTY_FUNCTION__)); \ - } else { \ - return Status::Error(e.code(), e.to_string()); \ } \ + return Status::Error(e.code(), e.to_string()); \ } \ } while (0) @@ -118,8 +117,7 @@ inline const std::string& Exception::to_string() const { return Status::MemoryLimitExceeded(fmt::format( \ "PreCatch error code:{}, {}, __FILE__:{}, __LINE__:{}, __FUNCTION__:{}", \ e.code(), e.to_string(), __FILE__, __LINE__, __PRETTY_FUNCTION__)); \ - } else { \ - return Status::Error(e.code(), e.to_string()); \ } \ + return Status::Error(e.code(), e.to_string()); \ } \ } while (0) diff --git a/be/src/common/status.h b/be/src/common/status.h index 88977ce310d6c1..be232f08522f17 100644 --- a/be/src/common/status.h +++ b/be/src/common/status.h @@ -16,7 +16,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #ifdef ENABLE_STACKTRACE #include "util/stack_util.h" #endif @@ -321,7 +320,8 @@ constexpr bool capture_stacktrace(int code) { && code != ErrorCode::UNINITIALIZED && code != ErrorCode::PIP_WAIT_FOR_RF && code != ErrorCode::PIP_WAIT_FOR_SC - && code != ErrorCode::INVALID_ARGUMENT; + && code != ErrorCode::INVALID_ARGUMENT + && code != ErrorCode::DATA_QUALITY_ERR; } // clang-format on @@ -575,15 +575,6 @@ inline std::string Status::to_string() const { } \ } while (false); -#define RETURN_WITH_WARN_IF_ERROR(stmt, ret_code, warning_prefix) \ - do { \ - Status _s = (stmt); \ - if (UNLIKELY(!_s.ok())) { \ - LOG(WARNING) << (warning_prefix) << ", error: " << _s; \ - return ret_code; \ - } \ - } while (false); - #define RETURN_NOT_OK_STATUS_WITH_WARN(stmt, warning_prefix) \ do { \ Status _s = (stmt); \ @@ -606,17 +597,28 @@ using ResultError = unexpected; } \ } while (false) -// clang-format off -#define DORIS_TRY(stmt) \ - ({ \ - auto&& res = (stmt); \ - using T = std::decay_t<decltype(res)>; \ - static_assert(tl::detail::is_expected<T>::value); \ - if (!res.has_value()) [[unlikely]] { \ - return std::forward<T>(res).error(); \ - } \ - std::forward<T>(res).value(); \ }); -// clang-format on +#define DORIS_TRY(stmt) \ + ({ \ + auto&& res = (stmt); \ + using T = std::decay_t<decltype(res)>; \ + if (!res.has_value()) [[unlikely]] { \ + return std::forward<T>(res).error(); \ + } \ + std::forward<T>(res).value(); \ }); } // namespace doris + +// specify formatter for Status +template <> +struct fmt::formatter<doris::Status> { + template <typename ParseContext> + constexpr auto parse(ParseContext& ctx) { + return ctx.begin(); + } + + template <typename FormatContext> + auto format(doris::Status const& status, FormatContext& ctx) { + return fmt::format_to(ctx.out(), "{}", status.to_string()); + } +}; diff --git a/be/src/exec/data_sink.h b/be/src/exec/data_sink.h index 89f29b58998e43..285653c43b8d38 100644 --- a/be/src/exec/data_sink.h +++ b/be/src/exec/data_sink.h @@ -20,9 +20,8 @@ #pragma once -#include #include -// IWYU pragma: no_include + #include #include #include @@ -30,7 +29,6 @@ #include "common/status.h" #include "runtime/descriptors.h" #include "util/runtime_profile.h" -#include "util/telemetry/telemetry.h" namespace doris { @@ -84,7 +82,6 @@ class DataSink { // It must be okay to call this multiple times. Subsequent calls should // be ignored.
virtual Status close(RuntimeState* state, Status exec_status) { - profile()->add_to_span(_span); _closed = true; return Status::OK(); } @@ -129,8 +126,6 @@ class DataSink { // Maybe this will be transferred to BufferControlBlock. std::shared_ptr _query_statistics; - - OpentelemetrySpan _span {}; }; } // namespace doris diff --git a/be/src/exec/exec_node.cpp b/be/src/exec/exec_node.cpp index ed2c54a0704a28..c416420ef22974 100644 --- a/be/src/exec/exec_node.cpp +++ b/be/src/exec/exec_node.cpp @@ -29,7 +29,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/logging.h" @@ -127,9 +126,6 @@ Status ExecNode::init(const TPlanNode& tnode, RuntimeState* state) { Status ExecNode::prepare(RuntimeState* state) { DCHECK(_runtime_profile.get() != nullptr); - _span = state->get_tracer()->StartSpan(get_name()); - OpentelemetryScope scope {_span}; - _exec_timer = ADD_TIMER_WITH_LEVEL(runtime_profile(), "ExecTime", 1); _rows_returned_counter = ADD_COUNTER_WITH_LEVEL(_runtime_profile, "RowsReturned", TUnit::UNIT, 1); @@ -201,7 +197,6 @@ void ExecNode::release_resource(doris::RuntimeState* state) { COUNTER_SET(_rows_returned_counter, _num_rows_returned); } - runtime_profile()->add_to_span(_span); _is_resource_released = true; } } diff --git a/be/src/exec/exec_node.h b/be/src/exec/exec_node.h index d4d90546ffb5c9..f5f918731f7bca 100644 --- a/be/src/exec/exec_node.h +++ b/be/src/exec/exec_node.h @@ -36,7 +36,6 @@ #include "common/status.h" #include "runtime/descriptors.h" #include "util/runtime_profile.h" -#include "util/telemetry/telemetry.h" #include "vec/core/block.h" #include "vec/exprs/vexpr_fwd.h" @@ -231,8 +230,6 @@ class ExecNode { MemTracker* mem_tracker() const { return _mem_tracker.get(); } - OpentelemetrySpan get_next_span() { return _span; } - virtual std::string get_name(); // Names of counters shared by all exec nodes @@ -289,9 +286,6 @@ class ExecNode { // Account for peak memory used by this node RuntimeProfile::Counter* _peak_memory_usage_counter; - // - OpentelemetrySpan _span; - //NOTICE: now add a faker profile, because sometimes the profile record is useless //so we want remove some counters and timers, eg: in join node, if it's broadcast_join //and shared hash table, some counter/timer about build hash table is useless, diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h index 47005ef042faad..91a27d980f826d 100644 --- a/be/src/exec/olap_common.h +++ b/be/src/exec/olap_common.h @@ -55,7 +55,7 @@ std::string cast_to_string(T value, int scale) { } else if constexpr (primitive_type == TYPE_DECIMAL128I) { return ((vectorized::Decimal)value).to_string(scale); } else if constexpr (primitive_type == TYPE_DECIMAL256) { - return ((vectorized::Decimal)value).to_string(scale); + return ((vectorized::Decimal)value).to_string(scale); } else if constexpr (primitive_type == TYPE_TINYINT) { return std::to_string(static_cast(value)); } else if constexpr (primitive_type == TYPE_LARGEINT) { diff --git a/be/src/exec/rowid_fetcher.cpp b/be/src/exec/rowid_fetcher.cpp index 94fcd814bea6de..c7519c5b05ad14 100644 --- a/be/src/exec/rowid_fetcher.cpp +++ b/be/src/exec/rowid_fetcher.cpp @@ -193,6 +193,24 @@ Status RowIDFetcher::_merge_rpc_results(const PMultiGetRequest& request, return Status::OK(); } +bool _has_char_type(const TypeDescriptor& desc) { + switch (desc.type) { + case TYPE_CHAR: + return true; + case TYPE_ARRAY: + case TYPE_MAP: + case TYPE_STRUCT: + for (int idx = 0; idx < 
desc.children.size(); ++idx) { + if (_has_char_type(desc.children[idx])) { + return true; + } + } + return false; + default: + return false; + } +} + Status RowIDFetcher::fetch(const vectorized::ColumnPtr& column_row_ids, vectorized::Block* res_block) { CHECK(!_stubs.empty()); @@ -238,17 +256,10 @@ Status RowIDFetcher::fetch(const vectorized::ColumnPtr& column_row_ids, std::vector<size_t> char_type_idx; for (size_t i = 0; i < _fetch_option.desc->slots().size(); i++) { const auto& column_desc = _fetch_option.desc->slots()[i]; - const TypeDescriptor* type_desc = &column_desc->type(); - do { - if (type_desc->type == TYPE_CHAR) { - char_type_idx.emplace_back(i); - break; - } else if (type_desc->type != TYPE_ARRAY) { - break; - } - // for Array or Array> - type_desc = &type_desc->children[0]; - } while (true); + const TypeDescriptor& type_desc = column_desc->type(); + if (_has_char_type(type_desc)) { + char_type_idx.push_back(i); + } } res_block->shrink_char_type_column_suffix_zero(char_type_idx); VLOG_DEBUG << "dump block:" << res_block->dump_data(0, 10); diff --git a/be/src/exec/schema_scanner.cpp b/be/src/exec/schema_scanner.cpp index 9733558284a8fd..8d640341bfb9a6 100644 --- a/be/src/exec/schema_scanner.cpp +++ b/be/src/exec/schema_scanner.cpp @@ -60,8 +60,6 @@ namespace doris { class ObjectPool; -DorisServer* SchemaScanner::_s_doris_server; - SchemaScanner::SchemaScanner(const std::vector& columns) : _is_init(false), _param(nullptr), diff --git a/be/src/exec/schema_scanner.h b/be/src/exec/schema_scanner.h index 2d4e468c592676..ec4bc9e0d3da99 100644 --- a/be/src/exec/schema_scanner.h +++ b/be/src/exec/schema_scanner.h @@ -33,7 +33,7 @@ namespace doris { // forward declare class, because jni function init in DorisServer. -class DorisServer; + class RuntimeState; class ObjectPool; class TUserIdentity; @@ -101,8 +101,6 @@ class SchemaScanner { static std::unique_ptr<SchemaScanner> create(TSchemaTableType::type type); TSchemaTableType::type type() const { return _schema_table_type; } - static void set_doris_server(DorisServer* doris_server) { _s_doris_server = doris_server; } - protected: Status fill_dest_column_for_range(vectorized::Block* block, size_t pos, const std::vector& datas); @@ -113,8 +111,6 @@ class SchemaScanner { // schema table's column desc std::vector _columns; - static DorisServer* _s_doris_server; - TSchemaTableType::type _schema_table_type; RuntimeProfile::Counter* _get_db_timer = nullptr; diff --git a/be/src/exec/schema_scanner/schema_helper.cpp b/be/src/exec/schema_scanner/schema_helper.cpp index 3184aed4d2e4f5..b1d6900e0f5c2f 100644 --- a/be/src/exec/schema_scanner/schema_helper.cpp +++ b/be/src/exec/schema_scanner/schema_helper.cpp @@ -69,15 +69,6 @@ Status SchemaHelper::list_table_metadata_name_ids(const std::string& ip, const i }); } -Status SchemaHelper::describe_table(const std::string& ip, const int32_t port, - const TDescribeTableParams& request, - TDescribeTableResult* result) { - return ThriftRpcHelper::rpc( - ip, port, [&request, &result](FrontendServiceConnection& client) { - client->describeTable(*result, request); - }); -} - Status SchemaHelper::describe_tables(const std::string& ip, const int32_t port, const TDescribeTablesParams& request, TDescribeTablesResult* result) { diff --git a/be/src/exec/schema_scanner/schema_helper.h b/be/src/exec/schema_scanner/schema_helper.h index 900f963f7893cc..c0143136115bf1 100644 --- a/be/src/exec/schema_scanner/schema_helper.h +++ b/be/src/exec/schema_scanner/schema_helper.h @@ -55,10 +55,6 @@ class SchemaHelper { const doris::TGetTablesParams&
request, TListTableMetadataNameIdsResult* result); - static Status describe_table(const std::string& ip, const int32_t port, - const TDescribeTableParams& desc_params, - TDescribeTableResult* desc_result); - static Status describe_tables(const std::string& ip, const int32_t port, const TDescribeTablesParams& desc_params, TDescribeTablesResult* desc_result); diff --git a/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp b/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp index e69596ef8fa0a7..ef7b2b69c1e710 100644 --- a/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp @@ -193,13 +193,13 @@ Status SchemaMetadataNameIdsScanner::_fill_block_impl(vectorized::Block* block) RETURN_IF_ERROR(fill_dest_column_for_range(block, 3, null_datas)); } } - // table_id + // table_id { int64_t srcs[table_num]; for (int i = 0; i < table_num; ++i) { if (_table_result.tables[i].__isset.id) { srcs[i] = _table_result.tables[i].id; - datas[i] = &srcs; + datas[i] = srcs + i; } else { datas[i] = nullptr; } diff --git a/be/src/exec/tablet_info.cpp b/be/src/exec/tablet_info.cpp index 9a24771edc08c1..90d434625816ef 100644 --- a/be/src/exec/tablet_info.cpp +++ b/be/src/exec/tablet_info.cpp @@ -17,7 +17,6 @@ #include "exec/tablet_info.h" -#include #include #include #include @@ -26,6 +25,7 @@ #include #include +#include #include #include @@ -130,7 +130,7 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { for (auto& col : pschema.partial_update_input_columns()) { _partial_update_input_columns.insert(col); } - std::unordered_map, SlotDescriptor*> slots_map; + std::unordered_map, SlotDescriptor*> slots_map; _tuple_desc = _obj_pool.add(new TupleDescriptor(pschema.tuple_desc())); for (auto& p_slot_desc : pschema.slot_descs()) { @@ -138,7 +138,8 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { _tuple_desc->add_slot(slot_desc); string data_type; EnumToString(TPrimitiveType, to_thrift(slot_desc->col_type()), data_type); - slots_map.emplace(std::make_pair(to_lower(slot_desc->col_name()), std::move(data_type)), + slots_map.emplace(std::make_pair(to_lower(slot_desc->col_name()), + TabletColumn::get_field_type_by_string(data_type)), slot_desc); } @@ -149,8 +150,9 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { for (auto& pcolumn_desc : p_index.columns_desc()) { if (!_is_partial_update || _partial_update_input_columns.count(pcolumn_desc.name()) > 0) { - auto it = slots_map.find( - std::make_pair(to_lower(pcolumn_desc.name()), pcolumn_desc.type())); + auto it = slots_map.find(std::make_pair( + to_lower(pcolumn_desc.name()), + TabletColumn::get_field_type_by_string(pcolumn_desc.type()))); if (it == std::end(slots_map)) { return Status::InternalError("unknown index column, column={}, type={}", pcolumn_desc.name(), pcolumn_desc.type()); @@ -322,49 +324,20 @@ Status VOlapTablePartitionParam::init() { } } - _partitions_map.reset( - new std::map( - VOlapTablePartKeyComparator(_partition_slot_locs, _transformed_slot_locs))); + _partitions_map = std::make_unique< + std::map>( + VOlapTablePartKeyComparator(_partition_slot_locs, _transformed_slot_locs)); if (_t_param.__isset.distributed_columns) { for (auto& col : _t_param.distributed_columns) { RETURN_IF_ERROR(find_slot_locs(col, _distributed_slot_locs, "distributed")); } } - if (_distributed_slot_locs.empty()) { - _compute_tablet_index = [](BlockRow* key, - const VOlapTablePartition& partition) -> 
uint32_t { - if (partition.load_tablet_idx == -1) { - // load_to_single_tablet = false, just do random - return butil::fast_rand() % partition.num_buckets; - } - // load_to_single_tablet = ture, do round-robin - return partition.load_tablet_idx % partition.num_buckets; - }; - } else { - _compute_tablet_index = [this](BlockRow* key, - const VOlapTablePartition& partition) -> uint32_t { - uint32_t hash_val = 0; - for (int i = 0; i < _distributed_slot_locs.size(); ++i) { - auto slot_desc = _slots[_distributed_slot_locs[i]]; - auto& column = key->first->get_by_position(_distributed_slot_locs[i]).column; - auto val = column->get_data_at(key->second); - if (val.data != nullptr) { - hash_val = RawValue::zlib_crc32(val.data, val.size, slot_desc->type().type, - hash_val); - } else { - hash_val = HashUtil::zlib_crc_hash_null(hash_val); - } - } - return hash_val % partition.num_buckets; - }; - } // for both auto/non-auto partition table. _is_in_partition = _part_type == TPartitionType::type::LIST_PARTITIONED; // initial partitions - for (int i = 0; i < _t_param.partitions.size(); ++i) { - const TOlapTablePartition& t_part = _t_param.partitions[i]; + for (const auto& t_part : _t_param.partitions) { VOlapTablePartition* part = nullptr; RETURN_IF_ERROR(generate_partition_from(t_part, part)); _partitions.emplace_back(part); @@ -383,26 +356,6 @@ Status VOlapTablePartitionParam::init() { return Status::OK(); } -bool VOlapTablePartitionParam::find_partition(BlockRow* block_row, - const VOlapTablePartition** partition) const { - // block_row is gave by inserting process. So try to use transformed index. - auto it = - _is_in_partition - ? _partitions_map->find(std::tuple {block_row->first, block_row->second, true}) - : _partitions_map->upper_bound( - std::tuple {block_row->first, block_row->second, true}); - // for list partition it might result in default partition - if (_is_in_partition) { - *partition = (it != _partitions_map->end()) ? 
it->second : _default_partition; - it = _partitions_map->end(); - } - if (it != _partitions_map->end() && - _part_contains(it->second, std::tuple {block_row->first, block_row->second, true})) { - *partition = it->second; - } - return (*partition != nullptr); -} - bool VOlapTablePartitionParam::_part_contains(VOlapTablePartition* part, BlockRowWithIndicator key) const { // start_key.second == -1 means only single partition @@ -411,11 +364,6 @@ bool VOlapTablePartitionParam::_part_contains(VOlapTablePartition* part, !comparator(key, std::tuple {part->start_key.first, part->start_key.second, false}); } -uint32_t VOlapTablePartitionParam::find_tablet(BlockRow* block_row, - const VOlapTablePartition& partition) const { - return _compute_tablet_index(block_row, partition); -} - Status VOlapTablePartitionParam::_create_partition_keys(const std::vector<TExprNode>& t_exprs, BlockRow* part_key) { for (int i = 0; i < t_exprs.size(); i++) { diff --git a/be/src/exec/tablet_info.h b/be/src/exec/tablet_info.h index ec12dcbfcd377c..bb9fbd8bc6013a 100644 --- a/be/src/exec/tablet_info.h +++ b/be/src/exec/tablet_info.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include @@ -33,6 +34,8 @@ #include "common/object_pool.h" #include "common/status.h" +#include "runtime/descriptors.h" +#include "runtime/raw_value.h" #include "vec/columns/column.h" #include "vec/core/block.h" #include "vec/core/column_with_type_and_name.h" @@ -162,9 +165,78 @@ class VOlapTablePartitionParam { int64_t version() const { return _t_param.version; } // return true if we found this block_row in partition - bool find_partition(BlockRow* block_row, const VOlapTablePartition** partition) const; + // TODO: use virtual function to refactor it + ALWAYS_INLINE bool find_partition(vectorized::Block* block, int row, + VOlapTablePartition*& partition) const { + auto it = _is_in_partition ? _partitions_map->find(std::tuple {block, row, true}) + : _partitions_map->upper_bound(std::tuple {block, row, true}); + // for list partition it might result in default partition + if (_is_in_partition) { + partition = (it != _partitions_map->end()) ? it->second : _default_partition; + it = _partitions_map->end(); + } + if (it != _partitions_map->end() && + _part_contains(it->second, std::tuple {block, row, true})) { + partition = it->second; + } + return (partition != nullptr); + } + + ALWAYS_INLINE void find_tablets( + vectorized::Block* block, const std::vector<uint32_t>& indexes, + const std::vector<VOlapTablePartition*>& partitions, + std::vector<uint32_t>& tablet_indexes /*result*/, + /*TODO: check if flat hash map will be better*/ + std::map<int64_t, uint32_t>* partition_tablets_buffer = nullptr) const { + std::function<uint32_t(vectorized::Block*, uint32_t, const VOlapTablePartition&)> + compute_function; + if (!_distributed_slot_locs.empty()) { + // TODO: refactor by saving the hash values; then we can calculate column-wise.
+ compute_function = [this](vectorized::Block* block, uint32_t row, + const VOlapTablePartition& partition) -> uint32_t { + uint32_t hash_val = 0; + for (unsigned short _distributed_slot_loc : _distributed_slot_locs) { + auto* slot_desc = _slots[_distributed_slot_loc]; + auto& column = block->get_by_position(_distributed_slot_loc).column; + auto val = column->get_data_at(row); + if (val.data != nullptr) { + hash_val = RawValue::zlib_crc32(val.data, val.size, slot_desc->type().type, + hash_val); + } else { + hash_val = HashUtil::zlib_crc_hash_null(hash_val); + } + } + return hash_val % partition.num_buckets; + }; + } else { // random distribution + compute_function = [](vectorized::Block* block, uint32_t row, + const VOlapTablePartition& partition) -> uint32_t { + if (partition.load_tablet_idx == -1) { + // load_to_single_tablet = false, just do random + return butil::fast_rand() % partition.num_buckets; + } + // load_to_single_tablet = true, do round-robin + return partition.load_tablet_idx % partition.num_buckets; + }; + } - uint32_t find_tablet(BlockRow* block_row, const VOlapTablePartition& partition) const; + if (partition_tablets_buffer == nullptr) { + for (auto index : indexes) { + tablet_indexes[index] = compute_function(block, index, *partitions[index]); + } + } else { // use buffer + for (auto index : indexes) { + auto& partition_id = partitions[index]->id; + if (auto it = partition_tablets_buffer->find(partition_id); + it != partition_tablets_buffer->end()) { + tablet_indexes[index] = it->second; // reuse cached tablet index + continue; + } + // compute and save in buffer + (*partition_tablets_buffer)[partition_id] = tablet_indexes[index] = + compute_function(block, index, *partitions[index]); + } + } + } const std::vector<VOlapTablePartition*>& get_partitions() const { return _partitions; } @@ -193,8 +265,6 @@ class VOlapTablePartitionParam { Status _create_partition_key(const TExprNode& t_expr, BlockRow* part_key, uint16_t pos); - std::function _compute_tablet_index; - // check if this partition contains this key bool _part_contains(VOlapTablePartition* part, BlockRowWithIndicator key) const; diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp index 30608adeb2546a..ace18f1090f1cc 100644 --- a/be/src/exprs/json_functions.cpp +++ b/be/src/exprs/json_functions.cpp @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include // IWYU pragma: keep #include @@ -32,7 +34,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/logging.h" @@ -316,4 +317,33 @@ Status JsonFunctions::extract_from_object(simdjson::ondemand::object& obj, return Status::OK(); } +std::string JsonFunctions::print_json_value(const rapidjson::Value& value) { + rapidjson::StringBuffer buffer; + buffer.Clear(); + rapidjson::Writer<rapidjson::StringBuffer> writer(buffer); + value.Accept(writer); + return std::string(buffer.GetString()); +} + +void JsonFunctions::merge_objects(rapidjson::Value& dst_object, rapidjson::Value& src_object, + rapidjson::Document::AllocatorType& allocator) { + if (!src_object.IsObject()) { + return; + } + for (auto src_it = src_object.MemberBegin(); src_it != src_object.MemberEnd(); ++src_it) { + auto dst_it = dst_object.FindMember(src_it->name); + if (dst_it != dst_object.MemberEnd()) { + if (src_it->value.IsObject()) { + merge_objects(dst_it->value, src_it->value, allocator); + } else { + if (dst_it->value.IsNull()) { + dst_it->value = src_it->value; + } + } + } else { + dst_object.AddMember(src_it->name, src_it->value, allocator); + } + } +} + } //
namespace doris diff --git a/be/src/exprs/json_functions.h b/be/src/exprs/json_functions.h index fcec257142fc7b..72aa522ff374fa 100644 --- a/be/src/exprs/json_functions.h +++ b/be/src/exprs/json_functions.h @@ -108,6 +108,13 @@ class JsonFunctions { static Status extract_from_object(simdjson::ondemand::object& obj, const std::vector& jsonpath, simdjson::ondemand::value* value) noexcept; + // src: {"a" : {"b" : {"c" : 1}}, "e" : 123} + // dst: {"a" : {"b" : {"d" : 1}}} + // merged: {"a" : {"b" : {"c" : 1, "d" : 1}}, "e" : 123} + static void merge_objects(rapidjson::Value& dst_object, rapidjson::Value& src_object, + rapidjson::Document::AllocatorType& allocator); + + static std::string print_json_value(const rapidjson::Value& value); private: static rapidjson::Value* match_value(const std::vector& parsed_paths, diff --git a/be/src/exprs/math_functions.cpp b/be/src/exprs/math_functions.cpp index bbfdc6053dc950..202a6eabea3e80 100644 --- a/be/src/exprs/math_functions.cpp +++ b/be/src/exprs/math_functions.cpp @@ -23,7 +23,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep // IWYU pragma: no_include #include diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index c6e64fd0e55ef3..1c7cb4b5c13b04 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -42,6 +42,7 @@ #include "exprs/hybrid_set.h" #include "exprs/minmax_predicate.h" #include "gutil/strings/substitute.h" +#include "pipeline/pipeline_x/dependency.h" #include "runtime/define_primitive_type.h" #include "runtime/large_int_value.h" #include "runtime/primitive_type.h" @@ -62,7 +63,6 @@ #include "vec/exprs/vliteral.h" #include "vec/exprs/vruntimefilter_wrapper.h" #include "vec/runtime/shared_hash_table_controller.h" - namespace doris { // PrimitiveType-> PColumnType @@ -1185,17 +1185,17 @@ bool IRuntimeFilter::await() { return true; } +// NOTE: Waiting infinitely does not make the scan task wait forever, +// because BlockTaskSchedule will run it once the query times out. +bool IRuntimeFilter::wait_infinitely() const { + // a bitmap filter is a precise filter and only filters once, so it must be applied. + return _wait_infinitely || + (_wrapper != nullptr && _wrapper->get_real_type() == RuntimeFilterType::BITMAP_FILTER); +} + bool IRuntimeFilter::is_ready_or_timeout() { DCHECK(is_consumer()); auto cur_state = _rf_state_atomic.load(std::memory_order_acquire); - auto execution_timeout = _state == nullptr ? _query_ctx->execution_timeout() * 1000 : _state->execution_timeout() * 1000; - auto runtime_filter_wait_time_ms = _state == nullptr ? _query_ctx->runtime_filter_wait_time_ms() : _state->runtime_filter_wait_time_ms(); - // bitmap filter is precise filter and only filter once, so it must be applied. - int64_t wait_times_ms = _wrapper->get_real_type() == RuntimeFilterType::BITMAP_FILTER ? execution_timeout : runtime_filter_wait_time_ms; int64_t ms_since_registration = MonotonicMillis() - registration_time_; if (!_enable_pipeline_exec) { _rf_state = RuntimeFilterState::TIME_OUT; @@ -1212,7 +1212,7 @@ bool IRuntimeFilter::is_ready_or_timeout() { if (is_ready()) { return true; } - bool timeout = wait_times_ms <= ms_since_registration; + bool timeout = wait_infinitely() ?
false : _rf_wait_time_ms <= ms_since_registration; auto expected = RuntimeFilterState::NOT_READY; if (timeout) { if (!_rf_state_atomic.compare_exchange_strong(expected, RuntimeFilterState::TIME_OUT, @@ -1235,6 +1235,11 @@ void IRuntimeFilter::signal() { DCHECK(is_consumer()); if (_enable_pipeline_exec) { _rf_state_atomic.store(RuntimeFilterState::READY); + if (!_filter_timer.empty()) { + for (auto& timer : _filter_timer) { + timer->call_ready(); + } + } } else { std::unique_lock lock(_inner_mutex); _rf_state = RuntimeFilterState::READY; @@ -1255,6 +1260,10 @@ void IRuntimeFilter::signal() { } } +void IRuntimeFilter::set_filter_timer(std::shared_ptr<pipeline::RuntimeFilterTimer> timer) { + _filter_timer.push_back(timer); +} + BloomFilterFuncBase* IRuntimeFilter::get_bloomfilter() const { return _wrapper->get_bloomfilter(); } diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index fdd3b02ad6351a..7a65706f5ad851 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -69,6 +69,10 @@ class VExprContext; struct SharedRuntimeFilterContext; } // namespace vectorized +namespace pipeline { +class RuntimeFilterTimer; +} // namespace pipeline + enum class RuntimeFilterType { UNKNOWN_FILTER = -1, IN_FILTER = 0, @@ -207,6 +211,8 @@ class IRuntimeFilter { _always_true(false), _is_ignored(false), registration_time_(MonotonicMillis()), + _wait_infinitely(_state->runtime_filter_wait_infinitely()), + _rf_wait_time_ms(_state->runtime_filter_wait_time_ms()), _enable_pipeline_exec(_state->enable_pipeline_exec()), _profile(new RuntimeProfile(_name)) { if (desc->__isset.min_max_type && desc->type == TRuntimeFilterType::MIN_MAX) { @@ -232,6 +238,8 @@ class IRuntimeFilter { _always_true(false), _is_ignored(false), registration_time_(MonotonicMillis()), + _wait_infinitely(query_ctx->runtime_filter_wait_infinitely()), + _rf_wait_time_ms(query_ctx->runtime_filter_wait_time_ms()), _enable_pipeline_exec(query_ctx->enable_pipeline_exec()), _profile(new RuntimeProfile(_name)) { if (desc->__isset.min_max_type && desc->type == TRuntimeFilterType::MIN_MAX) { @@ -384,6 +392,25 @@ class IRuntimeFilter { } } + // For pipelineX & Producer + int32_t wait_time_ms() const { + int32_t res = 0; + if (wait_infinitely()) { + res = _state == nullptr ? _query_ctx->execution_timeout() : _state->execution_timeout(); + // Convert to ms + res *= 1000; + } else { + res = _rf_wait_time_ms; + } + return res; + } + + bool wait_infinitely() const; + + int64_t registration_time() const { return registration_time_; } + + void set_filter_timer(std::shared_ptr<pipeline::RuntimeFilterTimer>); +
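Taken together with is_ready_or_timeout() above, the wait budget reduces to one decision: a filter that must always be applied (a bitmap filter, or one with runtime_filter_wait_infinitely set) may wait out the whole query timeout, while every other filter waits at most runtime_filter_wait_time_ms. A self-contained sketch of that decision, with assumed sample values:

#include <cstdint>
#include <iostream>

// Mirrors wait_time_ms(): an "apply always" filter gets the query's own
// execution timeout (seconds -> ms); everything else gets the RF wait time.
int32_t effective_wait_ms(bool wait_infinitely, int32_t execution_timeout_s,
                          int32_t rf_wait_time_ms) {
    return wait_infinitely ? execution_timeout_s * 1000 : rf_wait_time_ms;
}

// Mirrors the consumer-side check: when waiting infinitely the consumer never
// flips to TIME_OUT on its own; the scheduler runs it once the query times out.
bool timed_out(bool wait_infinitely, int64_t rf_wait_time_ms, int64_t elapsed_ms) {
    return wait_infinitely ? false : rf_wait_time_ms <= elapsed_ms;
}

int main() {
    std::cout << effective_wait_ms(true, 900, 1000) << '\n';  // 900000
    std::cout << effective_wait_ms(false, 900, 1000) << '\n'; // 1000
    std::cout << timed_out(false, 1000, 1500) << '\n';        // 1 (timed out)
}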
protected: // serialize _wrapper to protobuf void to_protobuf(PInFilter* filter); @@ -464,6 +491,9 @@ class IRuntimeFilter { /// Time in ms (from MonotonicMillis()) at which the filter was registered. const int64_t registration_time_; + /// the runtime filter wait time is ignored if _wait_infinitely is true + const bool _wait_infinitely; + const int32_t _rf_wait_time_ms; const bool _enable_pipeline_exec; @@ -475,6 +505,8 @@ class IRuntimeFilter { // only takes effect on the consumer std::unique_ptr<RuntimeProfile> _profile; bool _opt_remote_rf; + + std::vector<std::shared_ptr<pipeline::RuntimeFilterTimer>> _filter_timer; }; // avoid expose RuntimePredicateWrapper diff --git a/be/src/exprs/runtime_filter_rpc.cpp b/be/src/exprs/runtime_filter_rpc.cpp index 2220ca86060a2d..00540b8382c667 100644 --- a/be/src/exprs/runtime_filter_rpc.cpp +++ b/be/src/exprs/runtime_filter_rpc.cpp @@ -74,7 +74,7 @@ Status IRuntimeFilter::push_to_remote(RuntimeState* state, const TNetworkAddress _rpc_context->request.set_filter_id(_filter_id); _rpc_context->request.set_opt_remote_rf(opt_remote_rf); _rpc_context->request.set_is_pipeline(state->enable_pipeline_exec()); - _rpc_context->cntl.set_timeout_ms(state->runtime_filter_wait_time_ms()); + _rpc_context->cntl.set_timeout_ms(wait_time_ms()); _rpc_context->cid = _rpc_context->cntl.call_id(); Status serialize_status = serialize(&_rpc_context->request, &data, &len); diff --git a/be/src/gutil/endian.h b/be/src/gutil/endian.h index 66d849f73cd554..f1a9cf2a1a2da1 100644 --- a/be/src/gutil/endian.h +++ b/be/src/gutil/endian.h @@ -61,7 +61,8 @@ inline unsigned __int128 gbswap_128(unsigned __int128 host_int) { } inline wide::UInt256 gbswap_256(wide::UInt256 host_int) { - wide::UInt256 result{gbswap_64(host_int.items[0]), gbswap_64(host_int.items[1]), gbswap_64(host_int.items[2]), gbswap_64(host_int.items[3])}; + wide::UInt256 result{gbswap_64(host_int.items[3]), gbswap_64(host_int.items[2]), + gbswap_64(host_int.items[1]), gbswap_64(host_int.items[0])}; return result; } diff --git a/be/src/http/action/download_action.cpp b/be/src/http/action/download_action.cpp index f2068d83c7bbb1..f271b4f1916708 100644 --- a/be/src/http/action/download_action.cpp +++ b/be/src/http/action/download_action.cpp @@ -33,13 +33,20 @@ #include "runtime/exec_env.h" namespace doris { - -const std::string FILE_PARAMETER = "file"; -const std::string TOKEN_PARAMETER = "token"; - -DownloadAction::DownloadAction(ExecEnv* exec_env, const std::vector& allow_dirs, - int32_t num_workers) - : _exec_env(exec_env), _download_type(NORMAL), _num_workers(num_workers) { +namespace { +static const std::string FILE_PARAMETER = "file"; +static const std::string TOKEN_PARAMETER = "token"; +static const std::string CHANNEL_PARAMETER = "channel"; +static const std::string CHANNEL_INGEST_BINLOG_TYPE = "ingest_binlog"; +} // namespace + +DownloadAction::DownloadAction(ExecEnv* exec_env, + std::shared_ptr<bufferevent_rate_limit_group> rate_limit_group, + const std::vector<std::string>& allow_dirs, int32_t num_workers) + : _exec_env(exec_env), + _download_type(NORMAL), + _num_workers(num_workers), + _rate_limit_group(std::move(rate_limit_group)) { for (auto& dir : allow_dirs) { std::string p; Status st = io::global_local_filesystem()->canonicalize(dir, &p); @@ -107,7 +114,13 @@ void DownloadAction::handle_normal(HttpRequest* req, const std::string& file_par if (is_dir) { do_dir_response(file_param, req); } else { - do_file_response(file_param, req); + const auto& channel = req->param(CHANNEL_PARAMETER); + bool ingest_binlog = (channel == CHANNEL_INGEST_BINLOG_TYPE); + if (ingest_binlog) { + do_file_response(file_param, req, _rate_limit_group.get()); + } else { + do_file_response(file_param, req); + } } } diff --git a/be/src/http/action/download_action.h b/be/src/http/action/download_action.h index d8e468d95851aa..3aab1a0d31423d
100644 --- a/be/src/http/action/download_action.h +++ b/be/src/http/action/download_action.h @@ -24,6 +24,8 @@ #include "http/http_handler.h" #include "util/threadpool.h" +struct bufferevent_rate_limit_group; + namespace doris { class ExecEnv; @@ -36,8 +38,9 @@ class HttpRequest; // We use parameter named 'file' to specify the static resource path, it is an absolute path. class DownloadAction : public HttpHandler { public: - DownloadAction(ExecEnv* exec_env, const std::vector& allow_dirs, - int32_t num_workers = 0); + DownloadAction(ExecEnv* exec_env, + std::shared_ptr rate_limit_group, + const std::vector& allow_dirs, int32_t num_workers = 0); // for load error DownloadAction(ExecEnv* exec_env, const std::string& error_log_root_dir); @@ -67,6 +70,8 @@ class DownloadAction : public HttpHandler { std::string _error_log_root_dir; int32_t _num_workers; std::unique_ptr _download_workers; + + std::shared_ptr _rate_limit_group {nullptr}; }; // end class DownloadAction } // end namespace doris diff --git a/be/src/http/action/download_binlog_action.cpp b/be/src/http/action/download_binlog_action.cpp index a23d5ec109f907..697512b2a301ee 100644 --- a/be/src/http/action/download_binlog_action.cpp +++ b/be/src/http/action/download_binlog_action.cpp @@ -21,8 +21,10 @@ #include #include +#include #include #include +#include #include #include "common/config.h" @@ -96,7 +98,7 @@ void handle_get_binlog_info(HttpRequest* req) { } /// handle get segment file, need tablet_id, rowset_id && index -void handle_get_segment_file(HttpRequest* req) { +void handle_get_segment_file(HttpRequest* req, bufferevent_rate_limit_group* rate_limit_group) { // Step 1: get download file path std::string segment_file_path; try { @@ -125,7 +127,7 @@ void handle_get_segment_file(HttpRequest* req) { LOG(WARNING) << "file not exist, file path: " << segment_file_path; return; } - do_file_response(segment_file_path, req); + do_file_response(segment_file_path, req, rate_limit_group); } void handle_get_rowset_meta(HttpRequest* req) { @@ -149,7 +151,9 @@ void handle_get_rowset_meta(HttpRequest* req) { } // namespace -DownloadBinlogAction::DownloadBinlogAction(ExecEnv* exec_env) : _exec_env(exec_env) {} +DownloadBinlogAction::DownloadBinlogAction( + ExecEnv* exec_env, std::shared_ptr rate_limit_group) + : _exec_env(exec_env), _rate_limit_group(std::move(rate_limit_group)) {} void DownloadBinlogAction::handle(HttpRequest* req) { VLOG_CRITICAL << "accept one download binlog request " << req->debug_string(); @@ -178,7 +182,7 @@ void DownloadBinlogAction::handle(HttpRequest* req) { if (method == "get_binlog_info") { handle_get_binlog_info(req); } else if (method == "get_segment_file") { - handle_get_segment_file(req); + handle_get_segment_file(req, _rate_limit_group.get()); } else if (method == "get_rowset_meta") { handle_get_rowset_meta(req); } else { diff --git a/be/src/http/action/download_binlog_action.h b/be/src/http/action/download_binlog_action.h index 3cbd9b9e5b0fb9..77a2ed0878059a 100644 --- a/be/src/http/action/download_binlog_action.h +++ b/be/src/http/action/download_binlog_action.h @@ -17,12 +17,15 @@ #pragma once +#include #include #include #include "common/status.h" #include "http/http_handler.h" +struct bufferevent_rate_limit_group; + namespace doris { class ExecEnv; @@ -30,7 +33,8 @@ class HttpRequest; class DownloadBinlogAction : public HttpHandler { public: - DownloadBinlogAction(ExecEnv* exec_env); + DownloadBinlogAction(ExecEnv* exec_env, + std::shared_ptr rate_limit_group); virtual ~DownloadBinlogAction() = default; void 
handle(HttpRequest* req) override; @@ -40,6 +44,7 @@ class DownloadBinlogAction : public HttpHandler { private: ExecEnv* _exec_env; + std::shared_ptr _rate_limit_group {nullptr}; }; } // namespace doris diff --git a/be/src/http/action/http_stream.cpp b/be/src/http/action/http_stream.cpp index 067f8c5d28db41..d81f42824c2bbd 100644 --- a/be/src/http/action/http_stream.cpp +++ b/be/src/http/action/http_stream.cpp @@ -167,7 +167,8 @@ int HttpStreamAction::on_header(HttpRequest* req) { ctx->load_type = TLoadType::MANUL_LOAD; ctx->load_src_type = TLoadSourceType::RAW; - ctx->group_commit = iequal(req->header(HTTP_GROUP_COMMIT), "true"); + ctx->group_commit = iequal(req->header(HTTP_GROUP_COMMIT), "true") || + config::wait_internal_group_commit_finish; ctx->two_phase_commit = req->header(HTTP_TWO_PHASE_COMMIT) == "true" ? true : false; diff --git a/be/src/http/action/pad_rowset_action.cpp b/be/src/http/action/pad_rowset_action.cpp index 95164f53fa5a1c..52604f23ff4fc1 100644 --- a/be/src/http/action/pad_rowset_action.cpp +++ b/be/src/http/action/pad_rowset_action.cpp @@ -99,14 +99,13 @@ Status PadRowsetAction::_pad_rowset(TabletSharedPtr tablet, const Version& versi return Status::InternalError("Input version {} exists", version.to_string()); } - std::unique_ptr writer; RowsetWriterContext ctx; ctx.version = version; ctx.rowset_state = VISIBLE; ctx.segments_overlap = NONOVERLAPPING; ctx.tablet_schema = tablet->tablet_schema(); ctx.newest_write_timestamp = UnixSeconds(); - RETURN_IF_ERROR(tablet->create_rowset_writer(ctx, &writer)); + auto writer = DORIS_TRY(tablet->create_rowset_writer(ctx, false)); RowsetSharedPtr rowset; RETURN_IF_ERROR(writer->build(rowset)); rowset->make_visible(version); diff --git a/be/src/http/action/stream_load.cpp b/be/src/http/action/stream_load.cpp index 99a98afbc37b8d..896a1f27755c32 100644 --- a/be/src/http/action/stream_load.cpp +++ b/be/src/http/action/stream_load.cpp @@ -76,6 +76,9 @@ DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(streaming_load_requests_total, MetricUnit:: DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(streaming_load_duration_ms, MetricUnit::MILLISECONDS); DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(streaming_load_current_processing, MetricUnit::REQUESTS); +static constexpr size_t MIN_CHUNK_SIZE = 64 * 1024; +static const string CHUNK = "chunked"; + #ifdef BE_TEST TStreamLoadPutResult k_stream_load_put_result; #endif @@ -185,8 +188,9 @@ int StreamLoadAction::on_header(HttpRequest* req) { url_decode(req->param(HTTP_TABLE_KEY), &ctx->table); ctx->label = req->header(HTTP_LABEL_KEY); Status st = Status::OK(); - if (iequal(req->header(HTTP_GROUP_COMMIT), "true")) { - if (!ctx->label.empty()) { + if (iequal(req->header(HTTP_GROUP_COMMIT), "true") || + config::wait_internal_group_commit_finish) { + if (iequal(req->header(HTTP_GROUP_COMMIT), "true") && !ctx->label.empty()) { st = Status::InternalError("label and group_commit can't be set at the same time"); } ctx->group_commit = true; @@ -292,6 +296,12 @@ Status StreamLoadAction::_on_header(HttpRequest* http_req, std::shared_ptrheader(HttpHeaders::TRANSFER_ENCODING).empty()) { + if (http_req->header(HttpHeaders::TRANSFER_ENCODING).find(CHUNK) != std::string::npos) { + ctx->is_chunked_transfer = true; + } + } + if (!http_req->header(HTTP_TIMEOUT).empty()) { try { ctx->timeout_second = std::stoi(http_req->header(HTTP_TIMEOUT)); @@ -369,9 +379,15 @@ Status StreamLoadAction::_process_put(HttpRequest* http_req, request.__set_header_type(ctx->header_type); request.__set_loadId(ctx->id.to_thrift()); if (ctx->use_streaming) { - auto pipe = 
std::make_shared( - io::kMaxPipeBufferedBytes /* max_buffered_bytes */, 64 * 1024 /* min_chunk_size */, - ctx->body_bytes /* total_length */); + std::shared_ptr pipe; + if (ctx->is_chunked_transfer) { + pipe = std::make_shared( + io::kMaxPipeBufferedBytes /* max_buffered_bytes */); + } else { + pipe = std::make_shared( + io::kMaxPipeBufferedBytes /* max_buffered_bytes */, + MIN_CHUNK_SIZE /* min_chunk_size */, ctx->body_bytes /* total_length */); + } request.fileType = TFileType::FILE_STREAM; ctx->body_sink = pipe; ctx->pipe = pipe; diff --git a/be/src/http/ev_http_server.cpp b/be/src/http/ev_http_server.cpp index 5d231cb9087ecd..1bbd2c0e178fd2 100644 --- a/be/src/http/ev_http_server.cpp +++ b/be/src/http/ev_http_server.cpp @@ -84,7 +84,17 @@ static int on_connection(struct evhttp_request* req, void* param) { EvHttpServer::EvHttpServer(int port, int num_workers) : _port(port), _num_workers(num_workers), _real_port(0) { _host = BackendOptions::get_service_bind_address(); + + evthread_use_pthreads(); DCHECK_GT(_num_workers, 0); + _event_bases.resize(_num_workers); + for (int i = 0; i < _num_workers; ++i) { + std::shared_ptr base(event_base_new(), + [](event_base* base) { event_base_free(base); }); + CHECK(base != nullptr) << "Couldn't create an event_base."; + std::lock_guard lock(_event_bases_lock); + _event_bases[i] = base; + } } EvHttpServer::EvHttpServer(const std::string& host, int port, int num_workers) @@ -107,34 +117,28 @@ void EvHttpServer::start() { .set_min_threads(_num_workers) .set_max_threads(_num_workers) .build(&_workers)); - - evthread_use_pthreads(); - _event_bases.resize(_num_workers); for (int i = 0; i < _num_workers; ++i) { - CHECK(_workers->submit_func([this, i]() { - std::shared_ptr base(event_base_new(), [](event_base* base) { - event_base_free(base); - }); - CHECK(base != nullptr) << "Couldn't create an event_base."; - { - std::lock_guard lock(_event_bases_lock); - _event_bases[i] = base; - } - - /* Create a new evhttp object to handle requests. */ - std::shared_ptr http(evhttp_new(base.get()), - [](evhttp* http) { evhttp_free(http); }); - CHECK(http != nullptr) << "Couldn't create an evhttp."; - - auto res = evhttp_accept_socket(http.get(), _server_fd); - CHECK(res >= 0) << "evhttp accept socket failed, res=" << res; - - evhttp_set_newreqcb(http.get(), on_connection, this); - evhttp_set_gencb(http.get(), on_request, this); - - event_base_dispatch(base.get()); - }) - .ok()); + auto status = _workers->submit_func([this, i]() { + std::shared_ptr base; + { + std::lock_guard lock(_event_bases_lock); + base = _event_bases[i]; + } + + /* Create a new evhttp object to handle requests. 
*/ + std::shared_ptr http(evhttp_new(base.get()), + [](evhttp* http) { evhttp_free(http); }); + CHECK(http != nullptr) << "Couldn't create an evhttp."; + + auto res = evhttp_accept_socket(http.get(), _server_fd); + CHECK(res >= 0) << "evhttp accept socket failed, res=" << res; + + evhttp_set_newreqcb(http.get(), on_connection, this); + evhttp_set_gencb(http.get(), on_request, this); + + event_base_dispatch(base.get()); + }); + CHECK(status.ok()); } } diff --git a/be/src/http/ev_http_server.h b/be/src/http/ev_http_server.h index e7ad1c052ab16a..d74a8cb4efd283 100644 --- a/be/src/http/ev_http_server.h +++ b/be/src/http/ev_http_server.h @@ -55,6 +55,11 @@ class EvHttpServer { // get real port int get_real_port() const { return _real_port; } + std::vector> get_event_bases() { + std::lock_guard lock(_event_bases_lock); + return _event_bases; + } + private: Status _bind(); HttpHandler* _find_handler(HttpRequest* req); diff --git a/be/src/http/http_channel.cpp b/be/src/http/http_channel.cpp index 5727ba3902efec..96679195316dac 100644 --- a/be/src/http/http_channel.cpp +++ b/be/src/http/http_channel.cpp @@ -18,6 +18,7 @@ #include "http/http_channel.h" #include +#include #include #include @@ -69,11 +70,17 @@ void HttpChannel::send_reply(HttpRequest* request, HttpStatus status, const std: evbuffer_free(evb); } -void HttpChannel::send_file(HttpRequest* request, int fd, size_t off, size_t size) { +void HttpChannel::send_file(HttpRequest* request, int fd, size_t off, size_t size, + bufferevent_rate_limit_group* rate_limit_group) { auto evb = evbuffer_new(); evbuffer_add_file(evb, fd, off, size); - evhttp_send_reply(request->get_evhttp_request(), HttpStatus::OK, - default_reason(HttpStatus::OK).c_str(), evb); + auto* evhttp_request = request->get_evhttp_request(); + if (rate_limit_group) { + auto* evhttp_connection = evhttp_request_get_connection(evhttp_request); + auto* buffer_event = evhttp_connection_get_bufferevent(evhttp_connection); + bufferevent_add_to_rate_limit_group(buffer_event, rate_limit_group); + } + evhttp_send_reply(evhttp_request, HttpStatus::OK, default_reason(HttpStatus::OK).c_str(), evb); evbuffer_free(evb); } diff --git a/be/src/http/http_channel.h b/be/src/http/http_channel.h index 478f013af82079..ee1e6c0888f1d3 100644 --- a/be/src/http/http_channel.h +++ b/be/src/http/http_channel.h @@ -23,6 +23,7 @@ #include "http/http_status.h" +struct bufferevent_rate_limit_group; namespace doris { class HttpRequest; @@ -43,7 +44,8 @@ class HttpChannel { static void send_reply(HttpRequest* request, HttpStatus status, const std::string& content); - static void send_file(HttpRequest* request, int fd, size_t off, size_t size); + static void send_file(HttpRequest* request, int fd, size_t off, size_t size, + bufferevent_rate_limit_group* rate_limit_group = nullptr); static bool compress_content(const std::string& accept_encoding, const std::string& input, std::string* output); diff --git a/be/src/http/http_handler_with_auth.cpp b/be/src/http/http_handler_with_auth.cpp index 9d93b823c6b0e2..6a4b28beb279d4 100644 --- a/be/src/http/http_handler_with_auth.cpp +++ b/be/src/http/http_handler_with_auth.cpp @@ -64,13 +64,16 @@ int HttpHandlerWithAuth::on_header(HttpRequest* req) { #ifndef BE_TEST TNetworkAddress master_addr = _exec_env->master_info()->network_address; - RETURN_WITH_WARN_IF_ERROR( - ThriftRpcHelper::rpc( - master_addr.hostname, master_addr.port, - [&auth_result, &auth_request](FrontendServiceConnection& client) { - client->checkAuth(auth_result, auth_request); - }), - -1, "checkAuth 
failed"); + { + auto status = ThriftRpcHelper::rpc( + master_addr.hostname, master_addr.port, + [&auth_result, &auth_request](FrontendServiceConnection& client) { + client->checkAuth(auth_result, auth_request); + }); + if (!status) { + return -1; + } + } #else CHECK(_exec_env == nullptr); #endif diff --git a/be/src/http/utils.cpp b/be/src/http/utils.cpp index f55b5d47696c11..31550456c552ea 100644 --- a/be/src/http/utils.cpp +++ b/be/src/http/utils.cpp @@ -124,7 +124,8 @@ std::string get_content_type(const std::string& file_name) { return ""; } -void do_file_response(const std::string& file_path, HttpRequest* req) { +void do_file_response(const std::string& file_path, HttpRequest* req, + bufferevent_rate_limit_group* rate_limit_group) { if (file_path.find("..") != std::string::npos) { LOG(WARNING) << "Not allowed to read relative path: " << file_path; HttpChannel::send_error(req, HttpStatus::FORBIDDEN); @@ -165,7 +166,7 @@ void do_file_response(const std::string& file_path, HttpRequest* req) { return; } - HttpChannel::send_file(req, fd, 0, file_size); + HttpChannel::send_file(req, fd, 0, file_size, rate_limit_group); } void do_dir_response(const std::string& dir_path, HttpRequest* req) { diff --git a/be/src/http/utils.h b/be/src/http/utils.h index 5928039c492b2d..2d1e13fbe4e78e 100644 --- a/be/src/http/utils.h +++ b/be/src/http/utils.h @@ -22,6 +22,8 @@ #include "common/utils.h" #include "http/http_request.h" +struct bufferevent_rate_limit_group; + namespace doris { struct AuthInfo; @@ -34,7 +36,8 @@ bool parse_basic_auth(const HttpRequest& req, std::string* user, std::string* pa bool parse_basic_auth(const HttpRequest& req, AuthInfo* auth); -void do_file_response(const std::string& dir_path, HttpRequest* req); +void do_file_response(const std::string& dir_path, HttpRequest* req, + bufferevent_rate_limit_group* rate_limit_group = nullptr); void do_dir_response(const std::string& dir_path, HttpRequest* req); diff --git a/be/src/index-tools/index_tool.cpp b/be/src/index-tools/index_tool.cpp index 2fd51feb63a9ad..c356449175994f 100644 --- a/be/src/index-tools/index_tool.cpp +++ b/be/src/index-tools/index_tool.cpp @@ -19,13 +19,16 @@ #include #include +#include #include #include +#include #include #include #include #include "io/fs/local_file_system.h" +#include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h" #include "olap/rowset/segment_v2/inverted_index_compound_directory.h" #include "olap/rowset/segment_v2/inverted_index_compound_reader.h" @@ -60,6 +63,16 @@ std::string get_usage(const std::string& progname) { return ss.str(); } +std::vector split(const std::string& s, char delimiter) { + std::vector tokens; + std::string token; + std::istringstream tokenStream(s); + while (getline(tokenStream, token, delimiter)) { + tokens.push_back(token); + } + return tokens; +} + void search(lucene::store::Directory* dir, std::string& field, std::string& token, std::string& pred) { IndexReader* reader = IndexReader::open(dir); @@ -72,36 +85,59 @@ void search(lucene::store::Directory* dir, std::string& field, std::string& toke IndexSearcher s(reader); std::unique_ptr query; + std::cout << "version: " << (int32_t)(reader->getIndexVersion()) << std::endl; + std::wstring field_ws(field.begin(), field.end()); - std::wstring token_ws(token.begin(), token.end()); - lucene::index::Term* term = _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str()); - if (pred == "eq" || pred == "match") { - query.reset(new lucene::search::TermQuery(term)); - } else if (pred == "lt") { - query.reset(new 
lucene::search::RangeQuery(nullptr, term, false)); - } else if (pred == "gt") { - query.reset(new lucene::search::RangeQuery(term, nullptr, false)); - } else if (pred == "le") { - query.reset(new lucene::search::RangeQuery(nullptr, term, true)); - } else if (pred == "ge") { - query.reset(new lucene::search::RangeQuery(term, nullptr, true)); + if (pred == "match_all") { + } else if (pred == "match_phrase") { + std::vector terms = split(token, '|'); + auto* phrase_query = new lucene::search::PhraseQuery(); + for (auto& term : terms) { + std::wstring term_ws = StringUtil::string_to_wstring(term); + auto* t = _CLNEW lucene::index::Term(field_ws.c_str(), term_ws.c_str()); + phrase_query->add(t); + _CLDECDELETE(t); + } + query.reset(phrase_query); } else { - std::cout << "invalid predicate type:" << pred << std::endl; - exit(-1); + std::wstring token_ws(token.begin(), token.end()); + lucene::index::Term* term = _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str()); + if (pred == "eq" || pred == "match") { + query.reset(new lucene::search::TermQuery(term)); + } else if (pred == "lt") { + query.reset(new lucene::search::RangeQuery(nullptr, term, false)); + } else if (pred == "gt") { + query.reset(new lucene::search::RangeQuery(term, nullptr, false)); + } else if (pred == "le") { + query.reset(new lucene::search::RangeQuery(nullptr, term, true)); + } else if (pred == "ge") { + query.reset(new lucene::search::RangeQuery(term, nullptr, true)); + } else { + std::cout << "invalid predicate type:" << pred << std::endl; + exit(-1); + } + _CLDECDELETE(term); } - _CLDECDELETE(term); - std::vector result; - int total = 0; - - s._search(query.get(), [&result, &total](const int32_t docid, const float_t /*score*/) { - // docid equal to rowid in segment - result.push_back(docid); - if (FLAGS_print_row_id) { - printf("RowID is %d\n", docid); - } - total += 1; - }); + int32_t total = 0; + if (pred == "match_all") { + roaring::Roaring result; + std::vector terms = split(token, '|'); + doris::ConjunctionQuery query(s.getReader()); + query.add(field_ws, terms); + query.search(result); + total += result.cardinality(); + } else { + roaring::Roaring result; + s._search(query.get(), [&result](const int32_t docid, const float_t /*score*/) { + // docid equal to rowid in segment + result.add(docid); + if (FLAGS_print_row_id) { + printf("RowID is %d\n", docid); + } + }); + total += result.cardinality(); + } std::cout << "Term queried count:" << total << std::endl; s.close(); diff --git a/be/src/io/cache/block/block_lru_file_cache.cpp b/be/src/io/cache/block/block_lru_file_cache.cpp index 0578c82e34e55e..9d81e5c539f458 100644 --- a/be/src/io/cache/block/block_lru_file_cache.cpp +++ b/be/src/io/cache/block/block_lru_file_cache.cpp @@ -27,7 +27,6 @@ #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep // IWYU pragma: no_include #include // IWYU pragma: keep diff --git a/be/src/io/cache/block/cached_remote_file_reader.cpp b/be/src/io/cache/block/cached_remote_file_reader.cpp index fd19f88af631cf..bbd7516dfaaf01 100644 --- a/be/src/io/cache/block/cached_remote_file_reader.cpp +++ b/be/src/io/cache/block/cached_remote_file_reader.cpp @@ -26,7 +26,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "io/cache/block/block_file_cache.h" diff --git a/be/src/io/file_factory.cpp b/be/src/io/file_factory.cpp index 4648309fc57c03..63ddc3d43096be 100644 --- a/be/src/io/file_factory.cpp +++ 
b/be/src/io/file_factory.cpp @@ -175,7 +175,7 @@ Status FileFactory::create_pipe_reader(const TUniqueId& load_id, io::FileReaderS } else { pipe_id = runtime_state->fragment_instance_id(); } - *file_reader = multi_table_pipe->getPipe(pipe_id); + *file_reader = multi_table_pipe->get_pipe(pipe_id); LOG(INFO) << "create pipe reader for fragment instance: " << pipe_id << " pipe: " << (*file_reader).get(); diff --git a/be/src/io/fs/broker_file_reader.cpp b/be/src/io/fs/broker_file_reader.cpp index 73d79b703886fc..4d370cfb4d8387 100644 --- a/be/src/io/fs/broker_file_reader.cpp +++ b/be/src/io/fs/broker_file_reader.cpp @@ -28,26 +28,22 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/logging.h" #include "common/status.h" #include "io/fs/broker_file_system.h" #include "util/doris_metrics.h" -namespace doris { -namespace io { +namespace doris::io { struct IOContext; -BrokerFileReader::BrokerFileReader(const TNetworkAddress& broker_addr, const Path& path, - size_t file_size, TBrokerFD fd, - std::shared_ptr fs) - : _path(path), +BrokerFileReader::BrokerFileReader(const TNetworkAddress& broker_addr, Path path, size_t file_size, + TBrokerFD fd, std::shared_ptr fs) + : _path(std::move(path)), _file_size(file_size), _broker_addr(broker_addr), _fd(fd), _fs(std::move(fs)) { - static_cast(_fs->get_client(&_client)); DorisMetrics::instance()->broker_file_open_reading->increment(1); DorisMetrics::instance()->broker_file_reader_total->increment(1); } @@ -59,32 +55,7 @@ BrokerFileReader::~BrokerFileReader() { Status BrokerFileReader::close() { bool expected = false; if (_closed.compare_exchange_strong(expected, true, std::memory_order_acq_rel)) { - TBrokerCloseReaderRequest request; - request.__set_version(TBrokerVersion::VERSION_ONE); - request.__set_fd(_fd); - - TBrokerOperationStatus response; - try { - try { - (*_client)->closeReader(response, request); - } catch (apache::thrift::transport::TTransportException&) { - std::this_thread::sleep_for(std::chrono::seconds(1)); - RETURN_IF_ERROR((*_client).reopen()); - (*_client)->closeReader(response, request); - } - } catch (apache::thrift::TException& e) { - std::stringstream ss; - ss << "Close broker reader failed, broker:" << _broker_addr << " failed:" << e.what(); - return Status::RpcError(ss.str()); - } - - if (response.statusCode != TBrokerOperationStatusCode::OK) { - std::stringstream ss; - ss << "close broker reader failed, broker:" << _broker_addr - << " failed:" << response.message; - return Status::InternalError(ss.str()); - } - + RETURN_IF_ERROR(_fs->close_file(_fd)); DorisMetrics::instance()->broker_file_open_reading->increment(-1); } return Status::OK(); @@ -100,45 +71,12 @@ Status BrokerFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes return Status::OK(); } - TBrokerPReadRequest request; - request.__set_version(TBrokerVersion::VERSION_ONE); - request.__set_fd(_fd); - request.__set_offset(offset); - request.__set_length(bytes_req); - - TBrokerReadResponse response; - try { - VLOG_RPC << "send pread request to broker:" << _broker_addr << " position:" << offset - << ", read bytes length:" << bytes_req; - try { - (*_client)->pread(response, request); - } catch (apache::thrift::transport::TTransportException& e) { - std::this_thread::sleep_for(std::chrono::seconds(1)); - RETURN_IF_ERROR((*_client).reopen()); - LOG(INFO) << "retry reading from broker: " << _broker_addr << ". 
reason: " << e.what(); - (*_client)->pread(response, request); - } - } catch (apache::thrift::TException& e) { - std::stringstream ss; - ss << "Open broker reader failed, broker:" << _broker_addr << " failed:" << e.what(); - return Status::RpcError(ss.str()); - } - - if (response.opStatus.statusCode == TBrokerOperationStatusCode::END_OF_FILE) { - // read the end of broker's file - *bytes_read = 0; - return Status::OK(); - } else if (response.opStatus.statusCode != TBrokerOperationStatusCode::OK) { - std::stringstream ss; - ss << "Open broker reader failed, broker:" << _broker_addr - << " failed:" << response.opStatus.message; - return Status::InternalError(ss.str()); - } + std::string data; + RETURN_IF_ERROR(_fs->read_file(_fd, offset, bytes_req, &data)); - *bytes_read = response.data.size(); - memcpy(to, response.data.data(), *bytes_read); + *bytes_read = data.size(); + memcpy(to, data.data(), *bytes_read); return Status::OK(); } -} // namespace io -} // namespace doris +} // namespace doris::io diff --git a/be/src/io/fs/broker_file_reader.h b/be/src/io/fs/broker_file_reader.h index ff3d3d2273ee58..7acdcbcc0d578f 100644 --- a/be/src/io/fs/broker_file_reader.h +++ b/be/src/io/fs/broker_file_reader.h @@ -32,15 +32,14 @@ #include "runtime/client_cache.h" #include "util/slice.h" -namespace doris { -namespace io { +namespace doris::io { struct IOContext; class BrokerFileReader : public FileReader { public: - BrokerFileReader(const TNetworkAddress& broker_addr, const Path& path, size_t file_size, - TBrokerFD fd, std::shared_ptr fs); + BrokerFileReader(const TNetworkAddress& broker_addr, Path path, size_t file_size, TBrokerFD fd, + std::shared_ptr fs); ~BrokerFileReader() override; @@ -66,8 +65,6 @@ class BrokerFileReader : public FileReader { TBrokerFD _fd; std::shared_ptr _fs; - std::shared_ptr _client; std::atomic _closed = false; }; -} // namespace io -} // namespace doris +} // namespace doris::io diff --git a/be/src/io/fs/broker_file_system.cpp b/be/src/io/fs/broker_file_system.cpp index f35f9b0e69a956..863dc9c19e60dd 100644 --- a/be/src/io/fs/broker_file_system.cpp +++ b/be/src/io/fs/broker_file_system.cpp @@ -45,8 +45,7 @@ #include "runtime/exec_env.h" #include "util/slice.h" -namespace doris { -namespace io { +namespace doris::io { #ifdef BE_TEST inline BrokerServiceClientCache* client_cache() { @@ -70,8 +69,8 @@ inline const std::string& client_id(const TNetworkAddress& addr) { #ifndef CHECK_BROKER_CLIENT #define CHECK_BROKER_CLIENT(client) \ - if (!client) { \ - return Status::IOError("init Broker client error"); \ + if (!client || !client->is_alive()) { \ + return Status::IOError("connect to broker failed"); \ } #endif @@ -90,8 +89,8 @@ BrokerFileSystem::BrokerFileSystem(const TNetworkAddress& broker_addr, Status BrokerFileSystem::connect_impl() { Status status = Status::OK(); - _client.reset(new BrokerServiceConnection(client_cache(), _broker_addr, - config::thrift_rpc_timeout_ms, &status)); + _connection = std::make_unique(client_cache(), _broker_addr, + config::thrift_rpc_timeout_ms, &status); return status; } @@ -109,7 +108,7 @@ Status BrokerFileSystem::open_file_internal(const Path& file, FileReaderSPtr* re RETURN_IF_ERROR(file_size_impl(file, &fsize)); } - CHECK_BROKER_CLIENT(_client); + CHECK_BROKER_CLIENT(_connection); TBrokerOpenReaderRequest request; request.__set_version(TBrokerVersion::VERSION_ONE); request.__set_path(file); @@ -121,11 +120,11 @@ Status BrokerFileSystem::open_file_internal(const Path& file, FileReaderSPtr* re try { Status status; try { - 
(*_client)->openReader(*response, request); + (*_connection)->openReader(*response, request); } catch (apache::thrift::transport::TTransportException&) { std::this_thread::sleep_for(std::chrono::seconds(1)); - RETURN_IF_ERROR((*_client).reopen()); - (*_client)->openReader(*response, request); + RETURN_IF_ERROR((*_connection).reopen()); + (*_connection)->openReader(*response, request); } } catch (apache::thrift::TException& e) { return Status::RpcError("failed to open file {}: {}", file.native(), error_msg(e.what())); @@ -146,7 +145,7 @@ Status BrokerFileSystem::create_directory_impl(const Path& /*path*/, bool /*fail } Status BrokerFileSystem::delete_file_impl(const Path& file) { - CHECK_BROKER_CLIENT(_client); + CHECK_BROKER_CLIENT(_connection); try { // rm file from remote path TBrokerDeletePathRequest del_req; @@ -156,10 +155,10 @@ Status BrokerFileSystem::delete_file_impl(const Path& file) { del_req.__set_properties(_broker_prop); try { - (*_client)->deletePath(del_rep, del_req); + (*_connection)->deletePath(del_rep, del_req); } catch (apache::thrift::transport::TTransportException&) { - RETURN_IF_ERROR((*_client).reopen()); - (*_client)->deletePath(del_rep, del_req); + RETURN_IF_ERROR((*_connection).reopen()); + (*_connection)->deletePath(del_rep, del_req); } if (del_rep.statusCode == TBrokerOperationStatusCode::OK) { @@ -186,7 +185,7 @@ Status BrokerFileSystem::batch_delete_impl(const std::vector& files) { } Status BrokerFileSystem::exists_impl(const Path& path, bool* res) const { - CHECK_BROKER_CLIENT(_client); + CHECK_BROKER_CLIENT(_connection); *res = false; try { TBrokerCheckPathExistRequest check_req; @@ -196,10 +195,10 @@ Status BrokerFileSystem::exists_impl(const Path& path, bool* res) const { check_req.__set_properties(_broker_prop); try { - (*_client)->checkPathExist(check_rep, check_req); + (*_connection)->checkPathExist(check_rep, check_req); } catch (apache::thrift::transport::TTransportException&) { - RETURN_IF_ERROR((*_client).reopen()); - (*_client)->checkPathExist(check_rep, check_req); + RETURN_IF_ERROR((*_connection).reopen()); + (*_connection)->checkPathExist(check_rep, check_req); } if (check_rep.opStatus.statusCode != TBrokerOperationStatusCode::OK) { @@ -219,7 +218,7 @@ Status BrokerFileSystem::exists_impl(const Path& path, bool* res) const { } Status BrokerFileSystem::file_size_impl(const Path& path, int64_t* file_size) const { - CHECK_BROKER_CLIENT(_client); + CHECK_BROKER_CLIENT(_connection); try { TBrokerFileSizeRequest req; req.__set_version(TBrokerVersion::VERSION_ONE); @@ -228,10 +227,10 @@ Status BrokerFileSystem::file_size_impl(const Path& path, int64_t* file_size) co TBrokerFileSizeResponse resp; try { - (*_client)->fileSize(resp, req); + (*_connection)->fileSize(resp, req); } catch (apache::thrift::transport::TTransportException&) { - RETURN_IF_ERROR((*_client).reopen()); - (*_client)->fileSize(resp, req); + RETURN_IF_ERROR((*_connection).reopen()); + (*_connection)->fileSize(resp, req); } if (resp.opStatus.statusCode != TBrokerOperationStatusCode::OK) { @@ -256,7 +255,7 @@ Status BrokerFileSystem::list_impl(const Path& dir, bool only_file, std::vector< if (!(*exists)) { return Status::OK(); } - CHECK_BROKER_CLIENT(_client); + CHECK_BROKER_CLIENT(_connection); Status status = Status::OK(); try { // get existing files from remote path @@ -269,10 +268,10 @@ Status BrokerFileSystem::list_impl(const Path& dir, bool only_file, std::vector< list_req.__set_fileNameOnly(true); // we only need file name, not abs path try { - (*_client)->listPath(list_rep, 
list_req); + (*_connection)->listPath(list_rep, list_req); } catch (apache::thrift::transport::TTransportException&) { - RETURN_IF_ERROR((*_client).reopen()); - (*_client)->listPath(list_rep, list_req); + RETURN_IF_ERROR((*_connection).reopen()); + (*_connection)->listPath(list_rep, list_req); } if (list_rep.opStatus.statusCode == TBrokerOperationStatusCode::FILE_NOT_FOUND) { @@ -309,7 +308,7 @@ Status BrokerFileSystem::list_impl(const Path& dir, bool only_file, std::vector< } Status BrokerFileSystem::rename_impl(const Path& orig_name, const Path& new_name) { - CHECK_BROKER_CLIENT(_client); + CHECK_BROKER_CLIENT(_connection); try { TBrokerOperationStatus op_status; TBrokerRenamePathRequest rename_req; @@ -319,10 +318,10 @@ Status BrokerFileSystem::rename_impl(const Path& orig_name, const Path& new_name rename_req.__set_properties(_broker_prop); try { - (*_client)->renamePath(op_status, rename_req); + (*_connection)->renamePath(op_status, rename_req); } catch (apache::thrift::transport::TTransportException&) { - RETURN_IF_ERROR((*_client).reopen()); - (*_client)->renamePath(op_status, rename_req); + RETURN_IF_ERROR((*_connection).reopen()); + (*_connection)->renamePath(op_status, rename_req); } if (op_status.statusCode != TBrokerOperationStatusCode::OK) { @@ -465,9 +464,76 @@ Status BrokerFileSystem::direct_download_impl(const Path& remote_file, std::stri return Status::OK(); } -Status BrokerFileSystem::get_client(std::shared_ptr* client) const { - CHECK_BROKER_CLIENT(_client); - *client = _client; +Status BrokerFileSystem::read_file(const TBrokerFD& fd, size_t offset, size_t bytes_req, + std::string* data) const { + if (data == nullptr) { + return Status::InvalidArgument("data should be not null"); + } + CHECK_BROKER_CLIENT(_connection); + TBrokerPReadRequest request; + request.__set_version(TBrokerVersion::VERSION_ONE); + request.__set_fd(fd); + request.__set_offset(offset); + request.__set_length(bytes_req); + + TBrokerReadResponse response; + try { + VLOG_RPC << "send pread request to broker:" << _broker_addr << " position:" << offset + << ", read bytes length:" << bytes_req; + try { + (*_connection)->pread(response, request); + } catch (apache::thrift::transport::TTransportException& e) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + RETURN_IF_ERROR((*_connection).reopen()); + LOG(INFO) << "retry reading from broker: " << _broker_addr << ". 
reason: " << e.what(); + (*_connection)->pread(response, request); + } + } catch (apache::thrift::TException& e) { + std::stringstream ss; + ss << "read broker file failed, broker:" << _broker_addr << " failed:" << e.what(); + return Status::RpcError(ss.str()); + } + + if (response.opStatus.statusCode == TBrokerOperationStatusCode::END_OF_FILE) { + // reached the end of the broker's file + return Status::OK(); + } + if (response.opStatus.statusCode != TBrokerOperationStatusCode::OK) { + std::stringstream ss; + ss << "read broker file failed, broker:" << _broker_addr + << " failed:" << response.opStatus.message; + return Status::InternalError(ss.str()); + } + *data = std::move(response.data); + return Status::OK(); +} + +Status BrokerFileSystem::close_file(const TBrokerFD& fd) const { + CHECK_BROKER_CLIENT(_connection); + TBrokerCloseReaderRequest request; + request.__set_version(TBrokerVersion::VERSION_ONE); + request.__set_fd(fd); + + TBrokerOperationStatus response; + try { + try { + (*_connection)->closeReader(response, request); + } catch (apache::thrift::transport::TTransportException&) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + RETURN_IF_ERROR((*_connection).reopen()); + (*_connection)->closeReader(response, request); + } + } catch (apache::thrift::TException& e) { + std::stringstream ss; + ss << "close broker file failed, broker:" << _broker_addr << " failed:" << e.what(); + return Status::RpcError(ss.str()); + } + + if (response.statusCode != TBrokerOperationStatusCode::OK) { + std::stringstream ss; + ss << "close broker file failed, broker:" << _broker_addr << " failed:" << response.message; + return Status::InternalError(ss.str()); + } return Status::OK(); } @@ -475,5 +541,4 @@ std::string BrokerFileSystem::error_msg(const std::string& err) const { return fmt::format("({}:{}), {}", _broker_addr.hostname, _broker_addr.port, err); } -} // namespace io -} // namespace doris +} // namespace doris::io diff --git a/be/src/io/fs/broker_file_system.h b/be/src/io/fs/broker_file_system.h index 1da18cb29e9a76..4c4b9604e8df5d 100644 --- a/be/src/io/fs/broker_file_system.h +++ b/be/src/io/fs/broker_file_system.h @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -43,7 +44,9 @@ class BrokerFileSystem final : public RemoteFileSystem { ~BrokerFileSystem() override = default; - Status get_client(std::shared_ptr* client) const; + Status read_file(const TBrokerFD& fd, size_t offset, size_t bytes_req, std::string* data) const; + + Status close_file(const TBrokerFD& fd) const; protected: Status connect_impl() override; @@ -79,7 +82,7 @@ class BrokerFileSystem final : public RemoteFileSystem { const TNetworkAddress& _broker_addr; const std::map& _broker_prop; - std::shared_ptr _client; + std::unique_ptr<BrokerServiceConnection> _connection; }; } // namespace io } // namespace doris diff --git a/be/src/io/fs/buffered_reader.cpp b/be/src/io/fs/buffered_reader.cpp index 0c4496fd3ab001..caa80616d35303 100644 --- a/be/src/io/fs/buffered_reader.cpp +++ b/be/src/io/fs/buffered_reader.cpp @@ -24,7 +24,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/status.h" @@ -152,7 +151,6 @@ Status MergeRangeFileReader::read_at_impl(size_t offset, Slice result, size_t* b } content_size = 0; hollow_size = 0; - double amplified_ratio = config::max_amplified_read_ratio; std::vector<std::pair<double, size_t>> ratio_and_size; // Calculate the read amplified ratio for each merge operation and the size of the merged data. // Find the largest size of the merged data whose amplified ratio is less than config::max_amplified_read_ratio
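A worked sketch of the selection the next hunk performs, with hypothetical numbers: candidate i merges ranges 0..i into one IO, so its "equivalent size" is the merged size divided by i + 1 requests.

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// Pick the largest merged size whose amplification ratio stays within the
// cap, or whose per-IO equivalent size is still small when the cap is strict.
size_t pick_merged_size(const std::vector<std::pair<double, size_t>>& ratio_and_size,
                        double max_amplified_ratio, size_t equivalent_io_size) {
    size_t best_merged_size = 0;
    for (size_t i = 0; i < ratio_and_size.size(); ++i) {
        const auto& [ratio, merged_size] = ratio_and_size[i];
        size_t equivalent_size = merged_size / (i + 1); // bytes per merged request
        if (merged_size > best_merged_size &&
            (ratio <= max_amplified_ratio ||
             (max_amplified_ratio < 1 && equivalent_size <= equivalent_io_size))) {
            best_merged_size = merged_size;
        }
    }
    return best_merged_size;
}

int main() {
    // Merging 2 ranges costs ratio 1.1 for 96KB; merging 3 costs ratio 2.0 for 640KB.
    std::vector<std::pair<double, size_t>> cands = {{1.1, 96 * 1024}, {2.0, 640 * 1024}};
    std::cout << pick_merged_size(cands, /*max ratio*/ 1.5, /*OSS IO*/ 512 * 1024)
              << '\n'; // 98304: only the first candidate stays under the cap
}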
@@ -168,9 +166,12 @@ Status MergeRangeFileReader::read_at_impl(size_t offset, Slice result, size_t* b } } size_t best_merged_size = 0; - for (const std::pair& rs : ratio_and_size) { + for (int i = 0; i < ratio_and_size.size(); ++i) { + const std::pair<double, size_t>& rs = ratio_and_size[i]; + size_t equivalent_size = rs.second / (i + 1); if (rs.second > best_merged_size) { - if (rs.first < amplified_ratio || rs.second <= MIN_READ_SIZE) { + if (rs.first <= _max_amplified_ratio || + (_max_amplified_ratio < 1 && equivalent_size <= _equivalent_io_size)) { best_merged_size = rs.second; } } diff --git a/be/src/io/fs/buffered_reader.h b/be/src/io/fs/buffered_reader.h index 586aa9546fec98..0bde47d79d9c82 100644 --- a/be/src/io/fs/buffered_reader.h +++ b/be/src/io/fs/buffered_reader.h @@ -130,8 +130,9 @@ class MergeRangeFileReader : public io::FileReader { static constexpr size_t READ_SLICE_SIZE = 8 * 1024 * 1024; // 8MB static constexpr size_t BOX_SIZE = 1 * 1024 * 1024; // 1MB static constexpr size_t SMALL_IO = 2 * 1024 * 1024; // 2MB + static constexpr size_t HDFS_MIN_IO_SIZE = 4 * 1024; // 4KB + static constexpr size_t OSS_MIN_IO_SIZE = 512 * 1024; // 512KB static constexpr size_t NUM_BOX = TOTAL_BUFFER_SIZE / BOX_SIZE; // 128 - static constexpr size_t MIN_READ_SIZE = 4096; // 4KB MergeRangeFileReader(RuntimeProfile* profile, io::FileReaderSPtr reader, const std::vector& random_access_ranges) @@ -141,6 +142,11 @@ ... _range_cached_data.resize(random_access_ranges.size()); _size = _reader->size(); _remaining = TOTAL_BUFFER_SIZE; + _is_oss = typeid_cast(_reader.get()) != nullptr; + _max_amplified_ratio = config::max_amplified_read_ratio; + // The minimum size of a single IO that can still reach the storage's peak throughput: + // 512KB for OSS, 4KB for HDFS + _equivalent_io_size = _is_oss ?
OSS_MIN_IO_SIZE : HDFS_MIN_IO_SIZE; if (_profile != nullptr) { const char* random_profile = "MergedSmallIO"; ADD_TIMER_WITH_LEVEL(_profile, random_profile, 1); @@ -237,6 +243,9 @@ class MergeRangeFileReader : public io::FileReader { int16 _last_box_ref = -1; uint32 _last_box_usage = 0; std::vector _box_ref; + bool _is_oss; + double _max_amplified_ratio; + size_t _equivalent_io_size; Statistics _statistics; }; diff --git a/be/src/io/fs/file_handle_cache.cpp b/be/src/io/fs/file_handle_cache.cpp index 815be0f99bfd38..ab48125a7804d5 100644 --- a/be/src/io/fs/file_handle_cache.cpp +++ b/be/src/io/fs/file_handle_cache.cpp @@ -21,6 +21,7 @@ #include "io/fs/file_handle_cache.h" +#include #include #include "io/fs/err_utils.h" diff --git a/be/src/io/fs/file_system.h b/be/src/io/fs/file_system.h index d58af4b9724685..54fd61e497bac2 100644 --- a/be/src/io/fs/file_system.h +++ b/be/src/io/fs/file_system.h @@ -33,19 +33,20 @@ namespace doris { namespace io { #ifndef FILESYSTEM_M -#define FILESYSTEM_M(stmt) \ - do { \ - Status _s; \ - if (bthread_self() == 0) { \ - _s = (stmt); \ - } else { \ - auto task = [&] { _s = (stmt); }; \ - AsyncIO::run_task(task, _type); \ - } \ - if (!_s) { \ - LOG(WARNING) << _s; \ - } \ - return _s; \ +#define FILESYSTEM_M(stmt) \ + do { \ + Status _s; \ + if (bthread_self() == 0) { \ + _s = (stmt); \ + } else { \ + auto task = [&] { _s = (stmt); }; \ + AsyncIO::run_task(task, _type); \ + } \ + if (!_s) { \ + LOG(WARNING) << _s; \ + _s = Status::Error(_s.code(), _s.msg()); \ + } \ + return _s; \ } while (0); #endif diff --git a/be/src/io/fs/hdfs_file_reader.cpp b/be/src/io/fs/hdfs_file_reader.cpp index 0ea9c43015dfc3..f37b8551a75d17 100644 --- a/be/src/io/fs/hdfs_file_reader.cpp +++ b/be/src/io/fs/hdfs_file_reader.cpp @@ -24,7 +24,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/logging.h" #include "io/fs/err_utils.h" diff --git a/be/src/io/fs/local_file_reader.cpp b/be/src/io/fs/local_file_reader.cpp index 587a6cb9b59dbb..bca5d815a041a1 100644 --- a/be/src/io/fs/local_file_reader.cpp +++ b/be/src/io/fs/local_file_reader.cpp @@ -30,7 +30,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "io/fs/err_utils.h" #include "util/async_io.h" diff --git a/be/src/io/fs/local_file_writer.cpp b/be/src/io/fs/local_file_writer.cpp index 9ebdfebe36142e..af913d60a985c0 100644 --- a/be/src/io/fs/local_file_writer.cpp +++ b/be/src/io/fs/local_file_writer.cpp @@ -32,7 +32,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "gutil/macros.h" diff --git a/be/src/io/fs/multi_table_pipe.cpp b/be/src/io/fs/multi_table_pipe.cpp index 1036f7a30df926..a11d6412df26e0 100644 --- a/be/src/io/fs/multi_table_pipe.cpp +++ b/be/src/io/fs/multi_table_pipe.cpp @@ -215,12 +215,12 @@ Status MultiTablePipe::exec_plans(ExecEnv* exec_env, std::vector para if constexpr (std::is_same_v) { RETURN_IF_ERROR( - putPipe(plan.params.fragment_instance_id, _planned_pipes[plan.table_name])); + put_pipe(plan.params.fragment_instance_id, _planned_pipes[plan.table_name])); LOG(INFO) << "fragment_instance_id=" << print_id(plan.params.fragment_instance_id) << " table=" << plan.table_name; } else if constexpr (std::is_same_v) { auto pipe_id = calculate_pipe_id(plan.query_id, plan.fragment_id); - RETURN_IF_ERROR(putPipe(pipe_id, _planned_pipes[plan.table_name])); + RETURN_IF_ERROR(put_pipe(pipe_id, 
_planned_pipes[plan.table_name])); LOG(INFO) << "pipe_id=" << pipe_id << "table=" << plan.table_name; } else { LOG(WARNING) << "illegal exec param type, need `TExecPlanFragmentParams` or " @@ -247,7 +247,7 @@ Status MultiTablePipe::exec_plans(ExecEnv* exec_env, std::vector para if (num_selected_rows > 0 && (double)state->num_rows_load_filtered() / num_selected_rows > _ctx->max_filter_ratio) { - *status = Status::InternalError("too many filtered rows"); + *status = Status::DataQualityError("too many filtered rows"); } if (_number_filtered_rows > 0 && !state->get_error_log_file_path().empty()) { _ctx->error_url = to_load_error_http_path(state->get_error_log_file_path()); @@ -300,7 +300,8 @@ Status MultiTablePipe::exec_plans(ExecEnv* exec_env, std::vector para #endif -Status MultiTablePipe::putPipe(const TUniqueId& pipe_id, std::shared_ptr pipe) { +Status MultiTablePipe::put_pipe(const TUniqueId& pipe_id, + std::shared_ptr pipe) { std::lock_guard l(_pipe_map_lock); auto it = _pipe_map.find(pipe_id); if (it != std::end(_pipe_map)) { @@ -310,16 +311,16 @@ Status MultiTablePipe::putPipe(const TUniqueId& pipe_id, std::shared_ptr MultiTablePipe::getPipe(const TUniqueId& pipe_id) { +std::shared_ptr MultiTablePipe::get_pipe(const TUniqueId& pipe_id) { std::lock_guard l(_pipe_map_lock); auto it = _pipe_map.find(pipe_id); if (it == std::end(_pipe_map)) { - return std::shared_ptr(nullptr); + return {}; } return it->second; } -void MultiTablePipe::removePipe(const TUniqueId& pipe_id) { +void MultiTablePipe::remove_pipe(const TUniqueId& pipe_id) { std::lock_guard l(_pipe_map_lock); auto it = _pipe_map.find(pipe_id); if (it != std::end(_pipe_map)) { diff --git a/be/src/io/fs/multi_table_pipe.h b/be/src/io/fs/multi_table_pipe.h index fd307222563b63..9f89672689d5a6 100644 --- a/be/src/io/fs/multi_table_pipe.h +++ b/be/src/io/fs/multi_table_pipe.h @@ -55,11 +55,11 @@ class MultiTablePipe : public KafkaConsumerPipe { void cancel(const std::string& reason) override; // register pair - Status putPipe(const TUniqueId& fragment_instance_id, std::shared_ptr pipe); + Status put_pipe(const TUniqueId& pipe_id, std::shared_ptr pipe); - std::shared_ptr getPipe(const TUniqueId& fragment_instance_id); + std::shared_ptr get_pipe(const TUniqueId& pipe_id); - void removePipe(const TUniqueId& fragment_instance_id); + void remove_pipe(const TUniqueId& pipe_id); private: // parse table name from data diff --git a/be/src/io/fs/s3_file_bufferpool.cpp b/be/src/io/fs/s3_file_bufferpool.cpp index 42459914d316e9..a4f75319ec0939 100644 --- a/be/src/io/fs/s3_file_bufferpool.cpp +++ b/be/src/io/fs/s3_file_bufferpool.cpp @@ -28,6 +28,9 @@ namespace doris { namespace io { +bvar::Adder s3_file_buffer_allocated("s3_file_buffer_allocated"); +bvar::Adder s3_file_buffer_allocating("s3_file_buffer_allocating"); + /** * 0. check if the inner memory buffer is empty or not * 1. 
reclaim the memory buffer if it's not empty @@ -143,6 +146,10 @@ Status UploadFileBuffer::append_data(const Slice& data) { } else { // wait for the buffer pool to allocate auto tmp = S3FileBufferPool::GetInstance()->allocate(true); + if (tmp.empty()) [[unlikely]] { + return Status::InternalError("Failed to allocate S3 buffer for {} seconds", + config::s3_writer_buffer_allocation_timeout); + } swap_buffer(tmp); } } @@ -156,6 +163,11 @@ Status UploadFileBuffer::append_data(const Slice& data) { */ void UploadFileBuffer::read_from_cache() { auto tmp = S3FileBufferPool::GetInstance()->allocate(true); + if (tmp.empty()) [[unlikely]] { + set_val(Status::InternalError("Failed to allocate S3 buffer for {} seconds", + config::s3_writer_buffer_allocation_timeout)); + return; + } swap_buffer(tmp); DCHECK(_holder != nullptr); @@ -285,6 +297,16 @@ std::shared_ptr FileBufferBuilder::build() { return nullptr; } +void S3FileBufferPool::reclaim(Slice buf) { + { + std::unique_lock lck {_lock}; + _free_raw_buffers.emplace_back(buf); + // waiters only exist when file cache is not enabled + _cv.notify_all(); + } + s3_file_buffer_allocated << -1; +} + void S3FileBufferPool::init(int32_t s3_write_buffer_whole_size, int32_t s3_write_buffer_size, ThreadPool* thread_pool) { // these counts could be made configurable @@ -305,13 +327,23 @@ void S3FileBufferPool::init(int32_t s3_write_buffer_whole_size, int32_t s3_write Slice S3FileBufferPool::allocate(bool reserve) { Slice buf; + Defer defer {[&]() { + if (!buf.empty()) { + s3_file_buffer_allocated << 1; + } + s3_file_buffer_allocating << -1; + }}; + s3_file_buffer_allocating << 1; // if reservation is required or there is no file cache, we must return a buf with memory reserved if (reserve || !config::enable_file_cache) { { std::unique_lock lck {_lock}; - _cv.wait(lck, [this]() { return !_free_raw_buffers.empty(); }); - buf = _free_raw_buffers.front(); - _free_raw_buffers.pop_front(); + _cv.wait_for(lck, std::chrono::seconds(config::s3_writer_buffer_allocation_timeout), + [this]() { return !_free_raw_buffers.empty(); }); + if (!_free_raw_buffers.empty()) { + buf = _free_raw_buffers.front(); + _free_raw_buffers.pop_front(); + } } return buf; } diff --git a/be/src/io/fs/s3_file_bufferpool.h b/be/src/io/fs/s3_file_bufferpool.h index 8dd61aac4de748..01d31928748035 100644 --- a/be/src/io/fs/s3_file_bufferpool.h +++ b/be/src/io/fs/s3_file_bufferpool.h @@ -329,11 +329,7 @@ class S3FileBufferPool { return ExecEnv::GetInstance()->get_s3_file_buffer_pool(); } - void reclaim(Slice buf) { - std::unique_lock lck {_lock}; - _free_raw_buffers.emplace_front(buf); - _cv.notify_all(); - } + void reclaim(Slice buf); /** * diff --git a/be/src/io/fs/s3_file_reader.cpp b/be/src/io/fs/s3_file_reader.cpp index 2116ed35558847..417d2a15f0034d 100644 --- a/be/src/io/fs/s3_file_reader.cpp +++ b/be/src/io/fs/s3_file_reader.cpp @@ -30,8 +30,6 @@ #include #include -// IWYU pragma: no_include - #include "common/compiler_util.h" // IWYU pragma: keep #include "io/fs/s3_common.h" #include "util/doris_metrics.h" diff --git a/be/src/io/fs/s3_file_system.cpp b/be/src/io/fs/s3_file_system.cpp index 2c3869b28ef8df..9b4e447c56d416 100644 --- a/be/src/io/fs/s3_file_system.cpp +++ b/be/src/io/fs/s3_file_system.cpp @@ -54,7 +54,6 @@ #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep // IWYU pragma: no_include #include // IWYU pragma: keep diff --git a/be/src/io/fs/s3_file_writer.cpp b/be/src/io/fs/s3_file_writer.cpp index aef406a03ceaa5..2438ba2cfdb22c 100644 --- a/be/src/io/fs/s3_file_writer.cpp +++
b/be/src/io/fs/s3_file_writer.cpp @@ -137,18 +137,19 @@ Status S3FileWriter::_create_multi_upload_request() { } void S3FileWriter::_wait_until_finish(std::string_view task_name) { - auto msg = - fmt::format("{} multipart upload already takes 5 min, bucket={}, key={}, upload_id={}", - task_name, _bucket, _path.native(), _upload_id); + auto timeout_duration = config::s3_writer_buffer_allocation_timeout; + auto msg = fmt::format( + "{} multipart upload has already taken {} seconds, bucket={}, key={}, upload_id={}", + task_name, timeout_duration, _bucket, _path.native(), _upload_id); timespec current_time; // We don't need high accuracy here, so we use time(nullptr) // since it's the fastest way to get the current time (in seconds) auto current_time_second = time(nullptr); - current_time.tv_sec = current_time_second + 300; + current_time.tv_sec = current_time_second + timeout_duration; current_time.tv_nsec = 0; // bthread::countdown_event::timed_wait() should use absolute time while (0 != _countdown_event.timed_wait(current_time)) { - current_time.tv_sec += 300; + current_time.tv_sec += timeout_duration; LOG(WARNING) << msg; } } @@ -466,8 +467,8 @@ void S3FileWriter::_put_object(UploadFileBuffer& buf) { response.GetError().GetExceptionName(), response.GetError().GetMessage(), static_cast(response.GetError().GetResponseCode())); - buf.set_val(_st); LOG(WARNING) << _st; + buf.set_val(_st); return; } _bytes_written += buf.get_size(); diff --git a/be/src/io/fs/stream_load_pipe.cpp b/be/src/io/fs/stream_load_pipe.cpp index 25be598e90eb23..3e2f6bdf3ea511 100644 --- a/be/src/io/fs/stream_load_pipe.cpp +++ b/be/src/io/fs/stream_load_pipe.cpp @@ -23,7 +23,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "runtime/exec_env.h" @@ -44,7 +43,6 @@ StreamLoadPipe::StreamLoadPipe(size_t max_buffered_bytes, size_t min_chunk_size, _use_proto(use_proto) {} StreamLoadPipe::~StreamLoadPipe() { - SCOPED_TRACK_MEMORY_TO_UNKNOWN(); while (!_buf_queue.empty()) { _buf_queue.pop_front(); } @@ -90,7 +88,7 @@ Status StreamLoadPipe::read_at_impl(size_t /*offset*/, Slice result, size_t* byt return Status::OK(); } -// If _total_length == -1, this should be a Kafka routine load task, +// If _total_length == -1, this should be a Kafka routine load task or a stream load with a chunked-transfer HTTP request, // just get the next buffer directly from the buffer queue, because one buffer contains a complete piece of data. // Otherwise, this should be a stream load task that needs to read the specified amount of data.
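// Aside: the s3_file_bufferpool.cpp and s3_file_writer.cpp hunks above share one pattern:
// an unbounded wait is replaced by a bounded one driven by
// config::s3_writer_buffer_allocation_timeout, and callers must now handle the timeout
// case (an empty Slice, or a repeated WARNING log). A self-contained sketch of the pool
// half, with simplified illustrative names (BufferPoolSketch is not a Doris class):
#include <chrono>
#include <condition_variable>
#include <deque>
#include <mutex>

struct BufferPoolSketch {
    std::mutex lock;
    std::condition_variable cv;
    std::deque<char*> free_buffers;

    // Returns nullptr when no buffer is freed within timeout_s seconds; the
    // caller turns that into an error status, as append_data() does above.
    char* allocate(int timeout_s) {
        std::unique_lock<std::mutex> lck(lock);
        cv.wait_for(lck, std::chrono::seconds(timeout_s),
                    [this] { return !free_buffers.empty(); });
        if (free_buffers.empty()) {
            return nullptr; // timed out
        }
        char* buf = free_buffers.front();
        free_buffers.pop_front();
        return buf;
    }

    void reclaim(char* buf) {
        {
            std::lock_guard<std::mutex> lck(lock);
            free_buffers.push_back(buf);
        }
        cv.notify_all(); // wake one of the bounded waiters, if any
    }
};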
Status StreamLoadPipe::read_one_message(std::unique_ptr* data, size_t* length) { diff --git a/be/src/olap/base_tablet.cpp b/be/src/olap/base_tablet.cpp index d118c26c8bfdd8..d9d4a4c537ce17 100644 --- a/be/src/olap/base_tablet.cpp +++ b/be/src/olap/base_tablet.cpp @@ -21,6 +21,7 @@ #include "olap/tablet_schema_cache.h" #include "util/doris_metrics.h" +#include "vec/common/schema_util.h" namespace doris { using namespace ErrorCode; @@ -65,4 +66,14 @@ void BaseTablet::update_max_version_schema(const TabletSchemaSPtr& tablet_schema } } +void BaseTablet::update_by_least_common_schema(const TabletSchemaSPtr& update_schema) { + std::lock_guard wrlock(_meta_lock); + auto final_schema = std::make_shared(); + CHECK(_max_version_schema->schema_version() >= update_schema->schema_version()); + vectorized::schema_util::get_least_common_schema({_max_version_schema, update_schema}, + final_schema); + _max_version_schema = final_schema; + VLOG_DEBUG << "dump updated tablet schema: " << final_schema->dump_structure(); +} + } /* namespace doris */ diff --git a/be/src/olap/base_tablet.h b/be/src/olap/base_tablet.h index 5e34be62dcd513..d7ed2ca83fae6a 100644 --- a/be/src/olap/base_tablet.h +++ b/be/src/olap/base_tablet.h @@ -65,6 +65,8 @@ class BaseTablet { void update_max_version_schema(const TabletSchemaSPtr& tablet_schema); + void update_by_least_common_schema(const TabletSchemaSPtr& update_schema); + TabletSchemaSPtr tablet_schema() const { std::shared_lock rlock(_meta_lock); return _max_version_schema; @@ -72,11 +74,12 @@ class BaseTablet { virtual bool exceed_version_limit(int32_t limit) const = 0; - virtual Status create_rowset_writer(RowsetWriterContext& context, - std::unique_ptr* rowset_writer) = 0; + virtual Result> create_rowset_writer(RowsetWriterContext& context, + bool vertical) = 0; virtual Status capture_rs_readers(const Version& spec_version, - std::vector* rs_splits) const = 0; + std::vector* rs_splits, + bool skip_missing_version) const = 0; virtual size_t tablet_footprint() = 0; diff --git a/be/src/olap/cold_data_compaction.cpp b/be/src/olap/cold_data_compaction.cpp index fc3f7569aa744e..656ead0a1d45d3 100644 --- a/be/src/olap/cold_data_compaction.cpp +++ b/be/src/olap/cold_data_compaction.cpp @@ -26,7 +26,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "olap/compaction.h" @@ -91,7 +90,6 @@ Status ColdDataCompaction::modify_rowsets(const Merger::Statistics* stats) { // TODO(plat1ko): process primary key _tablet->tablet_meta()->set_cooldown_meta_id(cooldown_meta_id); } - Tablet::erase_pending_remote_rowset(_output_rowset->rowset_id().to_string()); { std::shared_lock rlock(_tablet->get_header_lock()); _tablet->save_meta(); diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index d1fab950584b59..c8d0b2a29fa986 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -249,8 +249,7 @@ void Compaction::build_basic_info() { std::vector rowset_metas(_input_rowsets.size()); std::transform(_input_rowsets.begin(), _input_rowsets.end(), rowset_metas.begin(), [](const RowsetSharedPtr& rowset) { return rowset->rowset_meta(); }); - _cur_tablet_schema = - _tablet->rowset_meta_with_max_schema_version(rowset_metas)->tablet_schema(); + _cur_tablet_schema = _tablet->tablet_schema_with_merged_max_schema_version(rowset_metas); } bool Compaction::handle_ordered_data_compaction() { @@ -333,9 +332,6 @@ Status Compaction::do_compaction_impl(int64_t permits) { RowsetWriterContext ctx; 
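// Aside on the new create_rowset_writer() signature declared in base_tablet.h above: it
// returns a Result wrapping the writer instead of filling an out-parameter, and call
// sites unwrap it with DORIS_TRY, which propagates the embedded error Status from the
// enclosing function on failure. A sketch of the calling pattern used by the compaction
// and push-handler hunks later in this patch:
//
//     RowsetWriterContext ctx;
//     // ... fill ctx ...
//     auto writer = DORIS_TRY(tablet->create_rowset_writer(ctx, /*vertical=*/false));
//     // from here on, `writer` is a valid rowset writer pointer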
RETURN_IF_ERROR(construct_input_rowset_readers()); RETURN_IF_ERROR(construct_output_rowset_writer(ctx, vertical_compaction)); - if (compaction_type() == ReaderType::READER_COLD_DATA_COMPACTION) { - Tablet::add_pending_remote_rowset(_output_rs_writer->rowset_id().to_string()); - } // 2. write merged rows to output rowset // The test results show that merger is low-memory-footprint, there is no need to track its mem pool @@ -645,10 +641,9 @@ Status Compaction::construct_output_rowset_writer(RowsetWriterContext& ctx, bool DCHECK(resource.fs->type() != io::FileSystemType::LOCAL); ctx.fs = std::move(resource.fs); } - if (is_vertical) { - return _tablet->create_vertical_rowset_writer(ctx, &_output_rs_writer); - } - return _tablet->create_rowset_writer(ctx, &_output_rs_writer); + _output_rs_writer = DORIS_TRY(_tablet->create_rowset_writer(ctx, is_vertical)); + _pending_rs_guard = StorageEngine::instance()->add_pending_rowset(ctx); + return Status::OK(); } Status Compaction::construct_input_rowset_readers() { @@ -719,24 +714,24 @@ Status Compaction::modify_rowsets(const Merger::Statistics* stats) { // Therefore, we need to check if every committed rowset has calculated delete bitmap for // all compaction input rowsets. continue; - } else { - DeleteBitmap txn_output_delete_bitmap(_tablet->tablet_id()); - _tablet->calc_compaction_output_rowset_delete_bitmap( - _input_rowsets, _rowid_conversion, 0, UINT64_MAX, &missed_rows, - &location_map, *it.delete_bitmap.get(), &txn_output_delete_bitmap); - if (config::enable_merge_on_write_correctness_check) { - RowsetIdUnorderedSet rowsetids; - rowsetids.insert(_output_rowset->rowset_id()); - _tablet->add_sentinel_mark_to_delete_bitmap(&txn_output_delete_bitmap, - rowsetids); - } - it.delete_bitmap->merge(txn_output_delete_bitmap); - // Step3: write back updated delete bitmap and tablet info. - it.rowset_ids.insert(_output_rowset->rowset_id()); - StorageEngine::instance()->txn_manager()->set_txn_related_delete_bitmap( - it.partition_id, it.transaction_id, _tablet->tablet_id(), - _tablet->tablet_uid(), true, it.delete_bitmap, it.rowset_ids, nullptr); } + DeleteBitmap txn_output_delete_bitmap(_tablet->tablet_id()); + _tablet->calc_compaction_output_rowset_delete_bitmap( + _input_rowsets, _rowid_conversion, 0, UINT64_MAX, &missed_rows, + &location_map, *it.delete_bitmap.get(), &txn_output_delete_bitmap); + if (config::enable_merge_on_write_correctness_check) { + RowsetIdUnorderedSet rowsetids; + rowsetids.insert(_output_rowset->rowset_id()); + _tablet->add_sentinel_mark_to_delete_bitmap(&txn_output_delete_bitmap, + rowsetids); + } + it.delete_bitmap->merge(txn_output_delete_bitmap); + // Step3: write back updated delete bitmap and tablet info.
+ it.rowset_ids.insert(_output_rowset->rowset_id()); + StorageEngine::instance()->txn_manager()->set_txn_related_delete_bitmap( + it.partition_id, it.transaction_id, _tablet->tablet_id(), + _tablet->tablet_uid(), true, it.delete_bitmap, it.rowset_ids, + it.partial_update_info); } // Convert the delete bitmap of the input rowsets to output rowset for @@ -807,7 +802,6 @@ bool Compaction::_check_if_includes_input_rowsets( void Compaction::gc_output_rowset() { if (_state != CompactionState::SUCCESS && _output_rowset != nullptr) { if (!_output_rowset->is_local()) { - Tablet::erase_pending_remote_rowset(_output_rowset->rowset_id().to_string()); _tablet->record_unused_remote_rowset(_output_rowset->rowset_id(), _output_rowset->rowset_meta()->resource_id(), _output_rowset->num_segments()); diff --git a/be/src/olap/compaction.h b/be/src/olap/compaction.h index ad29430fec6a47..2aff05ced83748 100644 --- a/be/src/olap/compaction.h +++ b/be/src/olap/compaction.h @@ -29,6 +29,7 @@ #include "olap/merger.h" #include "olap/olap_common.h" #include "olap/rowid_conversion.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_reader.h" #include "olap/tablet.h" @@ -115,6 +116,7 @@ class Compaction { int64_t _input_index_size; RowsetSharedPtr _output_rowset; + PendingRowsetGuard _pending_rs_guard; std::unique_ptr _output_rs_writer; enum CompactionState { INITED = 0, SUCCESS = 1 }; diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h index 2fa1d629811196..522939370cf444 100644 --- a/be/src/olap/comparison_predicate.h +++ b/be/src/olap/comparison_predicate.h @@ -179,7 +179,9 @@ class ComparisonPredicateBase : public ColumnPredicate { return false; } - DCHECK_LE(sizeof(T), statistic.first->size()); + DCHECK(sizeof(T) <= statistic.first->size() || Type == TYPE_DATE) + << " Type: " << Type << " sizeof(T): " << sizeof(T) + << " statistic.first->size(): " << statistic.first->size(); T tmp_min_value {}; T tmp_max_value {}; diff --git a/be/src/olap/data_dir.cpp b/be/src/olap/data_dir.cpp index 3bcdc156298421..3dfdc7d87fb900 100644 --- a/be/src/olap/data_dir.cpp +++ b/be/src/olap/data_dir.cpp @@ -49,6 +49,7 @@ #include "olap/olap_define.h" #include "olap/olap_meta.h" #include "olap/rowset/beta_rowset.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_id_generator.h" #include "olap/rowset/rowset_meta.h" @@ -124,7 +125,6 @@ Status DataDir::init() { "check file exist failed"); } - update_trash_capacity(); RETURN_NOT_OK_STATUS_WITH_WARN(update_capacity(), "update_capacity failed"); RETURN_NOT_OK_STATUS_WITH_WARN(_init_cluster_id(), "_init_cluster_id failed"); RETURN_NOT_OK_STATUS_WITH_WARN(_init_capacity_and_create_shards(), @@ -137,8 +137,6 @@ Status DataDir::init() { void DataDir::stop_bg_worker() { _stop_bg_worker = true; - std::unique_lock lck(_check_path_mutex); - _check_path_cv.notify_one(); } Status DataDir::_init_cluster_id() { @@ -254,15 +252,14 @@ Status DataDir::_read_and_write_test_file() { return read_write_test_file(test_file); } -Status DataDir::get_shard(uint64_t* shard) { +uint64_t DataDir::get_shard() { uint32_t next_shard = 0; { std::lock_guard l(_mutex); next_shard = _current_shard; _current_shard = (_current_shard + 1) % MAX_SHARD_NUM; } - *shard = next_shard; - return Status::OK(); + return next_shard; } void DataDir::register_tablet(Tablet* tablet) { @@ -507,7 +504,7 @@ Status DataDir::load() { // 2. 
add visible rowset to tablet // ignore any errors when loading tablets or rowsets, because fe will repair them after the report int64_t invalid_rowset_counter = 0; - for (auto rowset_meta : dir_rowset_metas) { + for (auto&& rowset_meta : dir_rowset_metas) { TabletSharedPtr tablet = _tablet_manager->get_tablet(rowset_meta->tablet_id()); // tablet maybe dropped, but not drop related rowset meta if (tablet == nullptr) { @@ -538,7 +535,10 @@ Status DataDir::load() { Status commit_txn_status = _txn_manager->commit_txn( _meta, rowset_meta->partition_id(), rowset_meta->txn_id(), rowset_meta->tablet_id(), rowset_meta->tablet_uid(), rowset_meta->load_id(), - rowset, true); + rowset, + StorageEngine::instance()->pending_local_rowsets().add( + rowset_meta->rowset_id()), + true); if (!commit_txn_status && !commit_txn_status.is()) { LOG(WARNING) << "failed to add committed rowset: " << rowset_meta->rowset_id() << " to tablet: " << rowset_meta->tablet_id() @@ -627,74 +627,60 @@ Status DataDir::load() { } void DataDir::perform_path_gc() { - std::unique_lock lck(_check_path_mutex); - _check_path_cv.wait(lck, [this] { return _stop_bg_worker || !_all_tablet_schemahash_paths.empty() || !_all_check_paths.empty(); }); - if (_stop_bg_worker) { - return; - } - - _perform_path_gc_by_tablet(); - _perform_path_gc_by_rowsetid(); + auto tablet_paths = _perform_path_scan(); + _perform_path_gc_by_tablet(tablet_paths); + _perform_path_gc_by_rowset(tablet_paths); } // gc unused tablet schemahash dir -void DataDir::_perform_path_gc_by_tablet() { - if (_all_tablet_schemahash_paths.empty()) { +void DataDir::_perform_path_gc_by_tablet(std::vector& tablet_paths) { + if (_stop_bg_worker) return; + if (tablet_paths.empty()) { return; } - LOG(INFO) << "start to path gc by tablet schemahash."; + LOG(INFO) << "start path gc by tablet, dir=" << _path; + // Use two iterators to avoid moving elements after erasing an element + auto forward = tablet_paths.begin(); + auto backward = tablet_paths.end(); int counter = 0; - for (const auto& path : _all_tablet_schemahash_paths) { - ++counter; + do { if (config::path_gc_check_step > 0 && counter % config::path_gc_check_step == 0) { std::this_thread::sleep_for( std::chrono::milliseconds(config::path_gc_check_step_interval_ms)); } + auto& path = *forward; TTabletId tablet_id = -1; TSchemaHash schema_hash = -1; bool is_valid = _tablet_manager->get_tablet_id_and_schema_hash_from_path(path, &tablet_id, &schema_hash); - if (!is_valid) { + if (!is_valid || tablet_id < 1 || schema_hash < 1) [[unlikely]] { LOG(WARNING) << "unknown path:" << path; + --backward; + std::swap(*forward, *backward); continue; } - // should not happen, because already check it is a valid tablet schema hash path in previous step - // so that log fatal here - if (tablet_id < 1 || schema_hash < 1) { LOG(WARNING) << "invalid tablet id " << tablet_id << " or schema hash " << schema_hash - << ", path=" << path; + if (auto tablet = _tablet_manager->get_tablet(tablet_id); !tablet) { + _tablet_manager->try_delete_unused_tablet_path(this, tablet_id, schema_hash, path); + --backward; + std::swap(*forward, *backward); continue; } - TabletSharedPtr tablet = _tablet_manager->get_tablet(tablet_id); - if (tablet != nullptr) { - // could find the tablet, then skip check it - continue; - } - std::string data_dir_path = - io::Path(path).parent_path().parent_path().parent_path().parent_path(); - DataDir* data_dir = StorageEngine::instance()->get_store(data_dir_path); - if (data_dir == nullptr) { - LOG(WARNING) << "could not find data dir for
tablet path " << path; - continue; - } - _tablet_manager->try_delete_unused_tablet_path(data_dir, tablet_id, schema_hash, path); - } - _all_tablet_schemahash_paths.clear(); - LOG(INFO) << "finished one time path gc by tablet."; + // could find the tablet, then skip check it + ++forward; + } while (forward != backward && !_stop_bg_worker); + tablet_paths.erase(backward, tablet_paths.end()); + LOG(INFO) << "finish path gc by tablet, dir=" << _path; } -void DataDir::_perform_path_gc_by_rowsetid() { - // init the set of valid path - // validate the path in data dir - if (_all_check_paths.empty()) { +void DataDir::_perform_path_gc_by_rowset(const std::vector& tablet_paths) { + if (_stop_bg_worker) return; + if (tablet_paths.empty()) { return; } - LOG(INFO) << "start to path gc by rowsetid."; + LOG(INFO) << "start to path gc by rowset"; int counter = 0; - for (const auto& path : _all_check_paths) { + for (const auto& path : tablet_paths) { + if (_stop_bg_worker) break; ++counter; if (config::path_gc_check_step > 0 && counter % config::path_gc_check_step == 0) { std::this_thread::sleep_for( @@ -704,55 +690,92 @@ void DataDir::_perform_path_gc_by_rowsetid() { TSchemaHash schema_hash = -1; bool is_valid = _tablet_manager->get_tablet_id_and_schema_hash_from_path(path, &tablet_id, &schema_hash); - if (!is_valid) { + if (!is_valid || tablet_id < 1 || schema_hash < 1) [[unlikely]] { LOG(WARNING) << "unknown path:" << path; continue; } - if (tablet_id > 0 && schema_hash > 0) { - // tablet schema hash path or rowset file path - // gc thread should get tablet include deleted tablet - // or it will delete rowset file before tablet is garbage collected - RowsetId rowset_id; - bool is_rowset_file = TabletManager::get_rowset_id_from_path(path, &rowset_id); - if (is_rowset_file) { - TabletSharedPtr tablet = _tablet_manager->get_tablet(tablet_id); - if (tablet != nullptr) { - if (!tablet->check_rowset_id(rowset_id) && - !StorageEngine::instance()->check_rowset_id_in_unused_rowsets(rowset_id)) { - _process_garbage_path(path); - } + bool exists; + std::vector files; + if (_stop_bg_worker) break; + auto st = io::global_local_filesystem()->list(path, true, &files, &exists); + if (!st.ok()) [[unlikely]] { + LOG(WARNING) << "fail to list tablet path " << path << " : " << st; + continue; + } + RowsetIdUnorderedSet useful_rowsets; + auto tablet = _tablet_manager->get_tablet(tablet_id); + if (!tablet) { + // Could not found the tablet, maybe it's a dropped tablet, will be reclaimed + // in the next time `_perform_path_gc_by_tablet` + continue; + } + tablet->traverse_rowsets( + [&useful_rowsets](auto& rs) { useful_rowsets.insert(rs->rowset_id()); }, true); + // rowset_id -> is_garbage + std::unordered_map checked_rowsets; + auto reclaim_rowset_file = [](const std::string& path) { + auto st = io::global_local_filesystem()->delete_file(path); + if (!st.ok()) [[unlikely]] { + LOG(WARNING) << "failed to delete garbage rowset file: " << st; + return; + } + LOG(INFO) << "delete garbage path: " << path; // Audit log + }; + auto should_reclaim = [&, this](const RowsetId& rowset_id) { + return !useful_rowsets.contains(rowset_id) && + !StorageEngine::instance()->pending_local_rowsets().contains(rowset_id) && + !StorageEngine::instance()->check_rowset_id_in_unused_rowsets(rowset_id) && + RowsetMetaManager::exists(get_meta(), tablet->tablet_uid(), rowset_id) + .is(); + }; + for (auto&& file : files) { + auto rowset_id = extract_rowset_id(file.file_name); + if (rowset_id.hi == 0) { + continue; // Not a rowset + } + auto find_it = 
checked_rowsets.find(rowset_id); + if (find_it != checked_rowsets.end()) { + if (find_it->second) { + reclaim_rowset_file(path + '/' + file.file_name); } + continue; + } + // Check whether the rowset is still useful + if (should_reclaim(rowset_id)) { + reclaim_rowset_file(path + '/' + file.file_name); + checked_rowsets.emplace(rowset_id, true); + } else { + checked_rowsets.emplace(rowset_id, false); } } } - _all_check_paths.clear(); - LOG(INFO) << "finished one time path gc by rowsetid."; + LOG(INFO) << "finish path gc by rowset."; } // path producer -Status DataDir::perform_path_scan() { - std::unique_lock lck(_check_path_mutex); - if (!_all_check_paths.empty()) { - LOG(INFO) << "_all_check_paths is not empty when path scan."; - return Status::OK(); - } - LOG(INFO) << "start to scan data dir path:" << _path; +std::vector DataDir::_perform_path_scan() { + std::vector tablet_paths; + if (_stop_bg_worker) return tablet_paths; + LOG(INFO) << "start to scan data dir " << _path; auto data_path = fmt::format("{}/{}", _path, DATA_PREFIX); std::vector shards; bool exists = true; - RETURN_IF_ERROR(io::global_local_filesystem()->list(data_path, false, &shards, &exists)); - - Status ret; + const auto& fs = io::global_local_filesystem(); + auto st = fs->list(data_path, false, &shards, &exists); + if (!st.ok()) [[unlikely]] { + LOG(WARNING) << "failed to scan data dir: " << st; + return tablet_paths; + } for (const auto& shard : shards) { + if (_stop_bg_worker) break; if (shard.is_file) { continue; } auto shard_path = fmt::format("{}/{}", data_path, shard.file_name); std::vector tablet_ids; - ret = io::global_local_filesystem()->list(shard_path, false, &tablet_ids, &exists); - if (!ret.ok()) { - LOG(WARNING) << "fail to walk dir. [path=" << shard_path << "] error[" - << ret.to_string() << "]"; + st = io::global_local_filesystem()->list(shard_path, false, &tablet_ids, &exists); + if (!st.ok()) [[unlikely]] { + LOG(WARNING) << "fail to walk dir, shard_path=" << shard_path << " : " << st; continue; } for (const auto& tablet_id : tablet_ids) { @@ -762,71 +785,24 @@ if (tablet_id.is_file) { continue; } - auto tablet_id_path = fmt::format("{}/{}", shard_path, tablet_id.file_name); std::vector schema_hashes; - ret = io::global_local_filesystem()->list(tablet_id_path, false, &schema_hashes, - &exists); - if (!ret.ok()) { - LOG(WARNING) << "fail to walk dir. [path=" << tablet_id_path << "]" - << " error[" << ret.to_string() << "]"; + st = io::global_local_filesystem()->list(tablet_id_path, false, &schema_hashes, - &exists); + if (!st.ok()) [[unlikely]] { + LOG(WARNING) << "fail to walk dir, tablet_id_path=" << tablet_id_path << " : " + << st; continue; } - - for (const auto& schema_hash : schema_hashes) { - if (schema_hash.is_file) { - continue; - } - int32_t interval_ms = config::path_scan_step_interval_ms; - if (_stop_bg_worker) { - break; - } - if (interval_ms > 0) { - std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms)); - } - auto tablet_schema_hash_path = - fmt::format("{}/{}", tablet_id_path, schema_hash.file_name); - _all_tablet_schemahash_paths.insert(tablet_schema_hash_path); - - std::vector rowset_files; - ret = io::global_local_filesystem()->list(tablet_schema_hash_path, true, - &rowset_files, &exists); - if (!ret.ok()) { - LOG(WARNING) << "fail to walk dir.
[path=" << tablet_schema_hash_path - << "] error[" << ret.to_string() << "]"; - continue; - } - for (const auto& rowset_file : rowset_files) { - if (!rowset_file.is_file) { - continue; - } - auto rowset_file_path = - fmt::format("{}/{}", tablet_schema_hash_path, rowset_file.file_name); - _all_check_paths.insert(rowset_file_path); - } + if (schema_hashes.size() != 1 || schema_hashes[0].is_file) [[unlikely]] { + LOG(WARNING) << "invalid tablet_path, path=" << tablet_id_path; + continue; } + tablet_paths.push_back(tablet_id_path + '/' + schema_hashes[0].file_name); } } - LOG(INFO) << "scan data dir path: " << _path << " finished. path size: " - << _all_check_paths.size() + _all_tablet_schemahash_paths.size(); - _check_path_cv.notify_one(); - return Status::OK(); -} - -// This function is called for rowset_id path, only local rowset_id_path can be garbage. -// remote path is uploaded, moved or deleted by tablet_id, -// if local path has no remote path params, remote path doesn't exist. -void DataDir::_process_garbage_path(const std::string& path) { - bool exists = false; - Status st = io::global_local_filesystem()->exists(path, &exists); - if (!st) { - return; - } - if (exists) { - LOG(INFO) << "collect garbage path: " << path; - WARN_IF_ERROR(io::global_local_filesystem()->delete_directory_or_file(path), - "remove garbage failed"); - } + LOG(INFO) << "scan data dir path: " << _path << " finished. path size: " << tablet_paths.size(); + return tablet_paths; } Status DataDir::update_capacity() { diff --git a/be/src/olap/data_dir.h b/be/src/olap/data_dir.h index e71dc0f8d56322..0cdb15b24178c0 100644 --- a/be/src/olap/data_dir.h +++ b/be/src/olap/data_dir.h @@ -82,7 +82,7 @@ class DataDir { Status set_cluster_id(int32_t cluster_id); void health_check(); - Status get_shard(uint64_t* shard); + uint64_t get_shard(); OlapMeta* get_meta() { return _meta; } @@ -105,10 +105,6 @@ class DataDir { // load data from meta and data files Status load(); - // this function scans the paths in data dir to collect the paths to check - // this is a producer function. After scan, it will notify the perform_path_gc function to gc - Status perform_path_scan(); - void perform_path_gc(); void perform_remote_rowset_gc(); @@ -158,13 +154,11 @@ class DataDir { // process will log fatal. 
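// Aside on the _perform_path_gc_by_tablet() loop in data_dir.cpp above: it partitions
// tablet_paths in place with two iterators, swapping dead paths to the tail and erasing
// them afterwards, so the surviving prefix can feed _perform_path_gc_by_rowset() without
// reallocation. The idiom in isolation (std::vector and should_drop() purely for
// illustration):
//
//     auto forward = v.begin();
//     auto backward = v.end();
//     while (forward != backward) {
//         if (should_drop(*forward)) {
//             --backward;
//             std::swap(*forward, *backward); // revisit the swapped-in element next
//         } else {
//             ++forward;
//         }
//     }
//     v.erase(backward, v.end()); // the tail now holds all dropped elements
//
// std::partition() performs the same unstable partition when a predicate fits.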
Status _check_incompatible_old_format_tablet(); - void _process_garbage_path(const std::string& path); - - void _remove_check_paths(const std::set& paths); + std::vector _perform_path_scan(); - void _perform_path_gc_by_tablet(); + void _perform_path_gc_by_tablet(std::vector& tablet_paths); - void _perform_path_gc_by_rowsetid(); + void _perform_path_gc_by_rowset(const std::vector& tablet_paths); private: std::atomic _stop_bg_worker = false; @@ -198,11 +192,6 @@ class DataDir { OlapMeta* _meta = nullptr; RowsetIdGenerator* _id_generator = nullptr; - std::mutex _check_path_mutex; - std::condition_variable _check_path_cv; - std::set _all_check_paths; - std::set _all_tablet_schemahash_paths; - std::shared_ptr _data_dir_metric_entity; IntGauge* disks_total_capacity; IntGauge* disks_avail_capacity; diff --git a/be/src/olap/delete_bitmap_calculator.h b/be/src/olap/delete_bitmap_calculator.h index 902db7cb71c9cd..dd17fe7b686b96 100644 --- a/be/src/olap/delete_bitmap_calculator.h +++ b/be/src/olap/delete_bitmap_calculator.h @@ -33,7 +33,6 @@ #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_reader.h" -#include "olap/rowset/rowset_tree.h" #include "olap/rowset/segment_v2/segment.h" #include "olap/tablet_meta.h" #include "olap/tablet_schema.h" diff --git a/be/src/olap/delta_writer.cpp b/be/src/olap/delta_writer.cpp index 23e1718cb7d3ca..68b97e49cb39a1 100644 --- a/be/src/olap/delta_writer.cpp +++ b/be/src/olap/delta_writer.cpp @@ -28,7 +28,6 @@ #include #include -// IWYU pragma: no_include #include "cloud/config.h" #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" @@ -243,21 +242,20 @@ void DeltaWriter::_request_slave_tablet_pull_rowset(PNodeInfo node_info) { } } - PTabletWriteSlaveRequest request; - RowsetMetaPB rowset_meta_pb = cur_rowset->rowset_meta()->get_rowset_pb(); - request.set_allocated_rowset_meta(&rowset_meta_pb); - request.set_host(BackendOptions::get_localhost()); - request.set_http_port(config::webserver_port); + auto request = std::make_shared(); + *(request->mutable_rowset_meta()) = cur_rowset->rowset_meta()->get_rowset_pb(); + request->set_host(BackendOptions::get_localhost()); + request->set_http_port(config::webserver_port); string tablet_path = _rowset_builder.tablet()->tablet_path(); - request.set_rowset_path(tablet_path); - request.set_token(ExecEnv::GetInstance()->token()); - request.set_brpc_port(config::brpc_port); - request.set_node_id(node_info.id()); + request->set_rowset_path(tablet_path); + request->set_token(ExecEnv::GetInstance()->token()); + request->set_brpc_port(config::brpc_port); + request->set_node_id(node_info.id()); for (int segment_id = 0; segment_id < cur_rowset->rowset_meta()->num_segments(); segment_id++) { std::stringstream segment_name; segment_name << cur_rowset->rowset_id() << "_" << segment_id << ".dat"; int64_t segment_size = std::filesystem::file_size(tablet_path + "/" + segment_name.str()); - request.mutable_segments_size()->insert({segment_id, segment_size}); + request->mutable_segments_size()->insert({segment_id, segment_size}); if (!indices_ids.empty()) { for (auto index_id : indices_ids) { @@ -269,41 +267,39 @@ void DeltaWriter::_request_slave_tablet_pull_rowset(PNodeInfo node_info) { index_size.set_size(size); // Fetch the map value for the current segment_id. 
// If it doesn't exist, this will insert a new default-constructed IndexSizeMapValue - auto& index_size_map_value = (*request.mutable_inverted_indices_size())[segment_id]; + auto& index_size_map_value = + (*(request->mutable_inverted_indices_size()))[segment_id]; // Add the new index size to the map value. *index_size_map_value.mutable_index_sizes()->Add() = std::move(index_size); } } } - RefCountClosure* closure = - new RefCountClosure(); - closure->ref(); - closure->ref(); - closure->cntl.set_timeout_ms(config::slave_replica_writer_rpc_timeout_sec * 1000); - closure->cntl.ignore_eovercrowded(); - stub->request_slave_tablet_pull_rowset(&closure->cntl, &request, &closure->result, closure); - static_cast(request.release_rowset_meta()); - - closure->join(); - if (closure->cntl.Failed()) { + + auto pull_callback = DummyBrpcCallback::create_shared(); + auto closure = AutoReleaseClosure< + PTabletWriteSlaveRequest, + DummyBrpcCallback>::create_unique(request, pull_callback); + closure->cntl_->set_timeout_ms(config::slave_replica_writer_rpc_timeout_sec * 1000); + closure->cntl_->ignore_eovercrowded(); + stub->request_slave_tablet_pull_rowset(closure->cntl_.get(), closure->request_.get(), + closure->response_.get(), closure.get()); + + closure.release(); + pull_callback->join(); + if (pull_callback->cntl_->Failed()) { if (!ExecEnv::GetInstance()->brpc_internal_client_cache()->available( stub, node_info.host(), node_info.async_internal_port())) { ExecEnv::GetInstance()->brpc_internal_client_cache()->erase( - closure->cntl.remote_side()); + pull_callback->cntl_->remote_side()); } LOG(WARNING) << "failed to send pull rowset request to slave replica, error=" - << berror(closure->cntl.ErrorCode()) - << ", error_text=" << closure->cntl.ErrorText() + << berror(pull_callback->cntl_->ErrorCode()) + << ", error_text=" << pull_callback->cntl_->ErrorText() << ". 
slave host: " << node_info.host() << ", tablet_id=" << _req.tablet_id << ", txn_id=" << _req.txn_id; std::lock_guard lock(_slave_node_lock); _unfinished_slave_node.erase(node_info.id()); } - - if (closure->unref()) { - delete closure; - } - closure = nullptr; } void DeltaWriter::finish_slave_tablet_pull_rowset(int64_t node_id, bool is_succeed) { diff --git a/be/src/olap/delta_writer.h b/be/src/olap/delta_writer.h index 303f17f14f8f36..d4519c36a785a3 100644 --- a/be/src/olap/delta_writer.h +++ b/be/src/olap/delta_writer.h @@ -105,7 +105,7 @@ class DeltaWriter { int64_t txn_id() const { return _req.txn_id; } - int64_t total_received_rows() const { return _total_received_rows; } + int64_t total_received_rows() const { return _memtable_writer->total_received_rows(); } int64_t num_rows_filtered() const; @@ -132,9 +132,6 @@ class DeltaWriter { PSuccessSlaveTabletNodeIds _success_slave_node_ids; std::shared_mutex _slave_node_lock; - // total rows num written by DeltaWriter - int64_t _total_received_rows = 0; - RuntimeProfile* _profile = nullptr; RuntimeProfile::Counter* _close_wait_timer = nullptr; RuntimeProfile::Counter* _commit_txn_timer = nullptr; diff --git a/be/src/olap/delta_writer_v2.cpp b/be/src/olap/delta_writer_v2.cpp index ef3ff23f9d88c3..0a4108970a60c4 100644 --- a/be/src/olap/delta_writer_v2.cpp +++ b/be/src/olap/delta_writer_v2.cpp @@ -28,7 +28,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/logging.h" @@ -64,28 +63,27 @@ namespace doris { using namespace ErrorCode; -Status DeltaWriterV2::open(WriteRequest* req, - const std::vector>& streams, - DeltaWriterV2** writer, RuntimeProfile* profile) { - *writer = new DeltaWriterV2(req, streams, StorageEngine::instance(), profile); - return Status::OK(); +std::unique_ptr DeltaWriterV2::open( + WriteRequest* req, const std::vector>& streams) { + std::unique_ptr writer( + new DeltaWriterV2(req, streams, StorageEngine::instance())); + return writer; } DeltaWriterV2::DeltaWriterV2(WriteRequest* req, const std::vector>& streams, - StorageEngine* storage_engine, RuntimeProfile* profile) + StorageEngine* storage_engine) : _req(*req), _tablet_schema(new TabletSchema), - _profile(profile->create_child(fmt::format("DeltaWriterV2 {}", _req.tablet_id), true, - true)), _memtable_writer(new MemTableWriter(*req)), - _streams(streams) { - _init_profile(profile); -} - -void DeltaWriterV2::_init_profile(RuntimeProfile* profile) { - _write_memtable_timer = ADD_TIMER(_profile, "WriteMemTableTime"); - _close_wait_timer = ADD_TIMER(_profile, "CloseWaitTime"); + _streams(streams) {} + +void DeltaWriterV2::_update_profile(RuntimeProfile* profile) { + auto child = profile->create_child(fmt::format("DeltaWriterV2 {}", _req.tablet_id), true, true); + auto write_memtable_timer = ADD_TIMER(child, "WriteMemTableTime"); + auto close_wait_timer = ADD_TIMER(child, "CloseWaitTime"); + COUNTER_SET(write_memtable_timer, _write_memtable_time); + COUNTER_SET(close_wait_timer, _close_wait_time); } DeltaWriterV2::~DeltaWriterV2() { @@ -102,6 +100,9 @@ Status DeltaWriterV2::init() { return Status::OK(); } // build tablet schema in request level + if (_streams.size() == 0 || _streams[0]->tablet_schema(_req.index_id) == nullptr) { + return Status::InternalError("failed to find tablet schema for {}", _req.index_id); + } _build_current_tablet_schema(_req.index_id, _req.table_schema_param, *_streams[0]->tablet_schema(_req.index_id)); RowsetWriterContext context; @@ -149,7 +150,7 @@ 
Status DeltaWriterV2::write(const vectorized::Block* block, const std::vectorwrite(block, row_idxs, is_append); } @@ -168,13 +169,16 @@ Status DeltaWriterV2::close() { return _memtable_writer->close(); } -Status DeltaWriterV2::close_wait() { - SCOPED_TIMER(_close_wait_timer); +Status DeltaWriterV2::close_wait(RuntimeProfile* profile) { + SCOPED_RAW_TIMER(&_close_wait_time); std::lock_guard l(_lock); DCHECK(_is_init) << "delta writer is supposed to be initialized before close_wait() is called"; - RETURN_IF_ERROR(_memtable_writer->close_wait(_profile)); + if (profile != nullptr) { + _update_profile(profile); + } + RETURN_IF_ERROR(_memtable_writer->close_wait(profile)); _delta_written_success = true; return Status::OK(); diff --git a/be/src/olap/delta_writer_v2.h b/be/src/olap/delta_writer_v2.h index b2b1f5f1c19f1c..8a102c5706d496 100644 --- a/be/src/olap/delta_writer_v2.h +++ b/be/src/olap/delta_writer_v2.h @@ -63,9 +63,8 @@ class Block; // This class is NOT thread-safe, external synchronization is required. class DeltaWriterV2 { public: - static Status open(WriteRequest* req, - const std::vector>& streams, - DeltaWriterV2** writer, RuntimeProfile* profile); + static std::unique_ptr open( + WriteRequest* req, const std::vector>& streams); ~DeltaWriterV2(); @@ -80,7 +79,7 @@ class DeltaWriterV2 { Status close(); // wait for all memtables to be flushed. // mem_consumption() should be 0 after this function returns. - Status close_wait(); + Status close_wait(RuntimeProfile* profile = nullptr); // abandon current memtable and wait for all pending-flushing memtables to be destructed. // mem_consumption() should be 0 after this function returns. @@ -99,13 +98,13 @@ class DeltaWriterV2 { private: DeltaWriterV2(WriteRequest* req, const std::vector>& streams, - StorageEngine* storage_engine, RuntimeProfile* profile); + StorageEngine* storage_engine); void _build_current_tablet_schema(int64_t index_id, const OlapTableSchemaParam* table_schema_param, const TabletSchema& ori_tablet_schema); - void _init_profile(RuntimeProfile* profile); + void _update_profile(RuntimeProfile* profile); bool _is_init = false; bool _is_cancelled = false; @@ -119,9 +118,8 @@ class DeltaWriterV2 { // total rows num written by DeltaWriterV2 int64_t _total_received_rows = 0; - RuntimeProfile* _profile = nullptr; - RuntimeProfile::Counter* _write_memtable_timer = nullptr; - RuntimeProfile::Counter* _close_wait_timer = nullptr; + int64_t _write_memtable_time = 0; + int64_t _close_wait_time = 0; std::shared_ptr _memtable_writer; MonotonicStopWatch _lock_watch; diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index b98e7411e3b1cc..cade509aac5ce1 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -155,14 +155,14 @@ class MemTableStat { return *this; } - int64_t raw_rows = 0; - int64_t merged_rows = 0; + std::atomic raw_rows = 0; + std::atomic merged_rows = 0; int64_t sort_ns = 0; int64_t agg_ns = 0; int64_t put_into_output_ns = 0; int64_t duration_ns = 0; - int64_t sort_times = 0; - int64_t agg_times = 0; + std::atomic sort_times = 0; + std::atomic agg_times = 0; }; class MemTable { diff --git a/be/src/olap/memtable_flush_executor.cpp b/be/src/olap/memtable_flush_executor.cpp index c5d74a6de5e657..ac114291a0b251 100644 --- a/be/src/olap/memtable_flush_executor.cpp +++ b/be/src/olap/memtable_flush_executor.cpp @@ -18,9 +18,9 @@ #include "olap/memtable_flush_executor.h" #include -#include #include +#include #include #include "common/config.h" @@ -29,12 +29,18 @@ #include "olap/memtable.h" #include
"olap/rowset/rowset_writer.h" #include "util/doris_metrics.h" +#include "util/metrics.h" #include "util/stopwatch.hpp" #include "util/time.h" namespace doris { using namespace ErrorCode; +DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(flush_thread_pool_queue_size, MetricUnit::NOUNIT); +DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(flush_thread_pool_thread_num, MetricUnit::NOUNIT); + +bvar::Adder g_flush_task_num("memtable_flush_task_num"); + class MemtableFlushTask final : public Runnable { public: MemtableFlushTask(FlushToken* flush_token, std::unique_ptr memtable, @@ -42,9 +48,11 @@ class MemtableFlushTask final : public Runnable { : _flush_token(flush_token), _memtable(std::move(memtable)), _segment_id(segment_id), - _submit_task_time(submit_task_time) {} + _submit_task_time(submit_task_time) { + g_flush_task_num << 1; + } - ~MemtableFlushTask() override = default; + ~MemtableFlushTask() override { g_flush_task_num << -1; } void run() override { _flush_token->_flush_memtable(_memtable.get(), _segment_id, _submit_task_time); @@ -122,7 +130,8 @@ Status FlushToken::_do_flush_memtable(MemTable* memtable, int32_t segment_id, in return Status::OK(); } -void FlushToken::_flush_memtable(MemTable* memtable, int32_t segment_id, int64_t submit_task_time) { +void FlushToken::_flush_memtable(MemTable* mem_table, int32_t segment_id, + int64_t submit_task_time) { uint64_t flush_wait_time_ns = MonotonicNanos() - submit_task_time; _stats.flush_wait_time_ns += flush_wait_time_ns; // If previous flush has failed, return directly @@ -135,10 +144,10 @@ void FlushToken::_flush_memtable(MemTable* memtable, int32_t segment_id, int64_t MonotonicStopWatch timer; timer.start(); - size_t memory_usage = memtable->memory_usage(); + size_t memory_usage = mem_table->memory_usage(); int64_t flush_size; - Status s = _do_flush_memtable(memtable, segment_id, &flush_size); + Status s = _do_flush_memtable(mem_table, segment_id, &flush_size); { std::shared_lock rdlk(_flush_status_lock); @@ -161,7 +170,7 @@ void FlushToken::_flush_memtable(MemTable* memtable, int32_t segment_id, int64_t _stats.flush_time_ns += timer.elapsed_time(); _stats.flush_finish_count++; _stats.flush_running_count--; - _stats.flush_size_bytes += memtable->memory_usage(); + _stats.flush_size_bytes += mem_table->memory_usage(); _stats.flush_disk_size_bytes += flush_size; } @@ -180,6 +189,7 @@ void MemTableFlushExecutor::init(const std::vector& data_dirs) { .set_min_threads(min_threads) .set_max_threads(max_threads) .build(&_high_prio_flush_pool)); + _register_metrics(); } // NOTE: we use SERIAL mode here to ensure all mem-tables from one tablet are flushed in order. @@ -189,26 +199,38 @@ Status MemTableFlushExecutor::create_flush_token(std::unique_ptr& fl if (!is_high_priority) { if (rowset_writer->type() == BETA_ROWSET && !should_serial) { // beta rowset can be flush in CONCURRENT, because each memtable using a new segment writer. - flush_token.reset( - new FlushToken(_flush_pool->new_token(ThreadPool::ExecutionMode::CONCURRENT))); + flush_token = std::make_unique( + _flush_pool->new_token(ThreadPool::ExecutionMode::CONCURRENT)); } else { // alpha rowset do not support flush in CONCURRENT. - flush_token.reset( - new FlushToken(_flush_pool->new_token(ThreadPool::ExecutionMode::SERIAL))); + flush_token = std::make_unique( + _flush_pool->new_token(ThreadPool::ExecutionMode::SERIAL)); } } else { if (rowset_writer->type() == BETA_ROWSET && !should_serial) { // beta rowset can be flush in CONCURRENT, because each memtable using a new segment writer. 
- flush_token.reset(new FlushToken( - _high_prio_flush_pool->new_token(ThreadPool::ExecutionMode::CONCURRENT))); + flush_token = std::make_unique( + _high_prio_flush_pool->new_token(ThreadPool::ExecutionMode::CONCURRENT)); } else { // alpha rowset does not support CONCURRENT flush. - flush_token.reset(new FlushToken( - _high_prio_flush_pool->new_token(ThreadPool::ExecutionMode::SERIAL))); + flush_token = std::make_unique( + _high_prio_flush_pool->new_token(ThreadPool::ExecutionMode::SERIAL)); } } flush_token->set_rowset_writer(rowset_writer); return Status::OK(); } +void MemTableFlushExecutor::_register_metrics() { + REGISTER_HOOK_METRIC(flush_thread_pool_queue_size, + [this]() { return _flush_pool->get_queue_size(); }); + REGISTER_HOOK_METRIC(flush_thread_pool_thread_num, + [this]() { return _flush_pool->num_threads(); }); +} + +void MemTableFlushExecutor::_deregister_metrics() { + DEREGISTER_HOOK_METRIC(flush_thread_pool_queue_size); + DEREGISTER_HOOK_METRIC(flush_thread_pool_thread_num); +} + } // namespace doris diff --git a/be/src/olap/memtable_flush_executor.h b/be/src/olap/memtable_flush_executor.h index ee7194349f6d41..d2039ce8127690 100644 --- a/be/src/olap/memtable_flush_executor.h +++ b/be/src/olap/memtable_flush_executor.h @@ -17,9 +17,8 @@ #pragma once -#include - #include +#include #include #include #include @@ -108,8 +107,9 @@ class FlushToken { // ... class MemTableFlushExecutor { public: - MemTableFlushExecutor() {} + MemTableFlushExecutor() = default; ~MemTableFlushExecutor() { + _deregister_metrics(); _flush_pool->shutdown(); _high_prio_flush_pool->shutdown(); } @@ -122,6 +122,9 @@ class MemTableFlushExecutor { bool should_serial, bool is_high_priority); private: + void _register_metrics(); + static void _deregister_metrics(); + std::unique_ptr _flush_pool; std::unique_ptr _high_prio_flush_pool; }; diff --git a/be/src/olap/memtable_writer.cpp b/be/src/olap/memtable_writer.cpp index 2ef704f075a046..bd81fcd7852cc5 100644 --- a/be/src/olap/memtable_writer.cpp +++ b/be/src/olap/memtable_writer.cpp @@ -24,7 +24,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/logging.h" diff --git a/be/src/olap/memtable_writer.h b/be/src/olap/memtable_writer.h index 3491f72abd58a0..1893e2cd6a6aab 100644 --- a/be/src/olap/memtable_writer.h +++ b/be/src/olap/memtable_writer.h @@ -20,9 +20,9 @@ #include #include #include -#include #include +#include #include #include #include @@ -103,7 +103,7 @@ class MemTableWriter { // Wait for all memtables in the flush queue to be flushed Status wait_flush(); - int64_t tablet_id() { return _req.tablet_id; } + int64_t tablet_id() const { return _req.tablet_id; } int64_t total_received_rows() const { return _total_received_rows; } @@ -137,7 +137,7 @@ class MemTableWriter { std::mutex _lock; // total rows num written by MemTableWriter - int64_t _total_received_rows = 0; + std::atomic _total_received_rows = 0; int64_t _wait_flush_time_ns = 0; int64_t _close_wait_time_ns = 0; int64_t _segment_num = 0; diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h index 07a6d2964814de..1921902a9d038b 100644 --- a/be/src/olap/olap_common.h +++ b/be/src/olap/olap_common.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -380,11 +381,16 @@ struct RowsetId { int64_t mi = 0; int64_t lo = 0; - void init(const std::string& rowset_id_str) { + void init(std::string_view rowset_id_str) { // for a new rowset id it's a 48-character hex string // if the len < 48, then
it is an old format rowset id - if (rowset_id_str.length() < 48) { - int64_t high = std::stol(rowset_id_str, nullptr, 10); + if (rowset_id_str.length() < 48) [[unlikely]] { + int64_t high; + auto [_, ec] = std::from_chars(rowset_id_str.data(), + rowset_id_str.data() + rowset_id_str.length(), high); + if (ec != std::errc {}) [[unlikely]] { + LOG(FATAL) << "failed to init rowset id: " << rowset_id_str; + } init(1, high, 0, 0); } else { int64_t high = 0; @@ -460,6 +466,30 @@ struct HashOfRowsetId { using RowsetIdUnorderedSet = std::unordered_set; +// Extract rowset id from filename, return uninitialized rowset id if filename is invalid +inline RowsetId extract_rowset_id(std::string_view filename) { + RowsetId rowset_id; + if (filename.ends_with(".dat")) { + // filename format: {rowset_id}_{segment_num}.dat + auto end = filename.find('_'); + if (end == std::string::npos) { + return rowset_id; + } + rowset_id.init(filename.substr(0, end)); + return rowset_id; + } + if (filename.ends_with(".idx")) { + // filename format: {rowset_id}_{segment_num}_{index_id}.idx + auto end = filename.find('_'); + if (end == std::string::npos) { + return rowset_id; + } + rowset_id.init(filename.substr(0, end)); + return rowset_id; + } + return rowset_id; +} + class DeleteBitmap; // merge on write context struct MowContext { diff --git a/be/src/olap/olap_server.cpp b/be/src/olap/olap_server.cpp index f3e412325208fc..fa474a6ae60352 100644 --- a/be/src/olap/olap_server.cpp +++ b/be/src/olap/olap_server.cpp @@ -184,13 +184,6 @@ Status StorageEngine::start_bg_threads() { // path scan and gc thread if (config::path_gc_check) { for (auto data_dir : get_stores()) { - scoped_refptr path_scan_thread; - RETURN_IF_ERROR(Thread::create( - "StorageEngine", "path_scan_thread", - [this, data_dir]() { this->_path_scan_thread_callback(data_dir); }, - &path_scan_thread)); - _path_scan_threads.emplace_back(path_scan_thread); - scoped_refptr path_gc_thread; RETURN_IF_ERROR(Thread::create( "StorageEngine", "path_gc_thread", @@ -198,7 +191,7 @@ Status StorageEngine::start_bg_threads() { &path_gc_thread)); _path_gc_threads.emplace_back(path_gc_thread); } - LOG(INFO) << "path scan/gc threads started. number:" << get_stores().size(); + LOG(INFO) << "path gc threads started. number:" << get_stores().size(); } RETURN_IF_ERROR(ThreadPoolBuilder("CooldownTaskThreadPool") @@ -281,7 +274,7 @@ void StorageEngine::_garbage_sweeper_thread_callback() { double usage = 1.0; // After the program starts, the first round of cleaning starts after min_interval. uint32_t curr_interval = min_interval; - while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(curr_interval))) { + do { // Function properties: // when usage < 0.6, ratio is close to 1 (interval close to max_interval) // when usage is in [0.6, 0.75], ratio decreases rapidly from 0.87 to 0.27. @@ -305,7 +298,7 @@ << "see previous message for detail. err code=" << res; // do nothing. continue next loop.
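// Aside: moving wait_for() into the do/while condition above changes the ordering to
// "sweep first, then sleep": each iteration runs one gc round, recomputes curr_interval
// from disk usage, and then blocks on the stop latch for that interval. A compressed
// sketch of the control flow, assuming hypothetical sweep()/interval_for() helpers:
//
//     do {
//         double usage = current_disk_usage();
//         curr_interval = interval_for(usage); // high usage => short interval
//         sweep();
//     } while (!_stop_background_threads_latch.wait_for(
//             std::chrono::seconds(curr_interval)));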
} - } + } while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(curr_interval))); } void StorageEngine::_disk_stat_monitor_thread_callback() { @@ -370,24 +363,6 @@ void StorageEngine::_path_gc_thread_callback(DataDir* data_dir) { LOG(INFO) << "stop path gc thread!"; } -void StorageEngine::_path_scan_thread_callback(DataDir* data_dir) { - int32_t interval = config::path_scan_interval_second; - do { - LOG(INFO) << "try to perform path scan!"; - Status st = data_dir->perform_path_scan(); - if (!st) { - LOG(WARNING) << "path scan failed: " << st; - } - - interval = config::path_scan_interval_second; - if (interval <= 0) { - LOG(WARNING) << "path gc thread check interval config is illegal:" << interval - << "will be forced set to one day"; - interval = 24 * 3600; // one day - } - } while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(interval))); -} - void StorageEngine::_tablet_checkpoint_callback(const std::vector& data_dirs) { int64_t interval = config::generate_tablet_meta_checkpoint_tasks_interval_secs; do { diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp index 3b8fea52a8c4d5..cc39441220206d 100644 --- a/be/src/olap/primary_key_index.cpp +++ b/be/src/olap/primary_key_index.cpp @@ -21,7 +21,6 @@ #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "io/fs/file_writer.h" diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp index ee62c5baf6649c..de669de60399f0 100644 --- a/be/src/olap/push_handler.cpp +++ b/be/src/olap/push_handler.cpp @@ -35,13 +35,13 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/logging.h" #include "common/status.h" #include "olap/delete_handler.h" #include "olap/olap_define.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_writer.h" #include "olap/rowset/rowset_writer_context.h" @@ -195,8 +195,10 @@ Status PushHandler::_do_streaming_ingestion(TabletSharedPtr tablet, const TPushR rowset_to_add->rowset_meta()->set_delete_predicate(std::move(del_preds.front())); del_preds.pop(); } + // Transfer ownership of `PendingRowsetGuard` to `TxnManager` Status commit_status = StorageEngine::instance()->txn_manager()->commit_txn( - request.partition_id, *tablet, request.transaction_id, load_id, rowset_to_add, false); + request.partition_id, *tablet, request.transaction_id, load_id, rowset_to_add, + std::move(_pending_rs_guard), false); if (!commit_status.ok() && !commit_status.is()) { res = std::move(commit_status); } @@ -220,7 +222,6 @@ Status PushHandler::_convert_v2(TabletSharedPtr cur_tablet, RowsetSharedPtr* cur // although the spark load output files are fully sorted, // this depends on the third-party implementation, so we conservatively // set this value to OVERLAP_UNKNOWN - std::unique_ptr rowset_writer; RowsetWriterContext context; context.txn_id = _request.transaction_id; context.load_id = load_id; @@ -228,12 +229,9 @@ context.segments_overlap = OVERLAP_UNKNOWN; context.tablet_schema = tablet_schema; context.newest_write_timestamp = UnixSeconds(); - res = cur_tablet->create_rowset_writer(context, &rowset_writer); - if (!res.ok()) { - LOG(WARNING) << "failed to init rowset writer, tablet=" << cur_tablet->tablet_id() << ", txn_id=" << _request.transaction_id << ", res=" << res;
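// Aside on _pending_rs_guard, used just below and in the compaction and data_dir hunks
// above: PendingRowsetGuard acts as an RAII registration. While the guard is alive the
// rowset id stays in StorageEngine::pending_local_rowsets(), so should_reclaim() in
// data_dir.cpp treats the rowset's files as in use; moving the guard into TxnManager via
// commit_txn() extends that protection past the writer's scope. Sketch of the idea:
//
//     {
//         auto guard = StorageEngine::instance()->pending_local_rowsets().add(
//                 ctx.rowset_id);
//         // ... write segment files; path gc will not delete them here ...
//         RETURN_IF_ERROR(txn_mgr->commit_txn(..., std::move(guard), ...));
//     } // if not moved, destruction unregisters the id and the files
//       // become gc-eligible again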
- break; - } + auto rowset_writer = DORIS_TRY(cur_tablet->create_rowset_writer(context, false)); + _pending_rs_guard = + StorageEngine::instance()->pending_local_rowsets().add(context.rowset_id); // 2. Init PushBrokerReader to read the broker file if it exists, // in case of empty push this will be skipped. diff --git a/be/src/olap/push_handler.h b/be/src/olap/push_handler.h index f1f931de8653ed..46e8d47b10fa5b 100644 --- a/be/src/olap/push_handler.h +++ b/be/src/olap/push_handler.h @@ -31,6 +31,7 @@ #include "common/status.h" #include "exec/olap_common.h" #include "olap/olap_common.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset.h" #include "olap/tablet.h" #include "olap/tablet_schema.h" @@ -83,6 +84,7 @@ class PushHandler { int64_t _write_bytes = 0; int64_t _write_rows = 0; + PendingRowsetGuard _pending_rs_guard; DISALLOW_COPY_AND_ASSIGN(PushHandler); }; diff --git a/be/src/olap/reader.cpp b/be/src/olap/reader.cpp index e571b9d1c32fae..dfc99e58ce1920 100644 --- a/be/src/olap/reader.cpp +++ b/be/src/olap/reader.cpp @@ -28,7 +28,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/logging.h" @@ -662,7 +661,7 @@ Status TabletReader::init_reader_params_and_create_block( std::transform(input_rowsets.begin(), input_rowsets.end(), rowset_metas.begin(), [](const RowsetSharedPtr& rowset) { return rowset->rowset_meta(); }); TabletSchemaSPtr read_tablet_schema = - tablet->rowset_meta_with_max_schema_version(rowset_metas)->tablet_schema(); + tablet->tablet_schema_with_merged_max_schema_version(rowset_metas); TabletSchemaSPtr merge_tablet_schema = std::make_shared(); merge_tablet_schema->copy_from(*read_tablet_schema); diff --git a/be/src/olap/rowset/beta_rowset.cpp b/be/src/olap/rowset/beta_rowset.cpp index 550698d397aead..b332d0e212d0fd 100644 --- a/be/src/olap/rowset/beta_rowset.cpp +++ b/be/src/olap/rowset/beta_rowset.cpp @@ -46,7 +46,7 @@ namespace doris { using namespace ErrorCode; -std::string BetaRowset::segment_file_path(int segment_id) { +std::string BetaRowset::segment_file_path(int segment_id) const { return segment_file_path(_rowset_dir, rowset_id(), segment_id); } diff --git a/be/src/olap/rowset/beta_rowset.h b/be/src/olap/rowset/beta_rowset.h index 372431cb4bae13..d404be13ea74a3 100644 --- a/be/src/olap/rowset/beta_rowset.h +++ b/be/src/olap/rowset/beta_rowset.h @@ -45,11 +45,11 @@ using BetaRowsetSharedPtr = std::shared_ptr; class BetaRowset final : public Rowset { public: - virtual ~BetaRowset(); + ~BetaRowset() override; Status create_reader(RowsetReaderSharedPtr* result) override; - std::string segment_file_path(int segment_id); + std::string segment_file_path(int segment_id) const; static std::string segment_file_path(const std::string& rowset_dir, const RowsetId& rowset_id, int segment_id); diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index c4a8dbb568a64a..f3db8689a3e5d8 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -28,7 +28,6 @@ #include #include -// IWYU pragma: no_include #include "cloud/config.h" #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" @@ -49,11 +48,12 @@ #include "olap/storage_engine.h" #include "olap/tablet_schema.h" #include "runtime/thread_context.h" +#include "util/debug_points.h" #include "util/slice.h" #include "util/time.h" #include "vec/columns/column.h" #include "vec/columns/column_object.h"
-#include "vec/common/schema_util.h" // LocalSchemaChangeRecorder +#include "vec/common/schema_util.h" // variant column #include "vec/core/block.h" #include "vec/data_types/data_type_factory.hpp" @@ -122,8 +122,6 @@ Status BetaRowsetWriter::init(const RowsetWriterContext& rowset_writer_context) } _rowset_meta->set_tablet_uid(_context.tablet_uid); _rowset_meta->set_tablet_schema(_context.tablet_schema); - _context.schema_change_recorder = - std::make_shared(); _context.segment_collector = std::make_shared>(this); _context.file_writer_creator = std::make_shared>(this); RETURN_IF_ERROR(_segment_creator.init(_context)); @@ -445,6 +443,10 @@ Status BetaRowsetWriter::flush_memtable(vectorized::Block* block, int32_t segmen } TabletSchemaSPtr flush_schema; + if (_context.tablet_schema->num_variant_columns() > 0) { + // Unfold variant column + RETURN_IF_ERROR(expand_variant_to_subcolumns(*block, flush_schema)); + } { SCOPED_RAW_TIMER(&_segment_writer_ns); RETURN_IF_ERROR( @@ -521,6 +523,11 @@ Status BetaRowsetWriter::build(RowsetSharedPtr& rowset) { _rowset_meta->set_newest_write_timestamp(UnixSeconds()); } + // update rowset meta tablet schema if tablet schema updated + if (_context.tablet_schema->num_variant_columns() > 0) { + _rowset_meta->set_tablet_schema(_context.tablet_schema); + } + RETURN_NOT_OK_STATUS_WITH_WARN( RowsetFactory::create_rowset(_context.tablet_schema, _context.rowset_dir, _rowset_meta, &rowset), @@ -543,6 +550,24 @@ bool BetaRowsetWriter::_is_segment_overlapping( return false; } +// update tablet schema when meet variant columns, before commit_txn +// Eg. rowset schema: A(int), B(float), C(int), D(int) +// _tabelt->tablet_schema: A(bigint), B(double) +// => update_schema: A(bigint), B(double), C(int), D(int) +void BetaRowsetWriter::update_rowset_schema(TabletSchemaSPtr flush_schema) { + std::lock_guard lock(*(_context.schema_lock)); + TabletSchemaSPtr update_schema = std::make_shared(); + vectorized::schema_util::get_least_common_schema({_context.tablet_schema, flush_schema}, + update_schema); + CHECK_GE(update_schema->num_columns(), flush_schema->num_columns()) + << "Rowset merge schema columns count is " << update_schema->num_columns() + << ", but flush_schema is larger " << flush_schema->num_columns() + << " update_schema: " << update_schema->dump_structure() + << " flush_schema: " << flush_schema->dump_structure(); + _context.tablet_schema.swap(update_schema); + VLOG_DEBUG << "dump rs schema: " << _context.tablet_schema->dump_structure(); +} + void BetaRowsetWriter::_build_rowset_meta_with_spec_field( RowsetMetaSharedPtr rowset_meta, const RowsetMetaSharedPtr& spec_rowset_meta) { rowset_meta->set_num_rows(spec_rowset_meta->num_rows()); @@ -677,6 +702,8 @@ Status BetaRowsetWriter::_create_segment_writer_for_segcompaction( Status BetaRowsetWriter::_check_segment_number_limit() { size_t total_segment_num = _num_segment - _segcompacted_point + 1 + _num_segcompacted; + DBUG_EXECUTE_IF("BetaRowsetWriter._check_segment_number_limit_too_many_segments", + { total_segment_num = dp->param("segnum", 1024); }); if (UNLIKELY(total_segment_num > config::max_segment_num_per_rowset)) { return Status::Error( "too many segments in rowset. 
+ void BetaRowsetWriter::_build_rowset_meta_with_spec_field( RowsetMetaSharedPtr rowset_meta, const RowsetMetaSharedPtr& spec_rowset_meta) { rowset_meta->set_num_rows(spec_rowset_meta->num_rows()); @@ -677,6 +702,8 @@ Status BetaRowsetWriter::_create_segment_writer_for_segcompaction( Status BetaRowsetWriter::_check_segment_number_limit() { size_t total_segment_num = _num_segment - _segcompacted_point + 1 + _num_segcompacted; + DBUG_EXECUTE_IF("BetaRowsetWriter._check_segment_number_limit_too_many_segments", + { total_segment_num = dp->param("segnum", 1024); }); if (UNLIKELY(total_segment_num > config::max_segment_num_per_rowset)) { return Status::Error( "too many segments in rowset. tablet_id:{}, rowset_id:{}, max:{}, _num_segment:{}, " @@ -688,7 +715,7 @@ Status BetaRowsetWriter::_check_segment_number_limit() { return Status::OK(); } -Status BetaRowsetWriter::add_segment(uint32_t segment_id, SegmentStatistics& segstat) { +Status BetaRowsetWriter::add_segment(uint32_t segment_id, const SegmentStatistics& segstat) { uint32_t segid_offset = segment_id - _segment_start_id; { std::lock_guard lock(_segid_statistics_map_mutex); @@ -748,4 +775,113 @@ Status BetaRowsetWriter::flush_segment_writer_for_segcompaction( return Status::OK(); } +Status BetaRowsetWriter::expand_variant_to_subcolumns(vectorized::Block& block, + TabletSchemaSPtr& flush_schema) { + size_t num_rows = block.rows(); + if (num_rows == 0) { + return Status::OK(); + } + + std::vector<int> variant_column_pos; + if (is_partial_update()) { + // check that the columns used to do partial updates do not include variant + for (int i : get_partial_update_info()->update_cids) { + if (_context.tablet_schema->columns()[i].is_variant_type()) { + return Status::InvalidArgument("Not implement partial updates for variant"); + } + } + } else { + for (int i = 0; i < _context.tablet_schema->columns().size(); ++i) { + if (_context.tablet_schema->columns()[i].is_variant_type()) { + variant_column_pos.push_back(i); + } + } + } + + if (variant_column_pos.empty()) { + return Status::OK(); + } + + try { + // Parse each variant column from the raw string column + vectorized::schema_util::parse_variant_columns(block, variant_column_pos); + vectorized::schema_util::finalize_variant_columns(block, variant_column_pos, + false /*not ignore sparse*/); + vectorized::schema_util::encode_variant_sparse_subcolumns(block, variant_column_pos); + } catch (const doris::Exception& e) { + // TODO more graceful, max_filter_ratio + LOG(WARNING) << "encounter exception " << e.to_string(); + return Status::InternalError(e.to_string()); + } + + // A dynamic Block consists of two parts: a static part of columns and a dynamic (extracted) part of columns + // static extracted + // | --------- | ----------- | + // The static ones are the original _tablet_schema columns + flush_schema = std::make_shared<TabletSchema>(); + flush_schema->copy_from(*_context.tablet_schema); + vectorized::Block flush_block(std::move(block)); + + // If the column already exists in the original tablet schema, we pick the common type, + // cast the column to that common type, and modify the tablet column to the common type; + // otherwise it's a new column that we should add to the frontend + auto append_column = [&](const TabletColumn& parent_variant, auto& column_entry_from_object) { + const std::string& column_name = + parent_variant.name_lower_case() + "." + column_entry_from_object->path.get_path(); + const vectorized::DataTypePtr& final_data_type_from_object = + column_entry_from_object->data.get_least_common_type(); + TabletColumn tablet_column; + vectorized::PathInDataBuilder full_path_builder; + auto full_path = full_path_builder.append(parent_variant.name_lower_case(), false) + .append(column_entry_from_object->path.get_parts(), false) + .build(); + vectorized::schema_util::get_column_by_type( + final_data_type_from_object, column_name, tablet_column, + vectorized::schema_util::ExtraInfo {.unique_id = -1, + .parent_unique_id = parent_variant.unique_id(), + .path_info = full_path}); + flush_schema->append_column(std::move(tablet_column)); + flush_block.insert({column_entry_from_object->data.get_finalized_column_ptr()->get_ptr(), + final_data_type_from_object, column_name}); + };
+ // 1. Flatten variant columns into flat columns, appending the flattened columns to the back of the original Block and TabletSchema; + // those columns are the extracted columns, leaving the non-extracted columns in the original variant column, which is + // in JSONB format at present. + // 2. Collect columns that need to be added or modified when data types change or new columns are encountered + for (size_t i = 0; i < variant_column_pos.size(); ++i) { + size_t variant_pos = variant_column_pos[i]; + vectorized::ColumnObject& object_column = assert_cast<vectorized::ColumnObject&>( + flush_block.get_by_position(variant_pos).column->assume_mutable_ref()); + const TabletColumn& parent_column = _context.tablet_schema->columns()[variant_pos]; + CHECK(object_column.is_finalized()); + std::shared_ptr root; + for (auto& entry : object_column.get_subcolumns()) { + if (entry->path.empty()) { + // root + root = entry; + continue; + } + append_column(parent_column, entry); + } + // Create a new variant column and set its root column + auto obj = vectorized::ColumnObject::create(true, false); + // '{}' indicates a root path + static_cast<vectorized::ColumnObject*>(obj.get())->add_sub_column( + {}, root->data.get_finalized_column_ptr()->assume_mutable(), + root->data.get_least_common_type()); + flush_block.get_by_position(variant_pos).column = obj->get_ptr(); + vectorized::PathInDataBuilder full_root_path_builder; + auto full_root_path = + full_root_path_builder.append(parent_column.name_lower_case(), false).build(); + flush_schema->mutable_columns()[variant_pos].set_path_info(full_root_path); + VLOG_DEBUG << "set root_path : " << full_root_path.get_path(); + } + update_rowset_schema(flush_schema); + block.swap(flush_block); + VLOG_DEBUG << "dump block: " << block.dump_data(); + VLOG_DEBUG << "dump flush schema: " << flush_schema->dump_structure(); + return Status::OK(); +} + } // namespace doris
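// ---------------------------------------------------------------------------
// Editor's note: expand_variant_to_subcolumns above turns a block shaped like
// [A | B | C | V] into [A | B | C | V | V.d | V.e | ...], one flat column per
// extracted path, with non-extracted leftovers kept in the variant root. A toy
// illustration of that flattening follows; Row and FlatColumns are
// hypothetical stand-ins for ColumnObject subcolumns, not the real vectorized
// API.
// ---------------------------------------------------------------------------
#include <map>
#include <optional>
#include <string>
#include <vector>

namespace sketch {

// One variant value per row: path -> scalar (stringly typed for brevity).
using Row = std::map<std::string, std::string>;
// One flat column per path, padded with null where a row lacks the path.
using FlatColumns = std::map<std::string, std::vector<std::optional<std::string>>>;

// Subcolumn names are parent + "." + path, as built by append_column above.
FlatColumns flatten_variant(const std::string& parent, const std::vector<Row>& variant) {
    FlatColumns out;
    // Pass 1: discover every path so each flat column gets one slot per row.
    for (const Row& row : variant) {
        for (const auto& [path, value] : row) {
            (void)value;
            out[parent + "." + path];
        }
    }
    // Pass 2: fill values row by row, padding missing paths with null.
    for (const Row& row : variant) {
        for (auto& [name, column] : out) {
            auto it = row.find(name.substr(parent.size() + 1));
            column.push_back(it == row.end() ? std::nullopt
                                             : std::optional<std::string>(it->second));
        }
    }
    return out;
}

// Rows {d:1, e:2} and {d:3, f:4} become v.d=[1,3], v.e=[2,null], v.f=[null,4];
// update_rowset_schema then merges these new subcolumns into the rowset schema.

} // namespace sketch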
diff --git a/be/src/olap/rowset/beta_rowset_writer.h b/be/src/olap/rowset/beta_rowset_writer.h index 9c615048a7fdc2..1821b670037d53 100644 --- a/be/src/olap/rowset/beta_rowset_writer.h +++ b/be/src/olap/rowset/beta_rowset_writer.h @@ -78,7 +78,7 @@ class BetaRowsetWriter : public RowsetWriter { Status create_file_writer(uint32_t segment_id, io::FileWriterPtr& writer) override; - Status add_segment(uint32_t segment_id, SegmentStatistics& segstat) override; + Status add_segment(uint32_t segment_id, const SegmentStatistics& segstat) override; Status flush() override; @@ -138,6 +138,8 @@ class BetaRowsetWriter : public RowsetWriter { return _context.partial_update_info && _context.partial_update_info->is_partial_update; } + const RowsetWriterContext& context() const override { return _context; } + private: Status _create_file_writer(std::string path, io::FileWriterPtr& file_writer); Status _check_segment_number_limit(); @@ -163,6 +165,14 @@ class BetaRowsetWriter : public RowsetWriter { Status _rename_compacted_segment_plain(uint64_t seg_id); Status _rename_compacted_indices(int64_t begin, int64_t end, uint64_t seg_id); + // Unfold a variant column into the Block + // Eg. [A | B | C | (D, E, F)] + // After unfolding, the block structure changes to -> [A | B | C | D | E | F] + // The expanded D, E, F are the dynamic part of the block + // The flushed Block's columns must exactly match the column types in the frontend meta + Status expand_variant_to_subcolumns(vectorized::Block& block, TabletSchemaSPtr& flush_schema); + void update_rowset_schema(TabletSchemaSPtr flush_schema); + // build a tmp rowset for load segment to calc delete_bitmap // for this segment RowsetSharedPtr _build_tmp(); diff --git a/be/src/olap/rowset/beta_rowset_writer_v2.cpp b/be/src/olap/rowset/beta_rowset_writer_v2.cpp index abd7af2aabaf6b..da3e4bc8861f94 100644 --- a/be/src/olap/rowset/beta_rowset_writer_v2.cpp +++ b/be/src/olap/rowset/beta_rowset_writer_v2.cpp @@ -28,7 +28,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/logging.h" @@ -72,7 +71,7 @@ Status BetaRowsetWriterV2::init(const RowsetWriterContext& rowset_writer_context _context = rowset_writer_context; _context.segment_collector = std::make_shared>(this); _context.file_writer_creator = std::make_shared>(this); - static_cast(_segment_creator.init(_context)); + RETURN_IF_ERROR(_segment_creator.init(_context)); return Status::OK(); } @@ -88,7 +87,7 @@ Status BetaRowsetWriterV2::create_file_writer(uint32_t segment_id, io::FileWrite return Status::OK(); } -Status BetaRowsetWriterV2::add_segment(uint32_t segment_id, SegmentStatistics& segstat) { +Status BetaRowsetWriterV2::add_segment(uint32_t segment_id, const SegmentStatistics& segstat) { for (const auto& stream : _streams) { RETURN_IF_ERROR(stream->add_segment(_context.partition_id, _context.index_id, _context.tablet_id, segment_id, segstat)); diff --git a/be/src/olap/rowset/beta_rowset_writer_v2.h b/be/src/olap/rowset/beta_rowset_writer_v2.h index d35f4b2486f31a..d025383bee650a 100644 --- a/be/src/olap/rowset/beta_rowset_writer_v2.h +++ b/be/src/olap/rowset/beta_rowset_writer_v2.h @@ -104,6 +104,8 @@ class BetaRowsetWriterV2 : public RowsetWriter { PUniqueId load_id() override { return _context.load_id; } + const RowsetWriterContext& context() const override { return _context; } + Version version() override { return _context.version; } int64_t num_rows() const override { return _segment_creator.num_rows_written(); } @@ -120,7 +122,7 @@ class BetaRowsetWriterV2 : public RowsetWriter { return Status::OK(); } - Status add_segment(uint32_t segment_id, SegmentStatistics& segstat) override; + Status add_segment(uint32_t segment_id, const SegmentStatistics& segstat) override; int32_t allocate_segment_id() override { return _next_segment_id.fetch_add(1); }; diff --git a/be/src/olap/rowset/pending_rowset_helper.cpp b/be/src/olap/rowset/pending_rowset_helper.cpp new file mode 100644 index 00000000000000..3e976344f6f038 --- /dev/null +++ b/be/src/olap/rowset/pending_rowset_helper.cpp @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/pending_rowset_helper.h" + +namespace doris { + +PendingRowsetGuard::~PendingRowsetGuard() { + if (_pending_rowset_set) { + _pending_rowset_set->remove(_rowset_id); + } +} + +PendingRowsetGuard::PendingRowsetGuard(const RowsetId& rowset_id, PendingRowsetSet* set) + : _rowset_id(rowset_id), _pending_rowset_set(set) {} + +bool PendingRowsetSet::contains(const RowsetId& rowset_id) { + std::lock_guard lock(_mtx); + return _set.contains(rowset_id); +} + +PendingRowsetGuard PendingRowsetSet::add(const RowsetId& rowset_id) { + { + std::lock_guard lock(_mtx); + _set.insert(rowset_id); + } + return PendingRowsetGuard {rowset_id, this}; +} + +void PendingRowsetSet::remove(const RowsetId& rowset_id) { + std::lock_guard lock(_mtx); + _set.erase(rowset_id); +} + +} // namespace doris diff --git a/be/src/olap/rowset/pending_rowset_helper.h b/be/src/olap/rowset/pending_rowset_helper.h new file mode 100644 index 00000000000000..4852de5ee7d6cb --- /dev/null +++ b/be/src/olap/rowset/pending_rowset_helper.h @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
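// ---------------------------------------------------------------------------
// Editor's note: the new helper above ties each in-flight rowset id to a
// move-only RAII guard so that garbage collection can recognize rowsets that
// exist on disk but are not yet attached to a tablet. A compressed,
// self-contained sketch of the pattern and its intended lifetime follows;
// PendingSet and Guard are toy stand-ins, and ids are ints instead of RowsetId.
// ---------------------------------------------------------------------------
#include <cassert>
#include <mutex>
#include <unordered_set>

namespace sketch {

class PendingSet;

class [[nodiscard]] Guard {
public:
    Guard() = default;
    Guard(Guard&& other) noexcept : _id(other._id), _set(other._set) { other._set = nullptr; }
    ~Guard(); // removes the id from the set; defined after PendingSet

private:
    friend class PendingSet;
    Guard(int id, PendingSet* set) : _id(id), _set(set) {}
    int _id = 0;
    PendingSet* _set = nullptr;
};

class PendingSet {
public:
    Guard add(int id) {
        std::lock_guard<std::mutex> lock(_mtx);
        _ids.insert(id);
        return Guard(id, this);
    }
    bool contains(int id) {
        std::lock_guard<std::mutex> lock(_mtx);
        return _ids.count(id) != 0;
    }

private:
    friend class Guard;
    void remove(int id) {
        std::lock_guard<std::mutex> lock(_mtx);
        _ids.erase(id);
    }
    std::mutex _mtx;
    std::unordered_set<int> _ids;
};

Guard::~Guard() {
    if (_set != nullptr) _set->remove(_id);
}

// Intended call pattern, mirroring the push_handler.cpp hunk earlier in this
// diff: register before the first row is written, and hold the guard until the
// rowset has been added to the tablet.
inline void write_rowset(PendingSet& pending) {
    Guard guard = pending.add(42); // before writing any data
    assert(pending.contains(42));  // GC skips rowset 42 while the guard lives
    // ... build segments, commit the transaction, add the rowset to the tablet ...
}                                  // guard destroyed -> id released for GC

} // namespace sketch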
+ +#pragma once + +#include + +#include "olap/olap_common.h" + +namespace doris { + +class PendingRowsetSet; + +class [[nodiscard]] PendingRowsetGuard { +public: + PendingRowsetGuard() = default; + + // Remove guarded rowset id from `PendingRowsetSet` if it's initialized + ~PendingRowsetGuard(); + + PendingRowsetGuard(const PendingRowsetGuard&) = delete; + PendingRowsetGuard& operator=(const PendingRowsetGuard&) = delete; + + PendingRowsetGuard(PendingRowsetGuard&& other) noexcept { + CHECK(!_pending_rowset_set || + (_rowset_id == other._rowset_id && _pending_rowset_set == other._pending_rowset_set)); + _rowset_id = other._rowset_id; + _pending_rowset_set = other._pending_rowset_set; + other._pending_rowset_set = nullptr; + } + PendingRowsetGuard& operator=(PendingRowsetGuard&& other) noexcept { + CHECK(!_pending_rowset_set || + (_rowset_id == other._rowset_id && _pending_rowset_set == other._pending_rowset_set)); + _rowset_id = other._rowset_id; + _pending_rowset_set = other._pending_rowset_set; + other._pending_rowset_set = nullptr; + return *this; + } + +private: + friend class PendingRowsetSet; + explicit PendingRowsetGuard(const RowsetId& rowset_id, PendingRowsetSet* set); + + RowsetId _rowset_id; + PendingRowsetSet* _pending_rowset_set {nullptr}; +}; + +// Pending rowsets refer to those rowsets that are under construction, and have not been added to +// the tablet. BE storage engine will skip these pending rowsets during garbage collection. +class PendingRowsetSet { +public: + bool contains(const RowsetId& rowset_id); + + // Developers MUST add the rowset id to `PendingRowsetSet` before writing first row data to a + // local rowset, and hold returned `PendingRowsetGuard` during rowset construction before + // adding the rowset to tablet. rowset id will be removed from `PendingRowsetSet` eventually + // when `PendingRowsetGuard` is destroyed. 
+ PendingRowsetGuard add(const RowsetId& rowset_id); + +private: + friend class PendingRowsetGuard; + void remove(const RowsetId& rowset_id); + + // TODO(plat1ko): Use high concurrency data structure + std::mutex _mtx; + RowsetIdUnorderedSet _set; +}; + +} // namespace doris diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h index 57f110733dd7ec..da4a368c2be4eb 100644 --- a/be/src/olap/rowset/rowset.h +++ b/be/src/olap/rowset/rowset.h @@ -139,7 +139,7 @@ class Rowset : public std::enable_shared_from_this { // publish rowset to make it visible to read void make_visible(Version version); - TabletSchemaSPtr tablet_schema() { return _schema; } + const TabletSchemaSPtr& tablet_schema() { return _schema; } // helper class to access RowsetMeta int64_t start_version() const { return rowset_meta()->version().first; } diff --git a/be/src/olap/rowset/rowset_id_generator.h b/be/src/olap/rowset/rowset_id_generator.h index 39d593ea8169b7..43be640ffc6181 100644 --- a/be/src/olap/rowset/rowset_id_generator.h +++ b/be/src/olap/rowset/rowset_id_generator.h @@ -17,32 +17,20 @@ #pragma once -#include - #include "olap/olap_common.h" -#include "olap/olap_define.h" namespace doris { -class OlapMeta; - // all implementations must be thread-safe class RowsetIdGenerator { public: - RowsetIdGenerator() {} - virtual ~RowsetIdGenerator() {} + RowsetIdGenerator() = default; + virtual ~RowsetIdGenerator() = default; + RowsetIdGenerator(const RowsetIdGenerator&) = delete; + RowsetIdGenerator& operator=(const RowsetIdGenerator&) = delete; // generate and return the next global unique rowset id virtual RowsetId next_id() = 0; - - // check whether the rowset id is useful or validate - // for example, during gc logic, gc thread finds a file - // and it could not find it under rowset list. but it maybe in use - // during load procedure. Gc thread will check it using this method. - virtual bool id_in_use(const RowsetId& rowset_id) const = 0; - - // remove the rowsetid from useful rowsetid list. - virtual void release_id(const RowsetId& rowset_id) = 0; }; // RowsetIdGenerator } // namespace doris diff --git a/be/src/olap/rowset/rowset_tree.cpp b/be/src/olap/rowset/rowset_tree.cpp deleted file mode 100644 index 8ac153435fc644..00000000000000 --- a/be/src/olap/rowset/rowset_tree.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-// -// This file is copied from -// https://github.com/apache/kudu/blob/master/src/kudu/tablet/rowset_tree.cc -// and modified by Doris - -#include "olap/rowset/rowset_tree.h" - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gutil/stl_util.h" -#include "gutil/strings/substitute.h" -#include "olap/rowset/rowset.h" -#include "olap/rowset/rowset_meta.h" -#include "util/interval_tree-inl.h" -#include "util/interval_tree.h" -#include "util/slice.h" - -using std::shared_ptr; -using std::string; -using std::unique_ptr; -using std::vector; - -namespace doris { - -namespace { - -// Lexicographic, first by slice, then by rowset pointer, then by segment id, then by start/stop -bool RSEndpointBySliceCompare(const RowsetTree::RSEndpoint& a, const RowsetTree::RSEndpoint& b) { - int slice_cmp = a.slice_.compare(b.slice_); - if (slice_cmp) return slice_cmp < 0; - ptrdiff_t rs_cmp = a.rowset_.get() - b.rowset_.get(); - if (rs_cmp) return rs_cmp < 0; - int seg_cmp = a.segment_id_ < b.segment_id_; - if (seg_cmp) return seg_cmp < 0; - if (a.endpoint_ != b.endpoint_) return a.endpoint_ == RowsetTree::START; - return false; -} - -// Wrapper used when making batch queries into the interval tree. -struct QueryStruct { - // The slice of the operation performing the query. - Slice slice; - // The original index of this slice in the incoming batch. - int idx; -}; - -} // anonymous namespace - -// Entry for use in the interval tree. -struct RowsetWithBounds { - string min_key; - string max_key; - - // NOTE: the ordering of struct fields here is purposeful: we access - // min_key and max_key frequently, so putting them first in the struct - // ensures they fill a single 64-byte cache line (each is 32 bytes). - // The 'rowset' pointer is accessed comparitively rarely. - RowsetSharedPtr rowset; - int32_t segment_id; -}; - -// Traits struct for IntervalTree. -struct RowsetIntervalTraits { - typedef Slice point_type; - typedef RowsetWithBounds* interval_type; - - static Slice get_left(const RowsetWithBounds* rs) { return Slice(rs->min_key); } - - static Slice get_right(const RowsetWithBounds* rs) { return Slice(rs->max_key); } - - static int compare(const Slice& a, const Slice& b) { return a.compare(b); } - - static int compare(const Slice& a, const QueryStruct& b) { return a.compare(b.slice); } - - static int compare(const QueryStruct& a, const Slice& b) { return -compare(b, a); } - - // When 'a' is std::nullopt: - // (1)'a' is +OO when 'positive_direction' is true; - // (2)'a' is -OO when 'positive_direction' is false. - static int compare(const std::optional& a, const Slice& b, const EndpointIfNone& type) { - if (a == std::nullopt) { - return ((POSITIVE_INFINITY == type) ? 1 : -1); - } - - return compare(*a, b); - } - - static int compare(const Slice& a, const std::optional& b, const EndpointIfNone& type) { - return -compare(b, a, type); - } -}; - -RowsetTree::RowsetTree() : initted_(false) {} - -Status RowsetTree::Init(const RowsetVector& rowsets) { - if (initted_) { - return Status::InternalError("Call Init method on a RowsetTree that's already inited!"); - } - std::vector entries; - ElementDeleter deleter(&entries); - entries.reserve(rowsets.size()); - std::vector endpoints; - endpoints.reserve(rowsets.size() * 2); - - // Iterate over each of the provided Rowsets, fetching their - // bounds and adding them to the local vectors. 
- for (const RowsetSharedPtr& rs : rowsets) { - std::vector segments_key_bounds; - Status s = rs->get_segments_key_bounds(&segments_key_bounds); - if (!s.ok()) { - LOG(WARNING) << "Unable to construct RowsetTree: " << rs->rowset_id() - << " unable to determine its bounds: " << s.to_string(); - return s; - } - DCHECK_EQ(segments_key_bounds.size(), rs->num_segments()); - - for (auto i = 0; i < rs->num_segments(); i++) { - unique_ptr rsit(new RowsetWithBounds()); - rsit->rowset = rs; - rsit->segment_id = i; - string min_key = segments_key_bounds[i].min_key(); - string max_key = segments_key_bounds[i].max_key(); - DCHECK_LE(min_key.compare(max_key), 0) - << "Rowset min: " << min_key << " must be <= max: " << max_key; - - // Load bounds and save entry - rsit->min_key = std::move(min_key); - rsit->max_key = std::move(max_key); - - // Load into key endpoints. - endpoints.emplace_back(rsit->rowset, i, START, rsit->min_key); - endpoints.emplace_back(rsit->rowset, i, STOP, rsit->max_key); - - entries.push_back(rsit.release()); - } - } - - // Sort endpoints - std::sort(endpoints.begin(), endpoints.end(), RSEndpointBySliceCompare); - - // Install the vectors into the object. - entries_.swap(entries); - tree_.reset(new IntervalTree(entries_)); - key_endpoints_.swap(endpoints); - all_rowsets_.assign(rowsets.begin(), rowsets.end()); - - // Build the mapping from RS_ID to RS. - rs_by_id_.clear(); - for (auto& rs : all_rowsets_) { - if (!rs_by_id_.insert({rs->rowset_id(), rs}).second) { - return Status::InternalError(strings::Substitute( - "Add rowset with $0 to rowset tree of tablet $1 failed", - rs->rowset_id().to_string(), rs->rowset_meta()->tablet_uid().to_string())); - } - } - - initted_ = true; - - return Status::OK(); -} - -void RowsetTree::FindRowsetsIntersectingInterval( - const std::optional& lower_bound, const std::optional& upper_bound, - vector>* rowsets) const { - DCHECK(initted_); - - vector from_tree; - from_tree.reserve(all_rowsets_.size()); - tree_->FindIntersectingInterval(lower_bound, upper_bound, &from_tree); - rowsets->reserve(rowsets->size() + from_tree.size()); - for (RowsetWithBounds* rs : from_tree) { - rowsets->emplace_back(rs->rowset, rs->segment_id); - } -} - -void RowsetTree::FindRowsetsWithKeyInRange( - const Slice& encoded_key, const RowsetIdUnorderedSet* rowset_ids, - vector>* rowsets) const { - DCHECK(initted_); - - // Query the interval tree to efficiently find rowsets with known bounds - // whose ranges overlap the probe key. - vector from_tree; - from_tree.reserve(all_rowsets_.size()); - tree_->FindContainingPoint(encoded_key, &from_tree); - rowsets->reserve(rowsets->size() + from_tree.size()); - for (RowsetWithBounds* rs : from_tree) { - if (!rowset_ids || rowset_ids->find(rs->rowset->rowset_id()) != rowset_ids->end()) { - rowsets->emplace_back(rs->rowset, rs->segment_id); - } - } -} - -void RowsetTree::ForEachRowsetContainingKeys( - const std::vector& encoded_keys, - const std::function& cb) const { - DCHECK(std::is_sorted(encoded_keys.cbegin(), encoded_keys.cend(), Slice::Comparator())); - // The interval tree batch query callback would naturally just give us back - // the matching Slices, but that won't allow us to easily tell the caller - // which specific operation _index_ matched the Rowset. So, we make a vector - // of QueryStructs to pair the Slice with its original index. 
- vector queries; - queries.resize(encoded_keys.size()); - for (int i = 0; i < encoded_keys.size(); i++) { - queries[i] = {encoded_keys[i], i}; - } - - tree_->ForEachIntervalContainingPoints( - queries, [&](const QueryStruct& qs, RowsetWithBounds* rs) { cb(rs->rowset, qs.idx); }); -} - -RowsetTree::~RowsetTree() { - for (RowsetWithBounds* e : entries_) { - delete e; - } - entries_.clear(); -} - -void ModifyRowSetTree(const RowsetTree& old_tree, const RowsetVector& rowsets_to_remove, - const RowsetVector& rowsets_to_add, RowsetTree* new_tree) { - RowsetVector post_swap; - - // O(n^2) diff algorithm to collect the set of rowsets excluding - // the rowsets that were included in the compaction - int num_removed = 0; - - for (const RowsetSharedPtr& rs : old_tree.all_rowsets()) { - // Determine if it should be removed - bool should_remove = false; - for (const RowsetSharedPtr& to_remove : rowsets_to_remove) { - if (to_remove->rowset_id() == rs->rowset_id()) { - should_remove = true; - num_removed++; - break; - } - } - if (!should_remove) { - post_swap.push_back(rs); - } - } - - CHECK_EQ(num_removed, rowsets_to_remove.size()); - - // Then push the new rowsets on the end of the new list - std::copy(rowsets_to_add.begin(), rowsets_to_add.end(), std::back_inserter(post_swap)); - - CHECK(new_tree->Init(post_swap).ok()); -} - -} // namespace doris diff --git a/be/src/olap/rowset/rowset_tree.h b/be/src/olap/rowset/rowset_tree.h deleted file mode 100644 index 5ebabca0f1ec93..00000000000000 --- a/be/src/olap/rowset/rowset_tree.h +++ /dev/null @@ -1,139 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// -// This file is copied from -// https://github.com/apache/kudu/blob/master/src/kudu/tablet/rowset_tree.h -// and modified by Doris - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "common/status.h" -#include "gutil/map-util.h" -#include "olap/olap_common.h" -#include "olap/rowset/rowset.h" -#include "util/slice.h" - -namespace doris { - -template -class IntervalTree; -struct RowsetIntervalTraits; -struct RowsetWithBounds; - -// Used often enough, may as well typedef it. -typedef std::vector RowsetVector; - -// Class which encapsulates the set of rowsets which are active for a given -// Tablet. This provides efficient lookup by key for Rowsets which may overlap -// that key range. -// -// Additionally, the rowset tree maintains information about the implicit -// intervals generated by the row sets (for instance, if a tablet has -// rowsets [0, 2] and [1, 3] it has three implicit contiguous intervals: -// [0, 1], [1, 2], and [2, 3]. 
-class RowsetTree { -public: - // An RSEndpoint is a POD which associates a rowset, an EndpointType - // (either the START or STOP of an interval), and the key at which the - // endpoint is located. - enum EndpointType { START, STOP }; - struct RSEndpoint { - RSEndpoint(RowsetSharedPtr rowset, uint32_t segment_id, EndpointType endpoint, Slice slice) - : rowset_(rowset), segment_id_(segment_id), endpoint_(endpoint), slice_(slice) {} - - RowsetSharedPtr rowset_; - uint32_t segment_id_; - enum EndpointType endpoint_; - Slice slice_; - }; - - RowsetTree(); - Status Init(const RowsetVector& rowsets); - ~RowsetTree(); - - // Return Rowsets whose id in rowset_ids and range may contain the given encoded key. - // - // The returned pointers are guaranteed to be valid at least until this - // RowsetTree object is Reset(). - void FindRowsetsWithKeyInRange(const Slice& encoded_key, const RowsetIdUnorderedSet* rowset_ids, - vector>* rowsets) const; - - // Call 'cb(rowset, index)' for each (rowset, index) pair such that - // 'encoded_keys[index]' may be within the bounds of 'rowset'. - // - // See IntervalTree::ForEachIntervalContainingPoints for additional - // information on the particular order in which the callback will be called. - // - // REQUIRES: 'encoded_keys' must be in sorted order. - void ForEachRowsetContainingKeys(const std::vector& encoded_keys, - const std::function& cb) const; - - // When 'lower_bound' is boost::none, it means negative infinity. - // When 'upper_bound' is boost::none, it means positive infinity. - // So the query interval can be one of below: - // [-OO, +OO) - // [-OO, upper_bound) - // [lower_bound, +OO) - // [lower_bound, upper_bound) - void FindRowsetsIntersectingInterval( - const std::optional& lower_bound, const std::optional& upper_bound, - vector>* rowsets) const; - - const RowsetVector& all_rowsets() const { return all_rowsets_; } - - RowsetSharedPtr rs_by_id(RowsetId rs_id) const { return FindPtrOrNull(rs_by_id_, rs_id); } - - // Iterates over RowsetTree::RSEndpoint, guaranteed to be ordered and for - // any rowset to appear exactly twice, once at its start slice and once at - // its stop slice, equivalent to its GetBounds() values. - const std::vector& key_endpoints() const { return key_endpoints_; } - -private: - // Interval tree of the rowsets. Used to efficiently find rowsets which might contain - // a probe row. - std::unique_ptr> tree_; - - // Ordered map of all the interval endpoints, holding the implicit contiguous - // intervals - std::vector key_endpoints_; - - // Container for all of the entries in tree_. IntervalTree does - // not itself manage memory, so this provides a simple way to enumerate - // all the entry structs and free them in the destructor. - std::vector entries_; - - // All of the rowsets which were put in this RowsetTree. - RowsetVector all_rowsets_; - - // The Rowsets in this RowsetTree, keyed by their id. 
- std::unordered_map rs_by_id_; - - bool initted_; -}; - -void ModifyRowSetTree(const RowsetTree& old_tree, const RowsetVector& rowsets_to_remove, - const RowsetVector& rowsets_to_add, RowsetTree* new_tree); - -} // namespace doris diff --git a/be/src/olap/rowset/rowset_writer.h b/be/src/olap/rowset/rowset_writer.h index 7958cc2ce0727a..dc85283c4ec95b 100644 --- a/be/src/olap/rowset/rowset_writer.h +++ b/be/src/olap/rowset/rowset_writer.h @@ -48,7 +48,7 @@ struct SegmentStatistics { index_size(pb.index_size()), key_bounds(pb.key_bounds()) {} - void to_pb(SegmentStatisticsPB* segstat_pb) { + void to_pb(SegmentStatisticsPB* segstat_pb) const { segstat_pb->set_row_num(row_num); segstat_pb->set_data_size(data_size); segstat_pb->set_index_size(index_size); @@ -114,7 +114,7 @@ class RowsetWriter { "RowsetWriter not support flush_single_block"); } - virtual Status add_segment(uint32_t segment_id, SegmentStatistics& segstat) { + virtual Status add_segment(uint32_t segment_id, const SegmentStatistics& segstat) { return Status::NotSupported("RowsetWriter does not support add_segment"); } @@ -156,6 +156,7 @@ class RowsetWriter { virtual std::shared_ptr get_partial_update_info() = 0; virtual bool is_partial_update() = 0; + virtual const RowsetWriterContext& context() const = 0; private: DISALLOW_COPY_AND_ASSIGN(RowsetWriter); diff --git a/be/src/olap/rowset/rowset_writer_context.h b/be/src/olap/rowset/rowset_writer_context.h index 6d4a6e7281779c..49a63b1e42ace3 100644 --- a/be/src/olap/rowset/rowset_writer_context.h +++ b/be/src/olap/rowset/rowset_writer_context.h @@ -48,7 +48,8 @@ struct RowsetWriterContext { version(Version(0, 0)), txn_id(0), tablet_uid(0, 0), - segments_overlap(OVERLAP_UNKNOWN) { + segments_overlap(OVERLAP_UNKNOWN), + schema_lock(new std::mutex) { load_id.set_hi(0); load_id.set_lo(0); } @@ -90,9 +91,6 @@ struct RowsetWriterContext { std::set skip_inverted_index; DataWriteType write_type = DataWriteType::TYPE_DEFAULT; BaseTabletSPtr tablet = nullptr; - // for tracing local schema change record - std::shared_ptr schema_change_recorder = - nullptr; std::shared_ptr mow_context; std::shared_ptr file_writer_creator; @@ -110,6 +108,9 @@ std::shared_ptr partial_update_info; bool is_transient_rowset_writer = false; + // In the semi-structured scenario tablet_schema will be updated concurrently; + // this lock needs to be held when updating. Use a shared_ptr so that holding a mutex does not delete the copy constructor + std::shared_ptr<std::mutex> schema_lock; }; } // namespace doris
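// ---------------------------------------------------------------------------
// Editor's note: the schema_lock added to RowsetWriterContext above exists
// because concurrent variant flushes can each call update_rowset_schema on the
// shared tablet schema, and storing a std::mutex by value would also delete
// the context's copy constructor. A minimal sketch of why a
// shared_ptr<std::mutex> works; Context here is a toy stand-in, not
// RowsetWriterContext.
// ---------------------------------------------------------------------------
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <thread>

namespace sketch {

struct Context {
    // Copies of the context share one schema and one lock.
    std::shared_ptr<std::set<std::string>> schema =
            std::make_shared<std::set<std::string>>();
    std::shared_ptr<std::mutex> schema_lock = std::make_shared<std::mutex>();
};

// Mirrors update_rowset_schema: merge newly discovered subcolumns in place.
void merge_columns(Context ctx /* copied, still shares the lock */,
                   const std::set<std::string>& cols) {
    std::lock_guard<std::mutex> lock(*ctx.schema_lock);
    ctx.schema->insert(cols.begin(), cols.end());
}

void demo() {
    Context ctx;
    std::thread t1(merge_columns, ctx, std::set<std::string>{"v.a", "v.b"});
    std::thread t2(merge_columns, ctx, std::set<std::string>{"v.b", "v.c"});
    t1.join();
    t2.join();
    // ctx.schema is now {v.a, v.b, v.c}; each flush saw a consistent view.
}

} // namespace sketch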
diff --git a/be/src/olap/rowset/segcompaction.cpp b/be/src/olap/rowset/segcompaction.cpp index a11363856035b6..21bfcf0da82d20 100644 --- a/be/src/olap/rowset/segcompaction.cpp +++ b/be/src/olap/rowset/segcompaction.cpp @@ -33,7 +33,6 @@ #include #include "beta_rowset_writer.h" -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/logging.h" #include "gutil/stringprintf.h" @@ -58,6 +57,7 @@ #include "olap/storage_engine.h" #include "olap/tablet_schema.h" #include "runtime/thread_context.h" +#include "util/debug_points.h" #include "util/mem_info.h" #include "util/time.h" #include "vec/olap/vertical_block_reader.h" @@ -167,6 +167,7 @@ Status SegcompactionWorker::_check_correctness(OlapReaderStatistics& reader_stat } } + DBUG_EXECUTE_IF("SegcompactionWorker._check_correctness_wrong_sum_src_row", { sum_src_row++; }); if (raw_rows_read != sum_src_row) { return Status::Error( "segcompaction read row num does not match source. expect read row:{}, actual read " @@ -174,12 +175,15 @@ Status SegcompactionWorker::_check_correctness(OlapReaderStatistics& reader_stat sum_src_row, raw_rows_read); } + DBUG_EXECUTE_IF("SegcompactionWorker._check_correctness_wrong_merged_rows", { merged_rows++; }); if ((output_rows + merged_rows) != raw_rows_read) { return Status::Error( "segcompaction total row num does not match after merge. expect total row:{}, " "actual total row:{}, (output_rows:{},merged_rows:{})", raw_rows_read, output_rows + merged_rows, output_rows, merged_rows); } + DBUG_EXECUTE_IF("SegcompactionWorker._check_correctness_wrong_filtered_rows", + { filtered_rows++; }); if (filtered_rows != 0) { return Status::Error( "segcompaction should not have filtered rows but actual filtered rows:{}", diff --git a/be/src/olap/rowset/segment_creator.cpp b/be/src/olap/rowset/segment_creator.cpp index c55464783bd1b6..f40d899bdce5f7 100644 --- a/be/src/olap/rowset/segment_creator.cpp +++ b/be/src/olap/rowset/segment_creator.cpp @@ -24,13 +24,13 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/logging.h" #include "io/fs/file_writer.h" #include "olap/rowset/beta_rowset_writer.h" // SegmentStatistics #include "olap/rowset/segment_v2/segment_writer.h" +#include "olap/rowset/segment_v2/vertical_segment_writer.h" #include "vec/core/block.h" namespace doris { @@ -50,7 +50,7 @@ Status SegmentFlusher::flush_single_block(const vectorized::Block* block, int32_ if (block->rows() == 0) { return Status::OK(); } - std::unique_ptr<segment_v2::SegmentWriter> writer; + std::unique_ptr<segment_v2::VerticalSegmentWriter> writer; bool no_compression = block->bytes() <= config::segment_compression_threshold_kb * 1024; RETURN_IF_ERROR(_create_segment_writer(writer, segment_id, no_compression, flush_schema)); RETURN_IF_ERROR(_add_rows(writer, block, 0, block->rows())); @@ -74,10 +74,16 @@ Status SegmentFlusher::close() { Status SegmentFlusher::_add_rows(std::unique_ptr<segment_v2::SegmentWriter>& segment_writer, const vectorized::Block* block, size_t row_offset, size_t row_num) { - auto s = segment_writer->append_block(block, row_offset, row_num); - if (UNLIKELY(!s.ok())) { - return Status::Error("failed to append block: {}", s.to_string()); - } + RETURN_IF_ERROR(segment_writer->append_block(block, row_offset, row_num)); + _num_rows_written += row_num; + return Status::OK(); +} + +Status SegmentFlusher::_add_rows(std::unique_ptr<segment_v2::VerticalSegmentWriter>& segment_writer, + const vectorized::Block* block, size_t row_offset, + size_t row_num) { + RETURN_IF_ERROR(segment_writer->batch_block(block, row_offset, row_num)); + RETURN_IF_ERROR(segment_writer->write_batch()); + _num_rows_written += row_num; + return Status::OK(); } @@ -113,6 +119,79 @@ Status SegmentFlusher::_create_segment_writer(std::unique_ptr<segment_v2::VerticalSegmentWriter>& writer, int32_t segment_id, + bool no_compression, TabletSchemaSPtr flush_schema) { + io::FileWriterPtr file_writer; + RETURN_IF_ERROR(_context.file_writer_creator->create(segment_id, file_writer)); + + segment_v2::VerticalSegmentWriterOptions writer_options; + writer_options.enable_unique_key_merge_on_write = _context.enable_unique_key_merge_on_write; + writer_options.rowset_ctx = &_context; + writer_options.write_type = _context.write_type; + if (no_compression) { + writer_options.compression_type = NO_COMPRESSION; + } + + const auto& tablet_schema = flush_schema ?
flush_schema : _context.tablet_schema; + writer.reset(new segment_v2::VerticalSegmentWriter( + file_writer.get(), segment_id, tablet_schema, _context.tablet, _context.data_dir, + _context.max_rows_per_segment, writer_options, _context.mow_context)); + { + std::lock_guard l(_lock); + _file_writers.push_back(std::move(file_writer)); + } + auto s = writer->init(); + if (!s.ok()) { + LOG(WARNING) << "failed to init segment writer: " << s.to_string(); + writer.reset(); + return s; + } + return Status::OK(); +} + +Status SegmentFlusher::_flush_segment_writer( + std::unique_ptr& writer, int64_t* flush_size) { + uint32_t row_num = writer->num_rows_written(); + _num_rows_filtered += writer->num_rows_filtered(); + + if (row_num == 0) { + return Status::OK(); + } + uint64_t segment_size; + uint64_t index_size; + Status s = writer->finalize(&segment_size, &index_size); + if (!s.ok()) { + return Status::Error(s.code(), "failed to finalize segment: {}", s.to_string()); + } + VLOG_DEBUG << "tablet_id:" << _context.tablet_id + << " flushing filename: " << writer->data_dir_path() + << " rowset_id:" << _context.rowset_id; + + KeyBoundsPB key_bounds; + Slice min_key = writer->min_encoded_key(); + Slice max_key = writer->max_encoded_key(); + DCHECK_LE(min_key.compare(max_key), 0); + key_bounds.set_min_key(min_key.to_string()); + key_bounds.set_max_key(max_key.to_string()); + + uint32_t segment_id = writer->segment_id(); + SegmentStatistics segstat; + segstat.row_num = row_num; + segstat.data_size = segment_size + writer->inverted_index_file_size(); + segstat.index_size = index_size + writer->inverted_index_file_size(); + segstat.key_bounds = key_bounds; + + writer.reset(); + + RETURN_IF_ERROR(_context.segment_collector->add(segment_id, segstat)); + + if (flush_size) { + *flush_size = segment_size + index_size; + } + return Status::OK(); +} + Status SegmentFlusher::_flush_segment_writer(std::unique_ptr& writer, int64_t* flush_size) { uint32_t row_num = writer->num_rows_written(); diff --git a/be/src/olap/rowset/segment_creator.h b/be/src/olap/rowset/segment_creator.h index 750aa1487240a3..cf5456a28aa5cd 100644 --- a/be/src/olap/rowset/segment_creator.h +++ b/be/src/olap/rowset/segment_creator.h @@ -35,6 +35,7 @@ class Block; namespace segment_v2 { class SegmentWriter; +class VerticalSegmentWriter; } // namespace segment_v2 struct SegmentStatistics; @@ -127,11 +128,18 @@ class SegmentFlusher { private: Status _add_rows(std::unique_ptr& segment_writer, const vectorized::Block* block, size_t row_offset, size_t row_num); + Status _add_rows(std::unique_ptr& segment_writer, + const vectorized::Block* block, size_t row_offset, size_t row_num); Status _create_segment_writer(std::unique_ptr& writer, int32_t segment_id, bool no_compression = false, TabletSchemaSPtr flush_schema = nullptr); + Status _create_segment_writer(std::unique_ptr& writer, + int32_t segment_id, bool no_compression = false, + TabletSchemaSPtr flush_schema = nullptr); Status _flush_segment_writer(std::unique_ptr& writer, int64_t* flush_size = nullptr); + Status _flush_segment_writer(std::unique_ptr& writer, + int64_t* flush_size = nullptr); private: RowsetWriterContext _context; diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index 69d92f5ede92a2..ff61f1a392d1d2 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -23,7 +23,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // 
IWYU pragma: keep #include "common/logging.h" #include "common/status.h" diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_page.h b/be/src/olap/rowset/segment_v2/bitshuffle_page.h index 54f446070f106f..89180c2cd1b903 100644 --- a/be/src/olap/rowset/segment_v2/bitshuffle_page.h +++ b/be/src/olap/rowset/segment_v2/bitshuffle_page.h @@ -24,7 +24,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "gutil/port.h" diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index dd488e6e0acb26..34659211bac981 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -25,7 +25,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "io/fs/file_reader.h" @@ -88,6 +87,7 @@ Status ColumnReader::create(const ColumnReaderOptions& opts, const ColumnMetaPB& case FieldType::OLAP_FIELD_TYPE_STRUCT: { // not support empty struct DCHECK(meta.children_columns_size() >= 1); + num_rows = meta.children_columns(0).num_rows(); // create struct column reader std::unique_ptr struct_reader( new ColumnReader(opts, meta, num_rows, file_reader)); diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 9de11d83c10986..bacf6fa1ce0671 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -353,6 +353,13 @@ Status ColumnWriter::create(const ColumnWriterOptions& opts, const TabletColumn* *writer = std::move(writer_local); return Status::OK(); } + case FieldType::OLAP_FIELD_TYPE_VARIANT: { + // Use ScalarColumnWriter to write it's only root data + std::unique_ptr writer_local = std::unique_ptr( + new ScalarColumnWriter(opts, std::move(field), file_writer)); + *writer = std::move(writer_local); + return Status::OK(); + } default: return Status::NotSupported("unsupported type for ColumnWriter: {}", std::to_string(int(field->type()))); @@ -439,12 +446,7 @@ ScalarColumnWriter::ScalarColumnWriter(const ColumnWriterOptions& opts, ScalarColumnWriter::~ScalarColumnWriter() { // delete all pages - Page* page = _pages.head; - while (page != nullptr) { - Page* next_page = page->next; - delete page; - page = next_page; - } + _pages.clear(); } Status ScalarColumnWriter::init() { @@ -594,11 +596,10 @@ Status ScalarColumnWriter::finish() { } Status ScalarColumnWriter::write_data() { - Page* page = _pages.head; - while (page != nullptr) { - RETURN_IF_ERROR(_write_data_page(page)); - page = page->next; + for (auto& page : _pages) { + RETURN_IF_ERROR(_write_data_page(page.get())); } + _pages.clear(); // write column dict if (_encoding_info->encoding() == DICT_ENCODING) { OwnedSlice dict_body; @@ -615,6 +616,7 @@ Status ScalarColumnWriter::write_data() { {dict_body.slice()}, footer, &dict_pp)); dict_pp.to_proto(_opts.meta->mutable_dict_page()); } + _page_builder.reset(); return Status::OK(); } @@ -724,7 +726,7 @@ Status ScalarColumnWriter::finish_current_page() { page->data.emplace_back(std::move(compressed_body)); } - _push_back_page(page.release()); + _push_back_page(std::move(page)); _first_rowid = _next_rowid; return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 1bc0afb972b245..67cefc3c9ce891 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h 
+++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -230,28 +230,16 @@ class ScalarColumnWriter : public ColumnWriter { // use vector for easier management for lifetime of OwnedSlice std::vector data; PageFooterPB footer; - Page* next = nullptr; }; - struct PageHead { - Page* head = nullptr; - Page* tail = nullptr; - }; - - void _push_back_page(Page* page) { - // add page to pages' tail - if (_pages.tail != nullptr) { - _pages.tail->next = page; - } - _pages.tail = page; - if (_pages.head == nullptr) { - _pages.head = page; - } + void _push_back_page(std::unique_ptr page) { for (auto& data_slice : page->data) { _data_size += data_slice.slice().size; } // estimate (page footer + footer size + checksum) took 20 bytes _data_size += 20; + // add page to pages' tail + _pages.emplace_back(std::move(page)); } Status _write_data_page(Page* page); @@ -262,7 +250,7 @@ class ScalarColumnWriter : public ColumnWriter { uint64_t _data_size; // cached generated pages, - PageHead _pages; + std::vector> _pages; ordinal_t _first_rowid = 0; BlockCompressionCodec* _compress_codec; diff --git a/be/src/olap/rowset/segment_v2/encoding_info.cpp b/be/src/olap/rowset/segment_v2/encoding_info.cpp index db41b15e3ded0f..ecf127e27a1d46 100644 --- a/be/src/olap/rowset/segment_v2/encoding_info.cpp +++ b/be/src/olap/rowset/segment_v2/encoding_info.cpp @@ -284,6 +284,10 @@ EncodingInfoResolver::EncodingInfoResolver() { _add_map(); _add_map(); + _add_map(); + _add_map(); + _add_map(); + _add_map(); _add_map(); _add_map(); diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 2776fa4a8da4c4..e870680ef50dbb 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -289,10 +289,12 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run "inverted index path: {} not exist.", index_file_path.string()); } - InvertedIndexCacheHandle inverted_index_cache_handle; - static_cast(InvertedIndexSearcherCache::instance()->get_index_searcher( - _fs, index_dir.c_str(), index_file_name, &inverted_index_cache_handle, stats)); - auto index_searcher = inverted_index_cache_handle.get_index_searcher(); + auto get_index_search = [this, &index_dir, &index_file_name, &stats]() { + InvertedIndexCacheHandle inverted_index_cache_handle; + static_cast(InvertedIndexSearcherCache::instance()->get_index_searcher( + _fs, index_dir.c_str(), index_file_name, &inverted_index_cache_handle, stats)); + return inverted_index_cache_handle.get_index_searcher(); + }; std::unique_ptr query; std::wstring field_ws = std::wstring(column_name.begin(), column_name.end()); @@ -323,6 +325,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run } else { stats->inverted_index_query_cache_miss++; + auto index_searcher = get_index_search(); + term_match_bitmap = std::make_shared(); Status res = Status::OK(); @@ -371,6 +375,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run } else { stats->inverted_index_query_cache_miss++; + auto index_searcher = get_index_search(); + term_match_bitmap = std::make_shared(); // unique_ptr with custom deleter std::unique_ptr term { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 4be1c72b53dd5a..5c53a91c3d8cfc 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ 
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -22,8 +22,6 @@ #include #include -#include -#include #include #include #include @@ -51,6 +49,7 @@ #include "olap/tablet_schema.h" #include "olap/types.h" #include "runtime/collection_value.h" +#include "util/debug_points.h" #include "util/faststring.h" #include "util/slice.h" #include "util/string_util.h" @@ -477,6 +476,8 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { InvertedIndexDescriptor::get_temporary_bkd_index_meta_file_name().c_str()); index_out = dir->createOutput( InvertedIndexDescriptor::get_temporary_bkd_index_file_name().c_str()); + DBUG_EXECUTE_IF("InvertedIndexWriter._set_fulltext_data_out_nullptr", + { data_out = nullptr; }); if (data_out != nullptr && meta_out != nullptr && index_out != nullptr) { _bkd_writer->meta_finish(meta_out, _bkd_writer->finish(data_out, index_out), int(field_type)); @@ -492,6 +493,9 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { dir = _index_writer->getDirectory(); write_null_bitmap(null_bitmap_out, dir); close(); + DBUG_EXECUTE_IF("InvertedIndexWriter._throw_clucene_error_in_bkd_writer_close", { + _CLTHROWA(CL_ERR_IO, "debug point: test throw error in bkd index writer"); + }); } } catch (CLuceneError& e) { FINALLY_FINALIZE_OUTPUT(null_bitmap_out) diff --git a/be/src/olap/rowset/segment_v2/page_io.cpp b/be/src/olap/rowset/segment_v2/page_io.cpp index 30165423724747..c712478850a6c5 100644 --- a/be/src/olap/rowset/segment_v2/page_io.cpp +++ b/be/src/olap/rowset/segment_v2/page_io.cpp @@ -130,7 +130,8 @@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle* uint32_t footer_size = decode_fixed32_le((uint8_t*)page_slice.data + page_slice.size - 4); std::string footer_buf(page_slice.data + page_slice.size - 4 - footer_size, footer_size); if (!footer->ParseFromString(footer_buf)) { - return Status::Corruption("Bad page: invalid footer"); + return Status::Corruption("Bad page: invalid footer, footer_size={}, file={}", + footer_size, opts.file_reader->path().native()); } *body = Slice(page_slice.data, page_slice.size - 4 - footer_size); return Status::OK(); @@ -139,7 +140,8 @@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle* // every page contains 4 bytes footer length and 4 bytes checksum const uint32_t page_size = opts.page_pointer.size; if (page_size < 8) { - return Status::Corruption("Bad page: too small size ({})", page_size); + return Status::Corruption("Bad page: too small size ({}), file={}", page_size, + opts.file_reader->path().native()); } // hold compressed page at first, reset to decompressed page later @@ -158,8 +160,9 @@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle* uint32_t expect = decode_fixed32_le((uint8_t*)page_slice.data + page_slice.size - 4); uint32_t actual = crc32c::Value(page_slice.data, page_slice.size - 4); if (expect != actual) { - return Status::Corruption("Bad page: checksum mismatch (actual={} vs expect={})", - actual, expect); + return Status::Corruption( + "Bad page: checksum mismatch (actual={} vs expect={}), file={}", actual, expect, + opts.file_reader->path().native()); } } @@ -168,13 +171,16 @@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle* // parse and set footer uint32_t footer_size = decode_fixed32_le((uint8_t*)page_slice.data + page_slice.size - 4); if (!footer->ParseFromArray(page_slice.data + page_slice.size - 4 - footer_size, footer_size)) { - return 
Status::Corruption("Bad page: invalid footer"); + return Status::Corruption("Bad page: invalid footer, footer_size={}, file={}", footer_size, + opts.file_reader->path().native()); } uint32_t body_size = page_slice.size - 4 - footer_size; if (body_size != footer->uncompressed_size()) { // need decompress body if (opts.codec == nullptr) { - return Status::Corruption("Bad page: page is compressed but codec is NO_COMPRESSION"); + return Status::Corruption( + "Bad page: page is compressed but codec is NO_COMPRESSION, file={}", + opts.file_reader->path().native()); } SCOPED_RAW_TIMER(&opts.stats->decompress_ns); std::unique_ptr decompressed_page = @@ -186,8 +192,9 @@ Status PageIO::read_and_decompress_page(const PageReadOptions& opts, PageHandle* RETURN_IF_ERROR(opts.codec->decompress(compressed_body, &decompressed_body)); if (decompressed_body.size != footer->uncompressed_size()) { return Status::Corruption( - "Bad page: record uncompressed size={} vs real decompressed size={}", - footer->uncompressed_size(), decompressed_body.size); + "Bad page: record uncompressed size={} vs real decompressed size={}, file={}", + footer->uncompressed_size(), decompressed_body.size, + opts.file_reader->path().native()); } // append footer and footer size memcpy(decompressed_body.data + decompressed_body.size, page_slice.data + body_size, diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index f5ccbef0adc537..9f4b57d04681f1 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -30,7 +30,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/consts.h" @@ -101,11 +100,12 @@ class SegmentIterator::BitmapRangeIterator { explicit BitmapRangeIterator(const roaring::Roaring& bitmap) { roaring_init_iterator(&bitmap.roaring, &_iter); - _read_next_batch(); } bool has_more_range() const { return !_eof; } + [[nodiscard]] static uint32_t get_batch_size() { return kBatchSize; } + // read next range into [*from, *to) whose size <= max_range_size. // return false when there is no more range. virtual bool next_range(const uint32_t max_range_size, uint32_t* from, uint32_t* to) { @@ -144,6 +144,11 @@ class SegmentIterator::BitmapRangeIterator { return true; } + // read batch_size of rowids from roaring bitmap into buf array + virtual uint32_t read_batch_rowids(rowid_t* buf, uint32_t batch_size) { + return roaring::api::roaring_read_uint32_iterator(&_iter, buf, batch_size); + } + private: void _read_next_batch() { _buf_pos = 0; @@ -168,6 +173,8 @@ class SegmentIterator::BackwardBitmapRangeIterator : public SegmentIterator::Bit public: explicit BackwardBitmapRangeIterator(const roaring::Roaring& bitmap) { roaring_init_iterator_last(&bitmap.roaring, &_riter); + _rowid_count = roaring_bitmap_get_cardinality(&bitmap.roaring); + _rowid_left = _rowid_count; } bool has_more_range() const { return !_riter.has_value; } @@ -191,9 +198,51 @@ class SegmentIterator::BackwardBitmapRangeIterator : public SegmentIterator::Bit return true; } + /** + * Reads a batch of row IDs from a roaring bitmap, starting from the end and moving backwards. + * This function retrieves the last `batch_size` row IDs from the bitmap and stores them in the provided buffer. + * It updates the internal state to track how many row IDs are left to read in subsequent calls. 
+ * + * The row IDs are read in reverse order, but stored in the buffer maintaining their original order in the bitmap. + * + * Example: + * input bitmap: [0 1 4 5 6 7 10 15 16 17 18 19] + * If the bitmap has 12 elements and batch_size is set to 5, the function will first read [15, 16, 17, 18, 19] + * into the buffer, leaving 7 elements left. In the next call with batch_size 5, it will read [4, 5, 6, 7, 10]. + * + */ + uint32_t read_batch_rowids(rowid_t* buf, uint32_t batch_size) override { + if (!_riter.has_value || _rowid_left == 0) { + return 0; + } + + if (_rowid_count <= batch_size) { + roaring_bitmap_to_uint32_array(_riter.parent, + buf); // Fill 'buf' with '_rowid_count' elements. + uint32_t num_read = _rowid_left; // Save the number of row IDs read. + _rowid_left = 0; // No row IDs left after this operation. + return num_read; // Return the number of row IDs read. + } + + uint32_t read_size = std::min(batch_size, _rowid_left); + uint32_t num_read = 0; // Counter for the number of row IDs read. + + // Read row IDs into the buffer in reverse order. + while (num_read < read_size && _riter.has_value) { + buf[read_size - num_read - 1] = _riter.current_value; + num_read++; + _rowid_left--; // Decrement the count of remaining row IDs. + roaring_previous_uint32_iterator(&_riter); + } + + // Return the actual number of row IDs read. + return num_read; + } private: roaring::api::roaring_uint32_iterator_t _riter; + uint32_t _rowid_count; + uint32_t _rowid_left; }; SegmentIterator::SegmentIterator(std::shared_ptr segment, SchemaSPtr schema) @@ -691,6 +740,9 @@ bool SegmentIterator::_check_apply_by_bitmap_index(ColumnPredicate* pred) { } bool SegmentIterator::_check_apply_by_inverted_index(ColumnPredicate* pred, bool pred_in_compound) { + if (_opts.runtime_state && !_opts.runtime_state->query_options().enable_inverted_index_query) { + return false; + } if (_inverted_index_iterators[pred->column_id()] == nullptr) { //this column without inverted index return false; @@ -735,9 +787,6 @@ Status SegmentIterator::_apply_bitmap_index_except_leafnode_of_andnode( Status SegmentIterator::_apply_inverted_index_except_leafnode_of_andnode( ColumnPredicate* pred, roaring::Roaring* output_result) { - if (_opts.runtime_state && !_opts.runtime_state->query_options().enable_inverted_index_query) { - return Status::OK(); - } RETURN_IF_ERROR(pred->evaluate(*_schema, _inverted_index_iterators[pred->column_id()].get(), num_rows(), output_result)); return Status::OK(); @@ -1186,7 +1235,7 @@ Status SegmentIterator::_lookup_ordinal_from_pk_index(const RowCursor& key, bool std::string index_key; // when is_include is false, we shoudle append KEY_NORMAL_MARKER to the // encode key. Otherwise, we will get an incorrect upper bound. - encode_key_with_padding( + encode_key_with_padding( &index_key, key, _segment->_tablet_schema->num_key_columns(), is_include, true); if (index_key < _segment->min_key()) { *rowid = 0; @@ -1612,49 +1661,86 @@ void SegmentIterator::_output_non_pred_columns(vectorized::Block* block) { } } +/** + * Reads columns by their index, handling both continuous and discontinuous rowid scenarios. + * + * This function is designed to read a specified number of rows (up to nrows_read_limit) + * from the segment iterator, dealing with both continuous and discontinuous rowid arrays. + * It operates as follows: + * + * 1. Reads a batch of rowids (up to the specified limit), and checks if they are continuous. + * Continuous here means that the rowids form an unbroken sequence (e.g., 1, 2, 3, 4...). + * + * 2. 
For each column that needs to be read (identified by _first_read_column_ids): + * - If the rowids are continuous, the function uses seek_to_ordinal and next_batch + * for efficient reading. + * - If the rowids are not continuous, the function processes them in smaller batches + * (each of size up to 256). Each batch is checked for internal continuity: + * a. If a batch is continuous, uses seek_to_ordinal and next_batch for that batch. + * b. If a batch is not continuous, uses read_by_rowids for individual rowids in the batch. + * + * This approach optimizes reading performance by leveraging batch processing for continuous + * rowid sequences and handling discontinuities gracefully in smaller chunks. + */ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32_t& nrows_read, bool set_block_rowid) { SCOPED_RAW_TIMER(&_opts.stats->first_read_ns); - do { - uint32_t range_from = 0; - uint32_t range_to = 0; - bool has_next_range = - _range_iter->next_range(nrows_read_limit - nrows_read, &range_from, &range_to); - if (!has_next_range) { - break; - } - - size_t rows_to_read = range_to - range_from; - _cur_rowid = range_to; - - if (set_block_rowid) { - // Here use std::iota is better performance than for-loop, maybe for-loop is not vectorized - auto start = _block_rowids.data() + nrows_read; - auto end = start + rows_to_read; - std::iota(start, end, range_from); - nrows_read += rows_to_read; - } else { - nrows_read += rows_to_read; - } - - _split_row_ranges.emplace_back(std::pair {range_from, range_to}); - } while (nrows_read < nrows_read_limit && !_opts.read_orderby_key_reverse); + nrows_read = _range_iter->read_batch_rowids(_block_rowids.data(), nrows_read_limit); + bool is_continuous = (nrows_read > 1) && + (_block_rowids[nrows_read - 1] - _block_rowids[0] == nrows_read - 1); for (auto cid : _first_read_column_ids) { auto& column = _current_return_columns[cid]; if (_prune_column(cid, column, true, nrows_read)) { continue; } - for (auto& range : _split_row_ranges) { - size_t nrows = range.second - range.first; - RETURN_IF_ERROR(_column_iterators[cid]->seek_to_ordinal(range.first)); - size_t rows_read = nrows; + if (is_continuous) { + size_t rows_read = nrows_read; + _opts.stats->block_first_read_seek_num += 1; + if (_opts.runtime_state && _opts.runtime_state->enable_profile()) { + SCOPED_RAW_TIMER(&_opts.stats->block_first_read_seek_ns); + RETURN_IF_ERROR(_column_iterators[cid]->seek_to_ordinal(_block_rowids[0])); + } else { + RETURN_IF_ERROR(_column_iterators[cid]->seek_to_ordinal(_block_rowids[0])); + } RETURN_IF_ERROR(_column_iterators[cid]->next_batch(&rows_read, column)); - if (rows_read != nrows) { - return Status::Error("nrows({}) != rows_read({})", nrows, - rows_read); + if (rows_read != nrows_read) { + return Status::Error("nrows({}) != rows_read({})", + nrows_read, rows_read); + } + } else { + const uint32_t batch_size = _range_iter->get_batch_size(); + uint32_t processed = 0; + while (processed < nrows_read) { + uint32_t current_batch_size = std::min(batch_size, nrows_read - processed); + bool batch_continuous = (current_batch_size > 1) && + (_block_rowids[processed + current_batch_size - 1] - + _block_rowids[processed] == + current_batch_size - 1); + + if (batch_continuous) { + size_t rows_read = current_batch_size; + _opts.stats->block_first_read_seek_num += 1; + if (_opts.runtime_state && _opts.runtime_state->enable_profile()) { + SCOPED_RAW_TIMER(&_opts.stats->block_first_read_seek_ns); + RETURN_IF_ERROR( + 
_column_iterators[cid]->seek_to_ordinal(_block_rowids[processed])); + } else { + RETURN_IF_ERROR( + _column_iterators[cid]->seek_to_ordinal(_block_rowids[processed])); + } + RETURN_IF_ERROR(_column_iterators[cid]->next_batch(&rows_read, column)); + if (rows_read != current_batch_size) { + return Status::Error( + "batch nrows({}) != rows_read({})", current_batch_size, rows_read); + } + } else { + RETURN_IF_ERROR(_column_iterators[cid]->read_by_rowids( + &_block_rowids[processed], current_batch_size, column)); + } + processed += current_batch_size; } } } @@ -1853,8 +1939,6 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { nrows_read_limit = std::min(nrows_read_limit, (uint32_t)100); _wait_times_estimate_row_size--; } - _split_row_ranges.clear(); - _split_row_ranges.reserve(nrows_read_limit / 2); RETURN_IF_ERROR(_read_columns_by_index( nrows_read_limit, _current_batch_rows_read, _lazy_materialization_read || _opts.record_rowids || _is_need_expr_eval)); @@ -2120,35 +2204,28 @@ void SegmentIterator::_output_index_result_column(uint16_t* sel_rowid_idx, uint1 } } -void SegmentIterator::_build_index_result_column(uint16_t* sel_rowid_idx, uint16_t select_size, - vectorized::Block* block, +void SegmentIterator::_build_index_result_column(const uint16_t* sel_rowid_idx, + uint16_t select_size, vectorized::Block* block, const std::string& pred_result_sign, const roaring::Roaring& index_result) { auto index_result_column = vectorized::ColumnUInt8::create(); vectorized::ColumnUInt8::Container& vec_match_pred = index_result_column->get_data(); vec_match_pred.resize(block->rows()); - size_t idx_in_block = 0; - size_t idx_in_row_range = 0; size_t idx_in_selected = 0; - // _split_row_ranges store multiple ranges which split in function _read_columns_by_index(), - // index_result is a column predicate apply result in a whole segement, - // but a scanner thread one time can read max rows limit by block_row_max, - // so split _row_bitmap by one time scan range, in order to match size of one scanner thread read rows. 
- for (auto origin_row_range : _split_row_ranges) { - for (size_t rowid = origin_row_range.first; rowid < origin_row_range.second; ++rowid) { - if (sel_rowid_idx == nullptr || (idx_in_selected < select_size && - idx_in_row_range == sel_rowid_idx[idx_in_selected])) { - if (index_result.contains(rowid)) { - vec_match_pred[idx_in_block++] = true; - } else { - vec_match_pred[idx_in_block++] = false; - } - idx_in_selected++; + + for (uint32_t i = 0; i < _current_batch_rows_read; i++) { + auto rowid = _block_rowids[i]; + if (sel_rowid_idx == nullptr || + (idx_in_selected < select_size && i == sel_rowid_idx[idx_in_selected])) { + if (index_result.contains(rowid)) { + vec_match_pred[idx_in_selected] = true; + } else { + vec_match_pred[idx_in_selected] = false; } - idx_in_row_range++; + idx_in_selected++; } } - assert(block->rows() == vec_match_pred.size()); + DCHECK(block->rows() == vec_match_pred.size()); auto index_result_position = block->get_position_by_name(pred_result_sign); block->replace_by_position(index_result_position, std::move(index_result_column)); } diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index 33d3a3f5f9c9c9..352929678b3588 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -259,7 +259,7 @@ class SegmentIterator : public RowwiseIterator { std::string _gen_predicate_result_sign(ColumnPredicate* predicate); std::string _gen_predicate_result_sign(ColumnPredicateInfo* predicate_info); - void _build_index_result_column(uint16_t* sel_rowid_idx, uint16_t select_size, + void _build_index_result_column(const uint16_t* sel_rowid_idx, uint16_t select_size, vectorized::Block* block, const std::string& pred_result_sign, const roaring::Roaring& index_result); void _output_index_result_column(uint16_t* sel_rowid_idx, uint16_t select_size, @@ -340,7 +340,6 @@ class SegmentIterator : public RowwiseIterator { roaring::Roaring _row_bitmap; // "column_name+operator+value-> std::unordered_map> _rowid_result_for_index; - std::vector> _split_row_ranges; // an iterator for `_row_bitmap` that can be used to extract row range to scan std::unique_ptr _range_iter; // the next rowid to read diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 9e2073b9fb4a98..329a7fa1e9252f 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -26,7 +26,6 @@ #include #include -// IWYU pragma: no_include #include "cloud/config.h" #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" @@ -109,12 +108,16 @@ SegmentWriter::~SegmentWriter() { void SegmentWriter::init_column_meta(ColumnMetaPB* meta, uint32_t column_id, const TabletColumn& column, TabletSchemaSPtr tablet_schema) { meta->set_column_id(column_id); - meta->set_unique_id(column.unique_id()); meta->set_type(int(column.type())); meta->set_length(column.length()); meta->set_encoding(DEFAULT_ENCODING); meta->set_compression(_opts.compression_type); meta->set_is_nullable(column.is_nullable()); + meta->set_default_value(column.default_value()); + meta->set_precision(column.precision()); + meta->set_frac(column.frac()); + column.path_info().to_protobuf(meta->mutable_column_path_info(), column.parent_unique_id()); + meta->set_unique_id(column.unique_id()); for (uint32_t i = 0; i < column.get_subtype_count(); ++i) { init_column_meta(meta->add_children_columns(), column_id, 
column.get_sub_column(i), tablet_schema); @@ -181,57 +184,30 @@ Status SegmentWriter::init(const std::vector& col_ids, bool has_key) { break; } } - if (column.type() == FieldType::OLAP_FIELD_TYPE_STRUCT) { - opts.need_zone_map = false; - if (opts.need_bloom_filter) { - return Status::NotSupported("Do not support bloom filter for struct type"); - } - if (opts.need_bitmap_index) { - return Status::NotSupported("Do not support bitmap index for struct type"); - } - } - if (column.type() == FieldType::OLAP_FIELD_TYPE_ARRAY) { - opts.need_zone_map = false; - if (opts.need_bloom_filter) { - return Status::NotSupported("Do not support bloom filter for array type"); - } - if (opts.need_bitmap_index) { - return Status::NotSupported("Do not support bitmap index for array type"); - } - } - if (column.type() == FieldType::OLAP_FIELD_TYPE_JSONB) { - opts.need_zone_map = false; - if (opts.need_bloom_filter) { - return Status::NotSupported("Do not support bloom filter for jsonb type"); - } - if (opts.need_bitmap_index) { - return Status::NotSupported("Do not support bitmap index for jsonb type"); - } - } - if (column.type() == FieldType::OLAP_FIELD_TYPE_AGG_STATE) { - opts.need_zone_map = false; - if (opts.need_bloom_filter) { - return Status::NotSupported("Do not support bloom filter for agg_state type"); - } - if (opts.need_bitmap_index) { - return Status::NotSupported("Do not support bitmap index for agg_state type"); - } - } - if (column.type() == FieldType::OLAP_FIELD_TYPE_MAP) { - opts.need_zone_map = false; - if (opts.need_bloom_filter) { - return Status::NotSupported("Do not support bloom filter for map type"); - } - if (opts.need_bitmap_index) { - return Status::NotSupported("Do not support bitmap index for map type"); - } - } +#define CHECK_FIELD_TYPE(TYPE, type_name) \ + if (column.type() == FieldType::OLAP_FIELD_TYPE_##TYPE) { \ + opts.need_zone_map = false; \ + if (opts.need_bloom_filter) { \ + return Status::NotSupported("Do not support bloom filter for " type_name " type"); \ + } \ + if (opts.need_bitmap_index) { \ + return Status::NotSupported("Do not support bitmap index for " type_name " type"); \ + } \ + } + + CHECK_FIELD_TYPE(STRUCT, "struct") + CHECK_FIELD_TYPE(ARRAY, "array") + CHECK_FIELD_TYPE(JSONB, "jsonb") + CHECK_FIELD_TYPE(AGG_STATE, "agg_state") + CHECK_FIELD_TYPE(MAP, "map") + CHECK_FIELD_TYPE(VARIANT, "variant") + +#undef CHECK_FIELD_TYPE if (column.is_row_store_column()) { // smaller page size for row store column opts.data_page_size = config::row_column_page_size; } - std::unique_ptr writer; RETURN_IF_ERROR(ColumnWriter::create(opts, &column, _file_writer, &writer)); RETURN_IF_ERROR(writer->init()); @@ -411,6 +387,7 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* size_t delta_pos = block_pos - row_pos; size_t segment_pos = segment_start_pos + delta_pos; std::string key = _full_encode_keys(key_columns, delta_pos); + _maybe_invalid_row_cache(key); if (have_input_seq_column) { _encode_seq_column(seq_column, delta_pos, &key); } @@ -420,18 +397,10 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* if (!_tablet_schema->has_sequence_col() || have_input_seq_column) { RETURN_IF_ERROR(_primary_key_index_builder->add_item(key)); } - _maybe_invalid_row_cache(key); // mark key with delete sign as deleted. 
bool have_delete_sign = (delete_sign_column_data != nullptr && delete_sign_column_data[block_pos] != 0); - if (have_delete_sign && !_tablet_schema->has_sequence_col() && !have_input_seq_column) { - // we can directly use delete bitmap to mark the rows with delete sign as deleted - // if sequence column doesn't exist to eliminate reading delete sign columns in later reads - _mow_context->delete_bitmap->add({_opts.rowset_ctx->rowset_id, _segment_id, - DeleteBitmap::TEMP_VERSION_FOR_DELETE_SIGN}, - segment_pos); - } RowLocation loc; // save rowset shared ptr so this rowset wouldn't delete @@ -577,8 +546,8 @@ Status SegmentWriter::fill_missing_columns(vectorized::MutableColumns& mutable_f read_index[id_and_pos.pos] = read_idx++; } if (has_row_column) { - auto st = tablet->fetch_value_through_row_column(rowset, seg_it.first, rids, - cids_missing, old_value_block); + auto st = tablet->fetch_value_through_row_column( + rowset, *_tablet_schema, seg_it.first, rids, cids_missing, old_value_block); if (!st.ok()) { LOG(WARNING) << "failed to fetch value through row column"; return st; @@ -684,29 +653,6 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po _serialize_block_to_row_column(*const_cast(block)); } - if (_opts.write_type == DataWriteType::TYPE_DIRECT && _opts.enable_unique_key_merge_on_write && - !_tablet_schema->has_sequence_col() && _tablet_schema->delete_sign_idx() != -1) { - const vectorized::ColumnWithTypeAndName& delete_sign_column = - block->get_by_position(_tablet_schema->delete_sign_idx()); - auto& delete_sign_col = - reinterpret_cast(*(delete_sign_column.column)); - if (delete_sign_col.size() >= row_pos + num_rows) { - const vectorized::Int8* delete_sign_column_data = delete_sign_col.get_data().data(); - uint32_t segment_start_pos = - _column_writers[_tablet_schema->delete_sign_idx()]->get_next_rowid(); - for (size_t block_pos = row_pos, seg_pos = segment_start_pos; - seg_pos < segment_start_pos + num_rows; block_pos++, seg_pos++) { - // we can directly use delete bitmap to mark the rows with delete sign as deleted - // if sequence column doesn't exist to eliminate reading delete sign columns in later reads - if (delete_sign_column_data[block_pos]) { - _mow_context->delete_bitmap->add({_opts.rowset_ctx->rowset_id, _segment_id, - DeleteBitmap::TEMP_VERSION_FOR_DELETE_SIGN}, - seg_pos); - } - } - } - } - _olap_data_convertor->set_source_content(block, row_pos, num_rows); // find all row pos for short key indexes @@ -750,6 +696,7 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po std::string last_key; for (size_t pos = 0; pos < num_rows; pos++) { std::string key = _full_encode_keys(key_columns, pos); + _maybe_invalid_row_cache(key); if (_tablet_schema->has_sequence_col()) { _encode_seq_column(seq_column, pos, &key); } @@ -757,7 +704,6 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po << "found duplicate key or key is not sorted! 
current key: " << key << ", last key" << last_key; RETURN_IF_ERROR(_primary_key_index_builder->add_item(key)); - _maybe_invalid_row_cache(key); last_key = std::move(key); } } else { @@ -791,8 +737,7 @@ int64_t SegmentWriter::max_row_to_add(size_t row_avg_size_in_bytes) { } std::string SegmentWriter::_full_encode_keys( - const std::vector& key_columns, size_t pos, - bool null_first) { + const std::vector& key_columns, size_t pos) { assert(_key_index_size.size() == _num_key_columns); assert(key_columns.size() == _num_key_columns && _key_coders.size() == _num_key_columns); @@ -801,11 +746,7 @@ std::string SegmentWriter::_full_encode_keys( for (const auto& column : key_columns) { auto field = column->get_data_at(pos); if (UNLIKELY(!field)) { - if (null_first) { - encoded_keys.push_back(KEY_NULL_FIRST_MARKER); - } else { - encoded_keys.push_back(KEY_NULL_LAST_MARKER); - } + encoded_keys.push_back(KEY_NULL_FIRST_MARKER); ++cid; continue; } @@ -833,8 +774,7 @@ void SegmentWriter::_encode_seq_column(const vectorized::IOlapColumnDataAccessor } std::string SegmentWriter::_encode_keys( - const std::vector& key_columns, size_t pos, - bool null_first) { + const std::vector& key_columns, size_t pos) { assert(key_columns.size() == _num_short_key_columns); std::string encoded_keys; @@ -842,11 +782,7 @@ std::string SegmentWriter::_encode_keys( for (const auto& column : key_columns) { auto field = column->get_data_at(pos); if (UNLIKELY(!field)) { - if (null_first) { - encoded_keys.push_back(KEY_NULL_FIRST_MARKER); - } else { - encoded_keys.push_back(KEY_NULL_LAST_MARKER); - } + encoded_keys.push_back(KEY_NULL_FIRST_MARKER); ++cid; continue; } @@ -864,7 +800,7 @@ Status SegmentWriter::append_row(const RowType& row) { RETURN_IF_ERROR(_column_writers[cid]->append(cell)); } std::string full_encoded_key; - encode_key(&full_encoded_key, row, _num_key_columns); + encode_key(&full_encoded_key, row, _num_key_columns); if (_tablet_schema->has_sequence_col()) { full_encoded_key.push_back(KEY_NORMAL_MARKER); auto cid = _tablet_schema->sequence_col_idx(); diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h b/be/src/olap/rowset/segment_v2/segment_writer.h index 918eca2da4dca9..3b50d0a4aac8c3 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.h +++ b/be/src/olap/rowset/segment_v2/segment_writer.h @@ -146,11 +146,10 @@ class SegmentWriter { Status _write_raw_data(const std::vector& slices); void _maybe_invalid_row_cache(const std::string& key); std::string _encode_keys(const std::vector& key_columns, - size_t pos, bool null_first = true); + size_t pos); // used for unique-key with merge on write and segment min_max key std::string _full_encode_keys( - const std::vector& key_columns, size_t pos, - bool null_first = true); + const std::vector& key_columns, size_t pos); // used for unique-key with merge on write void _encode_seq_column(const vectorized::IOlapColumnDataAccessor* seq_column, size_t pos, string* encoded_keys); diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp new file mode 100644 index 00000000000000..23ae7cfd4240f7 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp @@ -0,0 +1,1008 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/vertical_segment_writer.h" + +#include +#include + +#include +#include +#include +#include +#include + +#include "cloud/config.h" +#include "common/compiler_util.h" // IWYU pragma: keep +#include "common/config.h" +#include "common/logging.h" // LOG +#include "gutil/port.h" +#include "io/fs/file_writer.h" +#include "olap/data_dir.h" +#include "olap/key_coder.h" +#include "olap/olap_common.h" +#include "olap/primary_key_index.h" +#include "olap/row_cursor.h" // RowCursor // IWYU pragma: keep +#include "olap/rowset/rowset_writer_context.h" // RowsetWriterContext +#include "olap/rowset/segment_v2/column_writer.h" // ColumnWriter +#include "olap/rowset/segment_v2/page_io.h" +#include "olap/rowset/segment_v2/page_pointer.h" +#include "olap/segment_loader.h" +#include "olap/short_key_index.h" +#include "olap/tablet_schema.h" +#include "olap/utils.h" +#include "runtime/memory/mem_tracker.h" +#include "service/point_query_executor.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/faststring.h" +#include "util/key_util.h" +#include "vec/columns/column_nullable.h" +#include "vec/common/schema_util.h" +#include "vec/core/block.h" +#include "vec/core/column_with_type_and_name.h" +#include "vec/core/types.h" +#include "vec/io/reader_buffer.h" +#include "vec/jsonb/serialize.h" +#include "vec/olap/olap_data_convertor.h" + +namespace doris { +namespace segment_v2 { + +using namespace ErrorCode; + +static const char* k_segment_magic = "D0R1"; +static const uint32_t k_segment_magic_length = 4; + +VerticalSegmentWriter::VerticalSegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, + TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, + DataDir* data_dir, uint32_t max_row_per_segment, + const VerticalSegmentWriterOptions& opts, + std::shared_ptr mow_context) + : _segment_id(segment_id), + _tablet_schema(std::move(tablet_schema)), + _tablet(std::move(tablet)), + _data_dir(data_dir), + _opts(opts), + _file_writer(file_writer), + _mem_tracker(std::make_unique("VerticalSegmentWriter:Segment-" + + std::to_string(segment_id))), + _mow_context(std::move(mow_context)) { + CHECK_NOTNULL(file_writer); + _num_key_columns = _tablet_schema->num_key_columns(); + _num_short_key_columns = _tablet_schema->num_short_key_columns(); + DCHECK(_num_key_columns >= _num_short_key_columns); + for (size_t cid = 0; cid < _num_key_columns; ++cid) { + const auto& column = _tablet_schema->column(cid); + _key_coders.push_back(get_key_coder(column.type())); + _key_index_size.push_back(column.index_length()); + } + // encode the sequence id into the primary key index + if (_tablet_schema->has_sequence_col() && _tablet_schema->keys_type() == UNIQUE_KEYS && + _opts.enable_unique_key_merge_on_write) { + const auto& column = _tablet_schema->column(_tablet_schema->sequence_col_idx()); + _seq_coder = get_key_coder(column.type()); + } +} + +VerticalSegmentWriter::~VerticalSegmentWriter() { + 
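+    // Return everything this writer still accounts for to its mem tracker.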
_mem_tracker->release(_mem_tracker->consumption());
+}
+
+void VerticalSegmentWriter::_init_column_meta(ColumnMetaPB* meta, uint32_t column_id,
+                                              const TabletColumn& column) {
+    meta->set_column_id(column_id);
+    meta->set_unique_id(column.unique_id());
+    meta->set_type(int(column.type()));
+    meta->set_length(column.length());
+    meta->set_encoding(DEFAULT_ENCODING);
+    meta->set_compression(_opts.compression_type);
+    meta->set_is_nullable(column.is_nullable());
+    for (uint32_t i = 0; i < column.get_subtype_count(); ++i) {
+        _init_column_meta(meta->add_children_columns(), column_id, column.get_sub_column(i));
+    }
+}
+
+Status VerticalSegmentWriter::_create_column_writer(uint32_t cid, const TabletColumn& column) {
+    ColumnWriterOptions opts;
+    opts.meta = _footer.add_columns();
+
+    _init_column_meta(opts.meta, cid, column);
+
+    // now we create zone maps for key columns in AGG_KEYS, or for all columns in UNIQUE_KEYS
+    // or DUP_KEYS; zone maps are not supported for the array and jsonb types.
+    opts.need_zone_map = (column.is_key() || _tablet_schema->keys_type() != KeysType::AGG_KEYS) &&
+                         column.type() != FieldType::OLAP_FIELD_TYPE_OBJECT;
+    opts.need_bloom_filter = column.is_bf_column();
+    auto* tablet_index = _tablet_schema->get_ngram_bf_index(column.unique_id());
+    if (tablet_index) {
+        opts.need_bloom_filter = true;
+        opts.is_ngram_bf_index = true;
+        opts.gram_size = tablet_index->get_gram_size();
+        opts.gram_bf_size = tablet_index->get_gram_bf_size();
+    }
+
+    opts.need_bitmap_index = column.has_bitmap_index();
+    bool skip_inverted_index = false;
+    if (_opts.rowset_ctx != nullptr) {
+        // skip writing the inverted index for index compaction
+        skip_inverted_index = _opts.rowset_ctx->skip_inverted_index.count(column.unique_id()) > 0;
+    }
+    // skip writing the inverted index on load if skip_write_index_on_load is true
+    if (_opts.write_type == DataWriteType::TYPE_DIRECT &&
+        _tablet_schema->skip_write_index_on_load()) {
+        skip_inverted_index = true;
+    }
+    // indexes for this column
+    opts.indexes = _tablet_schema->get_indexes_for_column(column.unique_id());
+    for (auto index : opts.indexes) {
+        if (!skip_inverted_index && index && index->index_type() == IndexType::INVERTED) {
+            opts.inverted_index = index;
+            // TODO support multiple inverted index
+            break;
+        }
+    }
+    if (column.type() == FieldType::OLAP_FIELD_TYPE_STRUCT) {
+        opts.need_zone_map = false;
+        if (opts.need_bloom_filter) {
+            return Status::NotSupported("Do not support bloom filter for struct type");
+        }
+        if (opts.need_bitmap_index) {
+            return Status::NotSupported("Do not support bitmap index for struct type");
+        }
+    }
+    if (column.type() == FieldType::OLAP_FIELD_TYPE_ARRAY) {
+        opts.need_zone_map = false;
+        if (opts.need_bloom_filter) {
+            return Status::NotSupported("Do not support bloom filter for array type");
+        }
+        if (opts.need_bitmap_index) {
+            return Status::NotSupported("Do not support bitmap index for array type");
+        }
+    }
+    if (column.type() == FieldType::OLAP_FIELD_TYPE_JSONB) {
+        opts.need_zone_map = false;
+        if (opts.need_bloom_filter) {
+            return Status::NotSupported("Do not support bloom filter for jsonb type");
+        }
+        if (opts.need_bitmap_index) {
+            return Status::NotSupported("Do not support bitmap index for jsonb type");
+        }
+    }
+    if (column.type() == FieldType::OLAP_FIELD_TYPE_AGG_STATE) {
+        opts.need_zone_map = false;
+        if (opts.need_bloom_filter) {
+            return Status::NotSupported("Do not support bloom filter for agg_state type");
+        }
+        if (opts.need_bitmap_index) {
+            return Status::NotSupported("Do not support bitmap index for agg_state type");
type"); + } + } + if (column.type() == FieldType::OLAP_FIELD_TYPE_MAP) { + opts.need_zone_map = false; + if (opts.need_bloom_filter) { + return Status::NotSupported("Do not support bloom filter for map type"); + } + if (opts.need_bitmap_index) { + return Status::NotSupported("Do not support bitmap index for map type"); + } + } + + if (column.is_row_store_column()) { + // smaller page size for row store column + opts.data_page_size = config::row_column_page_size; + } + + std::unique_ptr writer; + RETURN_IF_ERROR(ColumnWriter::create(opts, &column, _file_writer, &writer)); + RETURN_IF_ERROR(writer->init()); + _column_writers.push_back(std::move(writer)); + + _olap_data_convertor->add_column_data_convertor(column); + return Status::OK(); +}; + +Status VerticalSegmentWriter::init() { + DCHECK(_column_writers.empty()); + if (_opts.compression_type == UNKNOWN_COMPRESSION) { + _opts.compression_type = _tablet_schema->compression_type(); + } + _olap_data_convertor = std::make_unique(); + _olap_data_convertor->reserve(_tablet_schema->num_columns()); + _column_writers.reserve(_tablet_schema->columns().size()); + // we don't need the short key index for unique key merge on write table. + if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { + size_t seq_col_length = 0; + if (_tablet_schema->has_sequence_col()) { + seq_col_length = + _tablet_schema->column(_tablet_schema->sequence_col_idx()).length() + 1; + } + _primary_key_index_builder.reset(new PrimaryKeyIndexBuilder(_file_writer, seq_col_length)); + RETURN_IF_ERROR(_primary_key_index_builder->init()); + } else { + _short_key_index_builder.reset( + new ShortKeyIndexBuilder(_segment_id, _opts.num_rows_per_block)); + } + return Status::OK(); +} + +void VerticalSegmentWriter::_maybe_invalid_row_cache(const std::string& key) const { + // Just invalid row cache for simplicity, since the rowset is not visible at present. + // If we update/insert cache, if load failed rowset will not be visible but cached data + // will be visible, and lead to inconsistency. + if (!config::disable_storage_row_cache && _tablet_schema->store_row_column() && + _opts.write_type == DataWriteType::TYPE_DIRECT) { + // invalidate cache + RowCache::instance()->erase({_opts.rowset_ctx->tablet_id, key}); + } +} + +void VerticalSegmentWriter::_serialize_block_to_row_column(vectorized::Block& block) { + if (block.rows() == 0) { + return; + } + MonotonicStopWatch watch; + watch.start(); + // find row column id + int row_column_id = 0; + for (int i = 0; i < _tablet_schema->num_columns(); ++i) { + if (_tablet_schema->column(i).is_row_store_column()) { + row_column_id = i; + break; + } + } + if (row_column_id == 0) { + return; + } + auto* row_store_column = + static_cast(block.get_by_position(row_column_id) + .column->assume_mutable_ref() + .assume_mutable() + .get()); + row_store_column->clear(); + vectorized::DataTypeSerDeSPtrs serdes = + vectorized::create_data_type_serdes(block.get_data_types()); + vectorized::JsonbSerializeUtil::block_to_jsonb(*_tablet_schema, block, *row_store_column, + _tablet_schema->num_columns(), serdes); + VLOG_DEBUG << "serialize , num_rows:" << block.rows() << ", row_column_id:" << row_column_id + << ", total_byte_size:" << block.allocated_bytes() << ", serialize_cost(us)" + << watch.elapsed_time() / 1000; +} + +// for partial update, we should do following steps to fill content of block: +// 1. set block data to data convertor, and get all key_column's converted slice +// 2. 
+//    2.1 first find key location{rowset_id, segment_id, row_id}
+//    2.2 build read plan to read by batch
+//    2.3 fill block
+// 3. set columns to data convertor and then write all columns
+Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& data) {
+    if (config::cloud_mode) {
+        // TODO(plat1ko)
+        return Status::NotSupported("append_block_with_partial_content");
+    }
+    DCHECK(_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write);
+    DCHECK(_opts.rowset_ctx->partial_update_info != nullptr);
+
+    auto tablet = static_cast(_tablet.get());
+    // create full block and fill with input columns
+    auto full_block = _tablet_schema->create_block();
+    const auto& including_cids = _opts.rowset_ctx->partial_update_info->update_cids;
+    size_t input_id = 0;
+    for (auto i : including_cids) {
+        full_block.replace_by_position(i, data.block->get_by_position(input_id++).column);
+    }
+    _olap_data_convertor->set_source_content_with_specifid_columns(&full_block, data.row_pos,
+                                                                   data.num_rows, including_cids);
+
+    bool have_input_seq_column = false;
+    // write including columns
+    std::vector key_columns;
+    vectorized::IOlapColumnDataAccessor* seq_column = nullptr;
+    size_t segment_start_pos;
+    for (auto cid : including_cids) {
+        // here we get the segment column row num before appending data.
+        segment_start_pos = _column_writers[cid]->get_next_rowid();
+        // olap data convertor always starts from id = 0
+        auto [status, column] = _olap_data_convertor->convert_column_data(cid);
+        if (!status.ok()) {
+            return status;
+        }
+        if (cid < _num_key_columns) {
+            key_columns.push_back(column);
+        } else if (_tablet_schema->has_sequence_col() &&
+                   cid == _tablet_schema->sequence_col_idx()) {
+            seq_column = column;
+            have_input_seq_column = true;
+        }
+        RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(),
+                                                     data.num_rows));
+    }
+
+    bool has_default_or_nullable = false;
+    std::vector use_default_or_null_flag;
+    use_default_or_null_flag.reserve(data.num_rows);
+    const vectorized::Int8* delete_sign_column_data = nullptr;
+    if (const vectorized::ColumnWithTypeAndName* delete_sign_column =
+                full_block.try_get_by_name(DELETE_SIGN);
+        delete_sign_column != nullptr) {
+        auto& delete_sign_col =
+                reinterpret_cast(*(delete_sign_column->column));
+        if (delete_sign_col.size() >= data.row_pos + data.num_rows) {
+            delete_sign_column_data = delete_sign_col.get_data().data();
+        }
+    }
+
+    std::vector specified_rowsets;
+    {
+        std::shared_lock rlock(tablet->get_header_lock());
+        specified_rowsets = tablet->get_rowset_by_ids(&_mow_context->rowset_ids);
+    }
+    std::vector> segment_caches(specified_rowsets.size());
+    // locate rows in base data
+
+    int64_t num_rows_filtered = 0;
+    for (size_t block_pos = data.row_pos; block_pos < data.row_pos + data.num_rows; block_pos++) {
+        // block   segment
+        //   2   ->   0
+        //   3   ->   1
+        //   4   ->   2
+        //   5   ->   3
+        // here row_pos = 2, num_rows = 4.
+        size_t delta_pos = block_pos - data.row_pos;
+        size_t segment_pos = segment_start_pos + delta_pos;
+        std::string key = _full_encode_keys(key_columns, delta_pos);
+        _maybe_invalid_row_cache(key);
+        if (have_input_seq_column) {
+            _encode_seq_column(seq_column, delta_pos, &key);
+        }
+        // If the table has a sequence column and including_cids doesn't contain it,
+        // we need to update the primary key index builder at the end of this method.
+        // At that time, we have a valid sequence column to encode the key with seq col.
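+        // e.g. with two key columns and a sequence value, the key layout produced above is
+        //   [NORMAL|k1][NORMAL|k2][NORMAL|seq]   (see _encode_seq_column further down),
+        // where each part is a marker byte followed by the encoded value.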
+        if (!_tablet_schema->has_sequence_col() || have_input_seq_column) {
+            RETURN_IF_ERROR(_primary_key_index_builder->add_item(key));
+        }
+
+        // mark key with delete sign as deleted.
+        bool have_delete_sign =
+                (delete_sign_column_data != nullptr && delete_sign_column_data[block_pos] != 0);
+
+        RowLocation loc;
+        // save rowset shared ptr so this rowset wouldn't be deleted
+        RowsetSharedPtr rowset;
+        auto st = tablet->lookup_row_key(key, have_input_seq_column, specified_rowsets, &loc,
+                                         _mow_context->max_version, segment_caches, &rowset);
+        if (st.is()) {
+            if (_opts.rowset_ctx->partial_update_info->is_strict_mode) {
+                ++num_rows_filtered;
+                // delete the invalid newly inserted row
+                _mow_context->delete_bitmap->add({_opts.rowset_ctx->rowset_id, _segment_id,
+                                                  DeleteBitmap::TEMP_VERSION_COMMON},
+                                                 segment_pos);
+            }
+
+            if (!_opts.rowset_ctx->partial_update_info->can_insert_new_rows_in_partial_update) {
+                return Status::InternalError(
+                        "the unmentioned columns should have default value or be nullable for "
+                        "newly inserted rows in non-strict mode partial update");
+            }
+            has_default_or_nullable = true;
+            use_default_or_null_flag.emplace_back(true);
+            continue;
+        }
+        if (!st.ok() && !st.is()) {
+            LOG(WARNING) << "failed to lookup row key, error: " << st;
+            return st;
+        }
+
+        // if the delete sign is marked, it means that the value columns of the row
+        // will not be read. So we don't need to read the missing values from the previous rows.
+        // But we still need to mark the previous row on delete bitmap
+        if (have_delete_sign) {
+            has_default_or_nullable = true;
+            use_default_or_null_flag.emplace_back(true);
+        } else {
+            // partial update should not contain invisible columns
+            use_default_or_null_flag.emplace_back(false);
+            _rsid_to_rowset.emplace(rowset->rowset_id(), rowset);
+            tablet->prepare_to_read(loc, segment_pos, &_rssid_to_rid);
+        }
+
+        if (st.is()) {
+            // although we need to mark the current row as deleted, we still need to read the
+            // missing columns for this row to ensure that each column stays aligned
+            _mow_context->delete_bitmap->add(
+                    {_opts.rowset_ctx->rowset_id, _segment_id, DeleteBitmap::TEMP_VERSION_COMMON},
+                    segment_pos);
+        } else {
+            _mow_context->delete_bitmap->add(
+                    {loc.rowset_id, loc.segment_id, DeleteBitmap::TEMP_VERSION_COMMON}, loc.row_id);
+        }
+    }
+    CHECK(use_default_or_null_flag.size() == data.num_rows);
+
+    if (config::enable_merge_on_write_correctness_check) {
+        tablet->add_sentinel_mark_to_delete_bitmap(_mow_context->delete_bitmap.get(),
+                                                   _mow_context->rowset_ids);
+    }
+
+    // read and fill block
+    auto mutable_full_columns = full_block.mutate_columns();
+    RETURN_IF_ERROR(_fill_missing_columns(mutable_full_columns, use_default_or_null_flag,
+                                          has_default_or_nullable, segment_start_pos));
+    // row column should be filled here
+    if (_tablet_schema->store_row_column()) {
+        // convert block to row store format
+        _serialize_block_to_row_column(full_block);
+    }
+
+    // convert missing columns and send to column writer
+    const auto& missing_cids = _opts.rowset_ctx->partial_update_info->missing_cids;
+    _olap_data_convertor->set_source_content_with_specifid_columns(&full_block, data.row_pos,
+                                                                   data.num_rows, missing_cids);
+    for (auto cid : missing_cids) {
+        auto [status, column] = _olap_data_convertor->convert_column_data(cid);
+        if (!status.ok()) {
+            return status;
+        }
+        if (_tablet_schema->has_sequence_col() && !have_input_seq_column &&
+            cid == _tablet_schema->sequence_col_idx()) {
+            DCHECK_EQ(seq_column, nullptr);
+            seq_column = column;
+        }
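+        // Append the converted missing-column data to its column writer.
+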
RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(), + data.num_rows)); + } + + _num_rows_filtered += num_rows_filtered; + if (_tablet_schema->has_sequence_col() && !have_input_seq_column) { + DCHECK_NE(seq_column, nullptr); + DCHECK_EQ(_num_rows_written, data.row_pos) + << "_num_rows_written: " << _num_rows_written << ", row_pos" << data.row_pos; + DCHECK_EQ(_primary_key_index_builder->num_rows(), _num_rows_written) + << "primary key index builder num rows(" << _primary_key_index_builder->num_rows() + << ") not equal to segment writer's num rows written(" << _num_rows_written << ")"; + if (_num_rows_written != data.row_pos || + _primary_key_index_builder->num_rows() != _num_rows_written) { + return Status::InternalError( + "Correctness check failed, _num_rows_written: {}, row_pos: {}, primary key " + "index builder num rows: {}", + _num_rows_written, data.row_pos, _primary_key_index_builder->num_rows()); + } + for (size_t block_pos = data.row_pos; block_pos < data.row_pos + data.num_rows; + block_pos++) { + std::string key = _full_encode_keys(key_columns, block_pos - data.row_pos); + _encode_seq_column(seq_column, block_pos - data.row_pos, &key); + RETURN_IF_ERROR(_primary_key_index_builder->add_item(key)); + } + } + + _num_rows_written += data.num_rows; + DCHECK_EQ(_primary_key_index_builder->num_rows(), _num_rows_written) + << "primary key index builder num rows(" << _primary_key_index_builder->num_rows() + << ") not equal to segment writer's num rows written(" << _num_rows_written << ")"; + _olap_data_convertor->clear_source_content(); + return Status::OK(); +} + +Status VerticalSegmentWriter::_fill_missing_columns( + vectorized::MutableColumns& mutable_full_columns, + const std::vector& use_default_or_null_flag, bool has_default_or_nullable, + const size_t& segment_start_pos) { + if (config::cloud_mode) [[unlikely]] { + return Status::NotSupported("fill_missing_columns"); + } + auto tablet = static_cast(_tablet.get()); + // create old value columns + const auto& missing_cids = _opts.rowset_ctx->partial_update_info->missing_cids; + auto old_value_block = _tablet_schema->create_block_by_cids(missing_cids); + CHECK(missing_cids.size() == old_value_block.columns()); + auto mutable_old_columns = old_value_block.mutate_columns(); + bool has_row_column = _tablet_schema->store_row_column(); + // record real pos, key is input line num, value is old_block line num + std::map read_index; + size_t read_idx = 0; + for (auto rs_it : _rssid_to_rid) { + for (auto seg_it : rs_it.second) { + auto rowset = _rsid_to_rowset[rs_it.first]; + CHECK(rowset); + std::vector rids; + for (auto id_and_pos : seg_it.second) { + rids.emplace_back(id_and_pos.rid); + read_index[id_and_pos.pos] = read_idx++; + } + if (has_row_column) { + auto st = tablet->fetch_value_through_row_column( + rowset, *_tablet_schema, seg_it.first, rids, missing_cids, old_value_block); + if (!st.ok()) { + LOG(WARNING) << "failed to fetch value through row column"; + return st; + } + continue; + } + for (size_t cid = 0; cid < mutable_old_columns.size(); ++cid) { + TabletColumn tablet_column = _tablet_schema->column(missing_cids[cid]); + auto st = tablet->fetch_value_by_rowids(rowset, seg_it.first, rids, tablet_column, + mutable_old_columns[cid]); + // set read value to output block + if (!st.ok()) { + LOG(WARNING) << "failed to fetch value by rowids"; + return st; + } + } + } + } + // build default value columns + auto default_value_block = old_value_block.clone_empty(); + auto mutable_default_value_columns 
= default_value_block.mutate_columns();
+
+    const vectorized::Int8* delete_sign_column_data = nullptr;
+    if (const vectorized::ColumnWithTypeAndName* delete_sign_column =
+                old_value_block.try_get_by_name(DELETE_SIGN);
+        delete_sign_column != nullptr && _tablet_schema->has_sequence_col()) {
+        auto& delete_sign_col =
+                reinterpret_cast(*(delete_sign_column->column));
+        delete_sign_column_data = delete_sign_col.get_data().data();
+    }
+
+    if (has_default_or_nullable || delete_sign_column_data != nullptr) {
+        for (auto i = 0; i < missing_cids.size(); ++i) {
+            const auto& column = _tablet_schema->column(missing_cids[i]);
+            if (column.has_default_value()) {
+                auto default_value = _tablet_schema->column(missing_cids[i]).default_value();
+                vectorized::ReadBuffer rb(const_cast(default_value.c_str()),
+                                          default_value.size());
+                RETURN_IF_ERROR(old_value_block.get_by_position(i).type->from_string(
+                        rb, mutable_default_value_columns[i].get()));
+            }
+        }
+    }
+
+    // fill all missing values from mutable_old_columns; need to consider default values and null values
+    for (auto idx = 0; idx < use_default_or_null_flag.size(); idx++) {
+        // `use_default_or_null_flag[idx] == true` doesn't mean that we should read values from the old row
+        // for the missing columns. For example, if a table has sequence column, the rows with DELETE_SIGN column
+        // marked will not be marked in delete bitmap(see https://github.com/apache/doris/pull/24011), so it will
+        // be found in Tablet::lookup_row_key() and `use_default_or_null_flag[idx]` will be false. But we should not
+        // read values from old rows for missing values in this occasion. So we should read the DELETE_SIGN column
+        // to check if a row REALLY exists in the table.
+        if (use_default_or_null_flag[idx] ||
+            (delete_sign_column_data != nullptr &&
+             delete_sign_column_data[read_index[idx + segment_start_pos]] != 0)) {
+            for (auto i = 0; i < missing_cids.size(); ++i) {
+                // if the column has a default value, fill it with the default value;
+                // otherwise, if the column is nullable, fill it with null value
+                const auto& tablet_column = _tablet_schema->column(missing_cids[i]);
+                if (tablet_column.has_default_value()) {
+                    mutable_full_columns[missing_cids[i]]->insert_from(
+                            *mutable_default_value_columns[i].get(), 0);
+                } else if (tablet_column.is_nullable()) {
+                    auto nullable_column = assert_cast(
+                            mutable_full_columns[missing_cids[i]].get());
+                    nullable_column->insert_null_elements(1);
+                } else {
+                    // If the control flow reaches this branch, the column neither has default value
+                    // nor is nullable. It means that the row's delete sign is marked, and the value
+                    // columns are useless and won't be read. So we can just put arbitrary values in the cells
+                    mutable_full_columns[missing_cids[i]]->insert_default();
+                }
+            }
+            continue;
+        }
+        auto pos_in_old_block = read_index[idx + segment_start_pos];
+        for (auto i = 0; i < missing_cids.size(); ++i) {
+            mutable_full_columns[missing_cids[i]]->insert_from(
+                    *old_value_block.get_columns_with_type_and_name()[i].column.get(),
+                    pos_in_old_block);
+        }
+    }
+    return Status::OK();
+}
+
+Status VerticalSegmentWriter::batch_block(const vectorized::Block* block, size_t row_pos,
+                                          size_t num_rows) {
+    if (_opts.rowset_ctx->partial_update_info &&
+        _opts.rowset_ctx->partial_update_info->is_partial_update &&
+        _opts.write_type == DataWriteType::TYPE_DIRECT &&
+        !_opts.rowset_ctx->is_transient_rowset_writer) {
+        if (block->columns() <= _tablet_schema->num_key_columns() ||
+            block->columns() >= _tablet_schema->num_columns()) {
+            return Status::InternalError(fmt::format(
+                    "illegal partial update block columns: {}, num key columns: {}, total "
+                    "schema columns: {}",
+                    block->columns(), _tablet_schema->num_key_columns(),
+                    _tablet_schema->num_columns()));
+        }
+    } else if (block->columns() != _tablet_schema->num_columns()) {
+        return Status::InternalError(
+                "illegal block columns, block columns = {}, tablet_schema columns = {}",
+                block->columns(), _tablet_schema->num_columns());
+    }
+    _batched_blocks.emplace_back(block, row_pos, num_rows);
+    return Status::OK();
+}
+
+Status VerticalSegmentWriter::write_batch() {
+    if (_opts.rowset_ctx->partial_update_info &&
+        _opts.rowset_ctx->partial_update_info->is_partial_update &&
+        _opts.write_type == DataWriteType::TYPE_DIRECT &&
+        !_opts.rowset_ctx->is_transient_rowset_writer) {
+        for (uint32_t cid = 0; cid < _tablet_schema->num_columns(); ++cid) {
+            RETURN_IF_ERROR(_create_column_writer(cid, _tablet_schema->column(cid)));
+        }
+        for (auto& data : _batched_blocks) {
+            RETURN_IF_ERROR(_append_block_with_partial_content(data));
+        }
+        for (auto& column_writer : _column_writers) {
+            RETURN_IF_ERROR(column_writer->finish());
+            RETURN_IF_ERROR(column_writer->write_data());
+        }
+        return Status::OK();
+    }
+    // Row column should be filled here when it's a direct write from the memtable
+    // or a schema change write (since column data types may have changed, we should rebuild)
+    if (_tablet_schema->store_row_column() &&
+        (_opts.write_type == DataWriteType::TYPE_DIRECT ||
+         _opts.write_type == DataWriteType::TYPE_SCHEMA_CHANGE)) {
+        for (auto& data : _batched_blocks) {
+            // TODO: maybe we should pass range to this method
+            _serialize_block_to_row_column(*const_cast(data.block));
+        }
+    }
+
+    std::vector key_columns;
+    vectorized::IOlapColumnDataAccessor* seq_column = nullptr;
+    for (uint32_t cid = 0; cid < _tablet_schema->num_columns(); ++cid) {
+        RETURN_IF_ERROR(_create_column_writer(cid, _tablet_schema->column(cid)));
+        for (auto& data : _batched_blocks) {
+            _olap_data_convertor->set_source_content_with_specifid_columns(
+                    data.block, data.row_pos, data.num_rows, std::vector {cid});
+
+            // convert column data from engine format to storage layer format
+            auto [status, column] = _olap_data_convertor->convert_column_data(cid);
+            if (!status.ok()) {
+                return status;
+            }
+            if (cid < _num_key_columns) {
+                key_columns.push_back(column);
+            } else if (_tablet_schema->has_sequence_col() &&
+                       cid == _tablet_schema->sequence_col_idx()) {
+                seq_column = column;
+            }
+            RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(),
+                                                         data.num_rows));
+            _olap_data_convertor->clear_source_content();
+        }
+        if (_data_dir != nullptr &&
+            _data_dir->reach_capacity_limit(_column_writers[cid]->estimate_buffer_size())) {
+            return Status::Error("disk {} exceed capacity limit.",
+                                 _data_dir->path_hash());
+        }
+        RETURN_IF_ERROR(_column_writers[cid]->finish());
+        RETURN_IF_ERROR(_column_writers[cid]->write_data());
+    }
+
+    for (auto& data : _batched_blocks) {
+        _olap_data_convertor->set_source_content(data.block, data.row_pos, data.num_rows);
+        // find all row pos for short key indexes
+        std::vector short_key_pos;
+        // We build a short key index entry every `_opts.num_rows_per_block` rows. Specifically,
+        // we use the 1st row of the first block and the rows at
+        // `_short_key_row_pos - _num_rows_written` for subsequent blocks.
+        if (_short_key_row_pos == 0 && _num_rows_written == 0) {
+            short_key_pos.push_back(0);
+        }
+        while (_short_key_row_pos + _opts.num_rows_per_block < _num_rows_written + data.num_rows) {
+            _short_key_row_pos += _opts.num_rows_per_block;
+            short_key_pos.push_back(_short_key_row_pos - _num_rows_written);
+        }
+        if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) {
+            // create primary indexes
+            std::string last_key;
+            for (size_t pos = 0; pos < data.num_rows; pos++) {
+                std::string key = _full_encode_keys(key_columns, pos);
+                _maybe_invalid_row_cache(key);
+                if (_tablet_schema->has_sequence_col()) {
+                    _encode_seq_column(seq_column, pos, &key);
+                }
+                DCHECK(key.compare(last_key) > 0)
+                        << "found duplicate key or key is not sorted! current key: " << key
+                        << ", last key" << last_key;
+                RETURN_IF_ERROR(_primary_key_index_builder->add_item(key));
+                last_key = std::move(key);
+            }
+        } else {
+            // create short key indexes
+            // for min_max key
+            _set_min_key(_full_encode_keys(key_columns, 0));
+            _set_max_key(_full_encode_keys(key_columns, data.num_rows - 1));
+
+            key_columns.resize(_num_short_key_columns);
+            for (const auto pos : short_key_pos) {
+                RETURN_IF_ERROR(_short_key_index_builder->add_item(_encode_keys(key_columns, pos)));
+            }
+        }
+        _olap_data_convertor->clear_source_content();
+        _num_rows_written += data.num_rows;
+    }
+
+    _batched_blocks.clear();
+    return Status::OK();
+}
+
+std::string VerticalSegmentWriter::_full_encode_keys(
+        const std::vector& key_columns, size_t pos) {
+    assert(_key_index_size.size() == _num_key_columns);
+    assert(key_columns.size() == _num_key_columns && _key_coders.size() == _num_key_columns);
+
+    std::string encoded_keys;
+    size_t cid = 0;
+    for (const auto& column : key_columns) {
+        auto field = column->get_data_at(pos);
+        if (UNLIKELY(!field)) {
+            encoded_keys.push_back(KEY_NULL_FIRST_MARKER);
+            ++cid;
+            continue;
+        }
+        encoded_keys.push_back(KEY_NORMAL_MARKER);
+        _key_coders[cid]->full_encode_ascending(field, &encoded_keys);
+        ++cid;
+    }
+    return encoded_keys;
+}
+
+void VerticalSegmentWriter::_encode_seq_column(
+        const vectorized::IOlapColumnDataAccessor* seq_column, size_t pos, string* encoded_keys) {
+    const auto* field = seq_column->get_data_at(pos);
+    // To facilitate the use of the primary key index, encode the seq column
+    // to the minimum value of the corresponding length when the seq column
+    // is null
+    if (UNLIKELY(!field)) {
+        encoded_keys->push_back(KEY_NULL_FIRST_MARKER);
+        size_t seq_col_length = _tablet_schema->column(_tablet_schema->sequence_col_idx()).length();
+        encoded_keys->append(seq_col_length, KEY_MINIMAL_MARKER);
+        return;
+    }
+    encoded_keys->push_back(KEY_NORMAL_MARKER);
+    _seq_coder->full_encode_ascending(field, encoded_keys);
+}
+
+std::string VerticalSegmentWriter::_encode_keys(
+        const std::vector& key_columns, size_t pos) {
+    assert(key_columns.size() == _num_short_key_columns);
+
+    std::string encoded_keys;
+    size_t cid = 0;
+    for (const auto& column : key_columns) {
+        auto field = column->get_data_at(pos);
+        if (UNLIKELY(!field)) {
+            encoded_keys.push_back(KEY_NULL_FIRST_MARKER);
+            ++cid;
+            continue;
+        }
+        encoded_keys.push_back(KEY_NORMAL_MARKER);
+        _key_coders[cid]->encode_ascending(field, _key_index_size[cid], &encoded_keys);
+        ++cid;
+    }
+    return encoded_keys;
+}
+
+// TODO(lingbin): Currently this function does not include the size of various indexes;
+// we should make this more precise.
+uint64_t VerticalSegmentWriter::_estimated_remaining_size() {
+    // footer_size(4) + checksum(4) + segment_magic(4)
+    uint64_t size = 12;
+    if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) {
+        size += _primary_key_index_builder->size();
+    } else {
+        size += _short_key_index_builder->size();
+    }
+
+    // update the mem_tracker of segment size
+    _mem_tracker->consume(size - _mem_tracker->consumption());
+    return size;
+}
+
+size_t VerticalSegmentWriter::_calculate_inverted_index_file_size() {
+    size_t total_size = 0;
+    for (auto& column_writer : _column_writers) {
+        total_size += column_writer->get_inverted_index_size();
+    }
+    return total_size;
+}
+
+Status VerticalSegmentWriter::finalize_columns_index(uint64_t* index_size) {
+    uint64_t index_start = _file_writer->bytes_appended();
+    RETURN_IF_ERROR(_write_ordinal_index());
+    RETURN_IF_ERROR(_write_zone_map());
+    RETURN_IF_ERROR(_write_bitmap_index());
+    RETURN_IF_ERROR(_write_inverted_index());
+    RETURN_IF_ERROR(_write_bloom_filter_index());
+
+    *index_size = _file_writer->bytes_appended() - index_start;
+    if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) {
+        RETURN_IF_ERROR(_write_primary_key_index());
+        // IndexedColumnWriter writes data pages mixed with segment data, so we should use
+        // the stat from the primary key index builder.
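+        // (bytes_appended() - index_start alone would undercount here, since the primary
+        // key index pages are interleaved with data pages rather than appended at the end.)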
+        *index_size += _primary_key_index_builder->disk_size();
+    } else {
+        RETURN_IF_ERROR(_write_short_key_index());
+        *index_size = _file_writer->bytes_appended() - index_start;
+    }
+    _inverted_index_file_size = _calculate_inverted_index_file_size();
+    // reset all column writers and data_convertor
+    clear();
+
+    return Status::OK();
+}
+
+Status VerticalSegmentWriter::finalize_footer(uint64_t* segment_file_size) {
+    RETURN_IF_ERROR(_write_footer());
+    // finish
+    RETURN_IF_ERROR(_file_writer->finalize());
+    *segment_file_size = _file_writer->bytes_appended();
+    if (*segment_file_size == 0) {
+        return Status::Corruption("Bad segment, file size = 0");
+    }
+    return Status::OK();
+}
+
+Status VerticalSegmentWriter::finalize(uint64_t* segment_file_size, uint64_t* index_size) {
+    MonotonicStopWatch timer;
+    timer.start();
+    // check disk capacity
+    if (_data_dir != nullptr &&
+        _data_dir->reach_capacity_limit((int64_t)_estimated_remaining_size())) {
+        return Status::Error("disk {} exceed capacity limit.",
+                             _data_dir->path_hash());
+    }
+    _row_count = _num_rows_written;
+    _num_rows_written = 0;
+    // write index
+    RETURN_IF_ERROR(finalize_columns_index(index_size));
+    // write footer
+    RETURN_IF_ERROR(finalize_footer(segment_file_size));
+
+    if (timer.elapsed_time() > 5000000000L) {
+        LOG(INFO) << "segment flush consumes a lot of time, time_ns " << timer.elapsed_time()
+                  << ", segment_size " << *segment_file_size;
+    }
+    return Status::OK();
+}
+
+void VerticalSegmentWriter::clear() {
+    for (auto& column_writer : _column_writers) {
+        column_writer.reset();
+    }
+    _column_writers.clear();
+    _olap_data_convertor.reset();
+}
+
+// write ordinal index after data has been written
+Status VerticalSegmentWriter::_write_ordinal_index() {
+    for (auto& column_writer : _column_writers) {
+        RETURN_IF_ERROR(column_writer->write_ordinal_index());
+    }
+    return Status::OK();
+}
+
+Status VerticalSegmentWriter::_write_zone_map() {
+    for (auto& column_writer : _column_writers) {
+        RETURN_IF_ERROR(column_writer->write_zone_map());
+    }
+    return Status::OK();
+}
+
+Status VerticalSegmentWriter::_write_bitmap_index() {
+    for (auto& column_writer : _column_writers) {
+        RETURN_IF_ERROR(column_writer->write_bitmap_index());
+    }
+    return Status::OK();
+}
+
+Status VerticalSegmentWriter::_write_inverted_index() {
+    for (auto& column_writer : _column_writers) {
+        RETURN_IF_ERROR(column_writer->write_inverted_index());
+    }
+    return Status::OK();
+}
+
+Status VerticalSegmentWriter::_write_bloom_filter_index() {
+    for (auto& column_writer : _column_writers) {
+        RETURN_IF_ERROR(column_writer->write_bloom_filter_index());
+    }
+    return Status::OK();
+}
+
+Status VerticalSegmentWriter::_write_short_key_index() {
+    std::vector body;
+    PageFooterPB footer;
+    RETURN_IF_ERROR(_short_key_index_builder->finalize(_row_count, &body, &footer));
+    PagePointer pp;
+    // short key index page is not compressed right now
+    RETURN_IF_ERROR(PageIO::write_page(_file_writer, body, footer, &pp));
+    pp.to_proto(_footer.mutable_short_key_index_page());
+    return Status::OK();
+}
+
+Status VerticalSegmentWriter::_write_primary_key_index() {
+    CHECK(_primary_key_index_builder->num_rows() == _row_count);
+    return _primary_key_index_builder->finalize(_footer.mutable_primary_key_index_meta());
+}
+
+Status VerticalSegmentWriter::_write_footer() {
+    _footer.set_num_rows(_row_count);
+
+    // Footer := SegmentFooterPB, FooterPBSize(4), FooterPBChecksum(4), MagicNumber(4)
+    std::string footer_buf;
+    if (!_footer.SerializeToString(&footer_buf)) {
+        return
Status::InternalError("failed to serialize segment footer"); + } + + faststring fixed_buf; + // footer's size + put_fixed32_le(&fixed_buf, footer_buf.size()); + // footer's checksum + uint32_t checksum = crc32c::Value(footer_buf.data(), footer_buf.size()); + put_fixed32_le(&fixed_buf, checksum); + // Append magic number. we don't write magic number in the header because + // that will need an extra seek when reading + fixed_buf.append(k_segment_magic, k_segment_magic_length); + + std::vector slices {footer_buf, fixed_buf}; + return _write_raw_data(slices); +} + +Status VerticalSegmentWriter::_write_raw_data(const std::vector& slices) { + RETURN_IF_ERROR(_file_writer->appendv(&slices[0], slices.size())); + return Status::OK(); +} + +Slice VerticalSegmentWriter::min_encoded_key() { + return (_primary_key_index_builder == nullptr) ? Slice(_min_key.data(), _min_key.size()) + : _primary_key_index_builder->min_key(); +} +Slice VerticalSegmentWriter::max_encoded_key() { + return (_primary_key_index_builder == nullptr) ? Slice(_max_key.data(), _max_key.size()) + : _primary_key_index_builder->max_key(); +} + +void VerticalSegmentWriter::_set_min_max_key(const Slice& key) { + if (UNLIKELY(_is_first_row)) { + _min_key.append(key.get_data(), key.get_size()); + _is_first_row = false; + } + if (key.compare(_max_key) > 0) { + _max_key.clear(); + _max_key.append(key.get_data(), key.get_size()); + } +} + +void VerticalSegmentWriter::_set_min_key(const Slice& key) { + if (UNLIKELY(_is_first_row)) { + _min_key.append(key.get_data(), key.get_size()); + _is_first_row = false; + } +} + +void VerticalSegmentWriter::_set_max_key(const Slice& key) { + _max_key.clear(); + _max_key.append(key.get_data(), key.get_size()); +} + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.h b/be/src/olap/rowset/segment_v2/vertical_segment_writer.h new file mode 100644 index 00000000000000..773751934b8482 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.h @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
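+
+// A VerticalSegmentWriter batches caller-owned blocks via batch_block() and then, in
+// write_batch(), converts and flushes them one column at a time, so only the current
+// column's writer needs to buffer data at any moment.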
+ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include // unique_ptr +#include +#include +#include + +#include "common/status.h" // Status +#include "gutil/macros.h" +#include "gutil/strings/substitute.h" +#include "olap/olap_define.h" +#include "olap/rowset/segment_v2/column_writer.h" +#include "olap/tablet.h" +#include "olap/tablet_schema.h" +#include "util/faststring.h" +#include "util/slice.h" + +namespace doris { +namespace vectorized { +class Block; +class IOlapColumnDataAccessor; +class OlapBlockDataConvertor; +} // namespace vectorized + +class DataDir; +class MemTracker; +class ShortKeyIndexBuilder; +class PrimaryKeyIndexBuilder; +class KeyCoder; +struct RowsetWriterContext; + +namespace io { +class FileWriter; +} // namespace io + +namespace segment_v2 { + +struct VerticalSegmentWriterOptions { + uint32_t num_rows_per_block = 1024; + bool enable_unique_key_merge_on_write = false; + CompressionTypePB compression_type = UNKNOWN_COMPRESSION; + + RowsetWriterContext* rowset_ctx = nullptr; + DataWriteType write_type = DataWriteType::TYPE_DEFAULT; +}; + +struct RowsInBlock { + const vectorized::Block* block; + size_t row_pos; + size_t num_rows; +}; + +class VerticalSegmentWriter { +public: + explicit VerticalSegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, + TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, + DataDir* data_dir, uint32_t max_row_per_segment, + const VerticalSegmentWriterOptions& opts, + std::shared_ptr mow_context); + ~VerticalSegmentWriter(); + + VerticalSegmentWriter(const VerticalSegmentWriter&) = delete; + const VerticalSegmentWriter& operator=(const VerticalSegmentWriter&) = delete; + + Status init(); + + // Add one block to batch, memory is owned by the caller. + // The batched blocks will be flushed in write_batch. + // Once write_batch is called, no more blocks should be added. + Status batch_block(const vectorized::Block* block, size_t row_pos, size_t num_rows); + Status write_batch(); + + [[nodiscard]] std::string data_dir_path() const { + return _data_dir == nullptr ? 
"" : _data_dir->path(); + } + [[nodiscard]] size_t inverted_index_file_size() const { return _inverted_index_file_size; } + [[nodiscard]] uint32_t num_rows_written() const { return _num_rows_written; } + [[nodiscard]] int64_t num_rows_filtered() const { return _num_rows_filtered; } + [[nodiscard]] uint32_t row_count() const { return _row_count; } + [[nodiscard]] uint32_t segment_id() const { return _segment_id; } + + Status finalize(uint64_t* segment_file_size, uint64_t* index_size); + + Status finalize_columns_index(uint64_t* index_size); + Status finalize_footer(uint64_t* segment_file_size); + + Slice min_encoded_key(); + Slice max_encoded_key(); + + void clear(); + +private: + void _init_column_meta(ColumnMetaPB* meta, uint32_t column_id, const TabletColumn& column); + Status _create_column_writer(uint32_t cid, const TabletColumn& column); + size_t _calculate_inverted_index_file_size(); + uint64_t _estimated_remaining_size(); + Status _write_ordinal_index(); + Status _write_zone_map(); + Status _write_bitmap_index(); + Status _write_inverted_index(); + Status _write_bloom_filter_index(); + Status _write_short_key_index(); + Status _write_primary_key_index(); + Status _write_footer(); + Status _write_raw_data(const std::vector& slices); + void _maybe_invalid_row_cache(const std::string& key) const; + std::string _encode_keys(const std::vector& key_columns, + size_t pos); + // used for unique-key with merge on write and segment min_max key + std::string _full_encode_keys( + const std::vector& key_columns, size_t pos); + // used for unique-key with merge on write + void _encode_seq_column(const vectorized::IOlapColumnDataAccessor* seq_column, size_t pos, + string* encoded_keys); + void _set_min_max_key(const Slice& key); + void _set_min_key(const Slice& key); + void _set_max_key(const Slice& key); + void _serialize_block_to_row_column(vectorized::Block& block); + Status _append_block_with_partial_content(RowsInBlock& data); + Status _fill_missing_columns(vectorized::MutableColumns& mutable_full_columns, + const std::vector& use_default_or_null_flag, + bool has_default_or_nullable, const size_t& segment_start_pos); + +private: + uint32_t _segment_id; + TabletSchemaSPtr _tablet_schema; + BaseTabletSPtr _tablet; + DataDir* _data_dir; + VerticalSegmentWriterOptions _opts; + + // Not owned. owned by RowsetWriter + io::FileWriter* _file_writer; + + SegmentFooterPB _footer; + size_t _num_key_columns; + size_t _num_short_key_columns; + size_t _inverted_index_file_size; + std::unique_ptr _short_key_index_builder; + std::unique_ptr _primary_key_index_builder; + std::vector> _column_writers; + std::unique_ptr _mem_tracker; + + std::unique_ptr _olap_data_convertor; + // used for building short key index or primary key index during vectorized write. 
+ std::vector _key_coders; + const KeyCoder* _seq_coder = nullptr; + std::vector _key_index_size; + size_t _short_key_row_pos = 0; + + // _num_rows_written means row count already written in this current column group + uint32_t _num_rows_written = 0; + // number of rows filtered in strict mode partial update + int64_t _num_rows_filtered = 0; + // _row_count means total row count of this segment + // In vertical compaction row count is recorded when key columns group finish + // and _num_rows_written will be updated in value column group + uint32_t _row_count = 0; + + bool _is_first_row = true; + faststring _min_key; + faststring _max_key; + + std::shared_ptr _mow_context; + // group every rowset-segment row id to speed up reader + PartialUpdateReadPlan _rssid_to_rid; + std::map _rsid_to_rowset; + + std::vector _batched_blocks; +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/olap/rowset/unique_rowset_id_generator.cpp b/be/src/olap/rowset/unique_rowset_id_generator.cpp index 0d7d77915e0cb5..0ac7f63837a099 100644 --- a/be/src/olap/rowset/unique_rowset_id_generator.cpp +++ b/be/src/olap/rowset/unique_rowset_id_generator.cpp @@ -17,66 +17,18 @@ #include "olap/rowset/unique_rowset_id_generator.h" -#include -#include - -#include "util/doris_metrics.h" -#include "util/metrics.h" -#include "util/spinlock.h" -#include "util/uid_util.h" - namespace doris { -DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(rowset_count_generated_and_in_use, MetricUnit::ROWSETS); - UniqueRowsetIdGenerator::UniqueRowsetIdGenerator(const UniqueId& backend_uid) - : _backend_uid(backend_uid), _inc_id(0) { - REGISTER_HOOK_METRIC(rowset_count_generated_and_in_use, [this]() { - std::lock_guard l(_lock); - return _valid_rowset_id_hi.size(); - }); -} + : _backend_uid(backend_uid), _inc_id(1) {} -UniqueRowsetIdGenerator::~UniqueRowsetIdGenerator() { - DEREGISTER_HOOK_METRIC(rowset_count_generated_and_in_use); -} +UniqueRowsetIdGenerator::~UniqueRowsetIdGenerator() = default; -// generate a unique rowset id and save it in a set to check whether it is valid in the future RowsetId UniqueRowsetIdGenerator::next_id() { RowsetId rowset_id; - rowset_id.init(_version, ++_inc_id, _backend_uid.hi, _backend_uid.lo); - { - std::lock_guard l(_lock); - _valid_rowset_id_hi.insert(rowset_id.hi); - } + rowset_id.init(_version, _inc_id.fetch_add(1, std::memory_order_relaxed), _backend_uid.hi, + _backend_uid.lo); return rowset_id; } -bool UniqueRowsetIdGenerator::id_in_use(const RowsetId& rowset_id) const { - // if rowset_id == 1, then it is an old version rowsetid, not gc it - // because old version rowset id is not generated by this code, so that not delete them - if (rowset_id.version < _version) { - return true; - } - if ((rowset_id.mi != _backend_uid.hi) || (rowset_id.lo != _backend_uid.lo)) { - return false; - } - - std::lock_guard l(_lock); - return _valid_rowset_id_hi.count(rowset_id.hi) == 1; -} - -void UniqueRowsetIdGenerator::release_id(const RowsetId& rowset_id) { - // Only release the rowsetid generated after this startup. 
- // So we need to check version/mid/low part first - if (rowset_id.version < _version) { - return; - } - if ((rowset_id.mi != _backend_uid.hi) || (rowset_id.lo != _backend_uid.lo)) { - return; - } - std::lock_guard l(_lock); - _valid_rowset_id_hi.erase(rowset_id.hi); -} - } // namespace doris diff --git a/be/src/olap/rowset/unique_rowset_id_generator.h b/be/src/olap/rowset/unique_rowset_id_generator.h index 051eefd245fb6a..2bbf65b47d114e 100644 --- a/be/src/olap/rowset/unique_rowset_id_generator.h +++ b/be/src/olap/rowset/unique_rowset_id_generator.h @@ -17,14 +17,9 @@ #pragma once -#include - #include -#include -#include "olap/olap_common.h" #include "olap/rowset/rowset_id_generator.h" -#include "util/spinlock.h" #include "util/uid_util.h" namespace doris { @@ -34,27 +29,14 @@ class UniqueRowsetIdGenerator : public RowsetIdGenerator { UniqueRowsetIdGenerator(const UniqueId& backend_uid); ~UniqueRowsetIdGenerator() override; - UniqueRowsetIdGenerator(const UniqueRowsetIdGenerator&) = delete; - UniqueRowsetIdGenerator& operator=(const UniqueRowsetIdGenerator&) = delete; - RowsetId next_id() override; - bool id_in_use(const RowsetId& rowset_id) const override; - - void release_id(const RowsetId& rowset_id) override; - private: - mutable SpinLock _lock; const UniqueId _backend_uid; const int64_t _version = 2; // modify it when create new version id generator // A monotonically increasing integer generator, // This integer will be part of a rowset id. std::atomic _inc_id; - // Save the high part of rowset ids generated since last process startup. - // Therefore, we cannot strictly rely on _valid_rowset_id_hi - // to determine whether the rowset id is being used. - // But to use id_in_use() and release_id() to check it. - std::unordered_set _valid_rowset_id_hi; }; // UniqueRowsetIdGenerator } // namespace doris diff --git a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp index 59c1afc930719f..cd9fee5fb4ee35 100644 --- a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp +++ b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp @@ -27,7 +27,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/logging.h" #include "io/fs/file_system.h" diff --git a/be/src/olap/rowset/vertical_beta_rowset_writer.h b/be/src/olap/rowset/vertical_beta_rowset_writer.h index 8e318686db97b2..8251ad0a07ebe0 100644 --- a/be/src/olap/rowset/vertical_beta_rowset_writer.h +++ b/be/src/olap/rowset/vertical_beta_rowset_writer.h @@ -49,6 +49,8 @@ class VerticalBetaRowsetWriter : public BetaRowsetWriter { int64_t num_rows() const override { return _total_key_group_rows; } + virtual const RowsetWriterContext& context() const override { LOG(FATAL) << "Not implemented"; } + private: // only key group will create segment writer Status _create_segment_writer(const std::vector& column_ids, bool is_key, diff --git a/be/src/olap/rowset_builder.cpp b/be/src/olap/rowset_builder.cpp index afe7fd385ff235..219d344f6253fc 100644 --- a/be/src/olap/rowset_builder.cpp +++ b/be/src/olap/rowset_builder.cpp @@ -26,18 +26,19 @@ #include #include -// IWYU pragma: no_include #include "cloud/config.h" #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/status.h" #include "exec/tablet_info.h" #include "gutil/strings/numbers.h" +#include "io/fs/file_system.h" #include "io/fs/file_writer.h" // IWYU pragma: keep #include "olap/calc_delete_bitmap_executor.h" #include "olap/olap_define.h" 
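The rewritten generator above drops the spinlock and the _valid_rowset_id_hi bookkeeping set in favor of a single atomic counter. Relaxed ordering is sufficient because callers only need distinct values, not any ordering of surrounding memory operations. The same pattern in a self-contained sketch (an illustrative class, not the Doris one):

    #include <atomic>
    #include <cstdint>

    class MonotonicIdGen {
    public:
        explicit MonotonicIdGen(int64_t start = 1) : _next(start) {}

        int64_t next() {
            // fetch_add returns the previous value, so every caller observes
            // a unique id; no lock and no stronger memory order are required.
            return _next.fetch_add(1, std::memory_order_relaxed);
        }

    private:
        std::atomic<int64_t> _next;
    };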
#include "olap/rowset/beta_rowset.h" #include "olap/rowset/beta_rowset_writer.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_writer.h" #include "olap/rowset/rowset_writer_context.h" @@ -53,6 +54,7 @@ #include "util/stopwatch.hpp" #include "util/time.h" #include "util/trace.h" +#include "vec/common/schema_util.h" #include "vec/core/block.h" namespace doris { @@ -199,9 +201,8 @@ Status RowsetBuilder::init() { context.mow_context = mow_context; context.write_file_cache = _req.write_file_cache; context.partial_update_info = _partial_update_info; - std::unique_ptr rowset_writer; - RETURN_IF_ERROR(_tablet->create_rowset_writer(context, &rowset_writer)); - _rowset_writer = std::move(rowset_writer); + _rowset_writer = DORIS_TRY(_tablet->create_rowset_writer(context, false)); + _pending_rs_guard = StorageEngine::instance()->pending_local_rowsets().add(context.rowset_id); if (config::cloud_mode) { // TODO(plat1ko) @@ -301,8 +302,18 @@ Status RowsetBuilder::commit_txn() { auto storage_engine = StorageEngine::instance(); std::lock_guard l(_lock); SCOPED_TIMER(_commit_txn_timer); + if (_tablet->tablet_schema()->num_variant_columns() > 0) { + // update tablet schema when meet variant columns, before commit_txn + // Eg. rowset schema: A(int), B(float), C(int), D(int) + // _tabelt->tablet_schema: A(bigint), B(double) + // => update_schema: A(bigint), B(double), C(int), D(int) + const RowsetWriterContext& rw_ctx = _rowset_writer->context(); + _tablet->update_by_least_common_schema(rw_ctx.tablet_schema); + } + // Transfer ownership of `PendingRowsetGuard` to `TxnManager` Status res = storage_engine->txn_manager()->commit_txn(_req.partition_id, *tablet, _req.txn_id, - _req.load_id, _rowset, false); + _req.load_id, _rowset, + std::move(_pending_rs_guard), false); if (!res && !res.is()) { LOG(WARNING) << "Failed to commit txn: " << _req.txn_id diff --git a/be/src/olap/rowset_builder.h b/be/src/olap/rowset_builder.h index 64f58f9bc36d4f..ea849f80e6140f 100644 --- a/be/src/olap/rowset_builder.h +++ b/be/src/olap/rowset_builder.h @@ -30,6 +30,7 @@ #include "olap/delta_writer_context.h" #include "olap/olap_common.h" #include "olap/partial_update_info.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset.h" #include "olap/tablet_fwd.h" #include "util/runtime_profile.h" @@ -107,6 +108,7 @@ class RowsetBuilder { BaseTabletSPtr _tablet; RowsetSharedPtr _rowset; std::shared_ptr _rowset_writer; + PendingRowsetGuard _pending_rs_guard; TabletSchemaSPtr _tablet_schema; std::mutex _lock; diff --git a/be/src/olap/schema.h b/be/src/olap/schema.h index d220ebbee0d661..ce5c6705e7fac6 100644 --- a/be/src/olap/schema.h +++ b/be/src/olap/schema.h @@ -50,7 +50,7 @@ using SchemaSPtr = std::shared_ptr; class Schema { public: Schema(TabletSchemaSPtr tablet_schema) { - SCOPED_MEM_COUNT(&_mem_size); + SCOPED_MEM_COUNT_BY_HOOK(&_mem_size); size_t num_columns = tablet_schema->num_columns(); // ignore this column if (tablet_schema->columns().back().name() == BeConsts::ROW_STORE_COL) { @@ -86,7 +86,7 @@ class Schema { // All the columns of one table may exist in the columns param, but col_ids is only a subset. 
Schema(const std::vector& columns, const std::vector& col_ids) { - SCOPED_MEM_COUNT(&_mem_size); + SCOPED_MEM_COUNT_BY_HOOK(&_mem_size); size_t num_key_columns = 0; _unique_ids.resize(columns.size()); for (size_t i = 0; i < columns.size(); ++i) { @@ -109,7 +109,7 @@ class Schema { // Only for UT Schema(const std::vector& columns, size_t num_key_columns) { - SCOPED_MEM_COUNT(&_mem_size); + SCOPED_MEM_COUNT_BY_HOOK(&_mem_size); std::vector col_ids(columns.size()); _unique_ids.resize(columns.size()); for (uint32_t cid = 0; cid < columns.size(); ++cid) { @@ -121,7 +121,7 @@ class Schema { } Schema(const std::vector& cols, size_t num_key_columns) { - SCOPED_MEM_COUNT(&_mem_size); + SCOPED_MEM_COUNT_BY_HOOK(&_mem_size); std::vector col_ids(cols.size()); _unique_ids.resize(cols.size()); for (uint32_t cid = 0; cid < cols.size(); ++cid) { diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp index e331564d9d28e3..9d0e23cb975b5d 100644 --- a/be/src/olap/schema_change.cpp +++ b/be/src/olap/schema_change.cpp @@ -44,6 +44,7 @@ #include "olap/olap_common.h" #include "olap/olap_define.h" #include "olap/rowset/beta_rowset.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_reader_context.h" #include "olap/rowset/rowset_writer_context.h" @@ -517,6 +518,7 @@ Status VSchemaChangeWithSorting::_inner_process(RowsetReaderSharedPtr rowset_rea // for external sorting // src_rowsets to store the rowset generated by internal sorting std::vector src_rowsets; + std::vector pending_rs_guards; Defer defer {[&]() { // remove the intermediate rowsets generated by internal sorting @@ -535,12 +537,11 @@ Status VSchemaChangeWithSorting::_inner_process(RowsetReaderSharedPtr rowset_rea return Status::OK(); } - RowsetSharedPtr rowset; - RETURN_IF_ERROR(_internal_sorting( + auto [rowset, guard] = DORIS_TRY(_internal_sorting( blocks, Version(_temp_delta_versions.second, _temp_delta_versions.second), - newest_write_timestamp, new_tablet, BETA_ROWSET, segments_overlap, &rowset)); - src_rowsets.push_back(rowset); - + newest_write_timestamp, new_tablet, BETA_ROWSET, segments_overlap)); + src_rowsets.push_back(std::move(rowset)); + pending_rs_guards.push_back(std::move(guard)); for (auto& block : blocks) { _mem_tracker->release(block->allocated_bytes()); } @@ -594,13 +595,12 @@ Status VSchemaChangeWithSorting::_inner_process(RowsetReaderSharedPtr rowset_rea return Status::OK(); } -Status VSchemaChangeWithSorting::_internal_sorting( +Result> VSchemaChangeWithSorting::_internal_sorting( const std::vector>& blocks, const Version& version, int64_t newest_write_timestamp, TabletSharedPtr new_tablet, RowsetTypePB new_rowset_type, - SegmentsOverlapPB segments_overlap, RowsetSharedPtr* rowset) { + SegmentsOverlapPB segments_overlap) { uint64_t merged_rows = 0; MultiBlockMerger merger(new_tablet); - std::unique_ptr rowset_writer; RowsetWriterContext context; context.version = version; context.rowset_state = VISIBLE; @@ -608,13 +608,20 @@ Status VSchemaChangeWithSorting::_internal_sorting( context.tablet_schema = new_tablet->tablet_schema(); context.newest_write_timestamp = newest_write_timestamp; context.write_type = DataWriteType::TYPE_SCHEMA_CHANGE; - RETURN_IF_ERROR(new_tablet->create_rowset_writer(context, &rowset_writer)); - - RETURN_IF_ERROR(merger.merge(blocks, rowset_writer.get(), &merged_rows)); - + std::unique_ptr rowset_writer; + // TODO(plat1ko): Use monad op + if (auto result = new_tablet->create_rowset_writer(context, false); 
!result.has_value()) + [[unlikely]] { + return unexpected(std::move(result).error()); + } else { + rowset_writer = std::move(result).value(); + } + auto guard = StorageEngine::instance()->pending_local_rowsets().add(context.rowset_id); + RETURN_IF_ERROR_RESULT(merger.merge(blocks, rowset_writer.get(), &merged_rows)); _add_merged_rows(merged_rows); - RETURN_IF_ERROR(rowset_writer->build(*rowset)); - return Status::OK(); + RowsetSharedPtr rowset; + RETURN_IF_ERROR_RESULT(rowset_writer->build(rowset)); + return std::make_pair(std::move(rowset), std::move(guard)); } Status VSchemaChangeWithSorting::_external_sorting(vector& src_rowsets, @@ -1023,7 +1030,7 @@ Status SchemaChangeHandler::_get_versions_to_be_changed( *max_rowset = rowset; RETURN_IF_ERROR(base_tablet->capture_consistent_versions(Version(0, rowset->version().second), - versions_to_be_changed)); + versions_to_be_changed, false, false)); return Status::OK(); } @@ -1098,7 +1105,6 @@ Status SchemaChangeHandler::_convert_historical_rowsets(const SchemaChangeParams TabletSharedPtr new_tablet = sc_params.new_tablet; // When tablet create new rowset writer, it may change rowset type, in this case // linked schema change will not be used. - std::unique_ptr rowset_writer; RowsetWriterContext context; context.version = rs_reader->version(); context.rowset_state = VISIBLE; @@ -1107,12 +1113,14 @@ Status SchemaChangeHandler::_convert_historical_rowsets(const SchemaChangeParams context.newest_write_timestamp = rs_reader->newest_write_timestamp(); context.fs = rs_reader->rowset()->rowset_meta()->fs(); context.write_type = DataWriteType::TYPE_SCHEMA_CHANGE; - Status status = new_tablet->create_rowset_writer(context, &rowset_writer); - if (!status.ok()) { + auto result = new_tablet->create_rowset_writer(context, false); + if (!result.has_value()) { res = Status::Error("create_rowset_writer failed, reason={}", - status.to_string()); + result.error().to_string()); return process_alter_exit(); } + auto rowset_writer = std::move(result).value(); + auto pending_rs_guard = StorageEngine::instance()->add_pending_rowset(context); if (res = sc_procedure->process(rs_reader, rowset_writer.get(), sc_params.new_tablet, sc_params.base_tablet, sc_params.base_tablet_schema); diff --git a/be/src/olap/schema_change.h b/be/src/olap/schema_change.h index 2c5075b9ce5845..f11f6d2d815a95 100644 --- a/be/src/olap/schema_change.h +++ b/be/src/olap/schema_change.h @@ -39,6 +39,7 @@ #include "common/status.h" #include "olap/column_mapping.h" #include "olap/olap_common.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_reader.h" #include "olap/rowset/rowset_writer.h" @@ -107,11 +108,7 @@ class SchemaChange { TabletSharedPtr new_tablet, TabletSharedPtr base_tablet, TabletSchemaSPtr base_tablet_schema) { if (rowset_reader->rowset()->empty() || rowset_reader->rowset()->num_rows() == 0) { - RETURN_WITH_WARN_IF_ERROR( - rowset_writer->flush(), Status::Error(""), - fmt::format("create empty version for schema change failed. 
version= {}-{}", - rowset_writer->version().first, rowset_writer->version().second)); - + RETURN_IF_ERROR(rowset_writer->flush()); return Status::OK(); } @@ -204,10 +201,11 @@ class VSchemaChangeWithSorting : public SchemaChange { Status _inner_process(RowsetReaderSharedPtr rowset_reader, RowsetWriter* rowset_writer, TabletSharedPtr new_tablet, TabletSchemaSPtr base_tablet_schema) override; - Status _internal_sorting(const std::vector>& blocks, - const Version& temp_delta_versions, int64_t newest_write_timestamp, - TabletSharedPtr new_tablet, RowsetTypePB new_rowset_type, - SegmentsOverlapPB segments_overlap, RowsetSharedPtr* rowset); + Result> _internal_sorting( + const std::vector>& blocks, + const Version& temp_delta_versions, int64_t newest_write_timestamp, + TabletSharedPtr new_tablet, RowsetTypePB new_rowset_type, + SegmentsOverlapPB segments_overlap); Status _external_sorting(std::vector& src_rowsets, RowsetWriter* rowset_writer, TabletSharedPtr new_tablet); diff --git a/be/src/olap/single_replica_compaction.cpp b/be/src/olap/single_replica_compaction.cpp index 5fdbb1fdf0becc..5b348ab2c1ce62 100644 --- a/be/src/olap/single_replica_compaction.cpp +++ b/be/src/olap/single_replica_compaction.cpp @@ -312,60 +312,36 @@ Status SingleReplicaCompaction::_fetch_rowset(const TReplicaInfo& addr, const st std::string local_path = local_data_path + "/"; std::string snapshot_path; int timeout_s = 0; - Status status = Status::OK(); // 1: make snapshot - auto st = _make_snapshot(addr.host, addr.be_port, _tablet->tablet_id(), _tablet->schema_hash(), - timeout_s, rowset_version, &snapshot_path); - if (st.ok()) { - status = Status::OK(); - } else { - LOG(WARNING) << "fail to make snapshot from " << addr.host - << ", tablet id: " << _tablet->tablet_id(); - status = Status::InternalError("Failed to make snapshot"); - return status; - } + RETURN_IF_ERROR(_make_snapshot(addr.host, addr.be_port, _tablet->tablet_id(), + _tablet->schema_hash(), timeout_s, rowset_version, + &snapshot_path)); + Defer defer {[&, this] { + // TODO(plat1ko): Async release snapshot + auto st = _release_snapshot(addr.host, addr.be_port, snapshot_path); + if (!st.ok()) [[unlikely]] { + LOG_WARNING("failed to release snapshot in remote BE") + .tag("host", addr.host) + .tag("port", addr.be_port) + .tag("snapshot_path", snapshot_path) + .error(st); + } + }}; + // 2: download snapshot std::string remote_url_prefix; { std::stringstream ss; ss << "http://" << addr.host << ":" << addr.http_port << HTTP_REQUEST_PREFIX << HTTP_REQUEST_TOKEN_PARAM << token << HTTP_REQUEST_FILE_PARAM << snapshot_path << "/" << _tablet->tablet_id() << "/" << _tablet->schema_hash() << "/"; - remote_url_prefix = ss.str(); } - - // 2: download snapshot - st = _download_files(_tablet->data_dir(), remote_url_prefix, local_path); - if (!st.ok()) { - LOG(WARNING) << "fail to download and convert tablet, remote=" << remote_url_prefix - << ", error=" << st.to_string(); - status = Status::InternalError("Fail to download and convert tablet"); - // when there is an error, keep this program executing to release snapshot - } - if (status.ok()) { - // change all rowset ids because they maybe its id same with local rowset - auto olap_st = SnapshotManager::instance()->convert_rowset_ids( - local_path, _tablet->tablet_id(), _tablet->replica_id(), _tablet->partition_id(), - _tablet->schema_hash()); - if (!olap_st.ok()) { - LOG(WARNING) << "fail to convert rowset ids, path=" << local_path - << ", tablet_id=" << _tablet->tablet_id() << ", error=" << olap_st; - status = 
Status::InternalError("Failed to convert rowset ids"); - } - } - - // 3: releasse_snapshot - st = _release_snapshot(addr.host, addr.be_port, snapshot_path); - if (status.ok()) { - // 4: finish_clone: create output_rowset and link file - Status olap_status = _finish_clone(local_data_path, rowset_version); - if (!olap_status.ok()) { - LOG(WARNING) << "failed to finish clone. [table=" << _tablet->tablet_id() - << " res=" << olap_status << "]"; - status = Status::InternalError("Failed to finish clone"); - } - } - return status; + RETURN_IF_ERROR(_download_files(_tablet->data_dir(), remote_url_prefix, local_path)); + _pending_rs_guards = DORIS_TRY(SnapshotManager::instance()->convert_rowset_ids( + local_path, _tablet->tablet_id(), _tablet->replica_id(), _tablet->partition_id(), + _tablet->schema_hash())); + // 4: finish_clone: create output_rowset and link file + return _finish_clone(local_data_path, rowset_version); } Status SingleReplicaCompaction::_make_snapshot(const std::string& ip, int port, TTableId tablet_id, diff --git a/be/src/olap/single_replica_compaction.h b/be/src/olap/single_replica_compaction.h index 3d967dff20b09a..7ccfe09f7aecbc 100644 --- a/be/src/olap/single_replica_compaction.h +++ b/be/src/olap/single_replica_compaction.h @@ -21,6 +21,7 @@ #include "common/status.h" #include "olap/compaction.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset.h" #include "olap/tablet.h" @@ -62,6 +63,9 @@ class SingleReplicaCompaction : public Compaction { CompactionType _compaction_type; DISALLOW_COPY_AND_ASSIGN(SingleReplicaCompaction); + +private: + std::vector _pending_rs_guards; }; } // namespace doris \ No newline at end of file diff --git a/be/src/olap/snapshot_manager.cpp b/be/src/olap/snapshot_manager.cpp index ad6f9ef930be60..fd865712c5f70f 100644 --- a/be/src/olap/snapshot_manager.cpp +++ b/be/src/olap/snapshot_manager.cpp @@ -124,28 +124,23 @@ Status SnapshotManager::release_snapshot(const string& snapshot_path) { snapshot_path); } -Status SnapshotManager::convert_rowset_ids(const std::string& clone_dir, int64_t tablet_id, - int64_t replica_id, int64_t partition_id, - const int32_t& schema_hash) { +Result> SnapshotManager::convert_rowset_ids( + const std::string& clone_dir, int64_t tablet_id, int64_t replica_id, int64_t partition_id, + int32_t schema_hash) { SCOPED_CONSUME_MEM_TRACKER(_mem_tracker); - Status res = Status::OK(); + std::vector guards; // check clone dir existed bool exists = true; - RETURN_IF_ERROR(io::global_local_filesystem()->exists(clone_dir, &exists)); + RETURN_IF_ERROR_RESULT(io::global_local_filesystem()->exists(clone_dir, &exists)); if (!exists) { - res = Status::Error( - "clone dir not existed when convert rowsetids. clone_dir={}", clone_dir); - return res; + return unexpected(Status::Error( + "clone dir not existed when convert rowsetids. clone_dir={}", clone_dir)); } // load original tablet meta auto cloned_meta_file = fmt::format("{}/{}.hdr", clone_dir, tablet_id); TabletMeta cloned_tablet_meta; - if ((res = cloned_tablet_meta.create_from_file(cloned_meta_file)) != Status::OK()) { - LOG(WARNING) << "fail to load original tablet meta after clone. 
" - << ", cloned_meta_file=" << cloned_meta_file; - return res; - } + RETURN_IF_ERROR_RESULT(cloned_tablet_meta.create_from_file(cloned_meta_file)); TabletMetaPB cloned_tablet_meta_pb; cloned_tablet_meta.to_meta_pb(&cloned_tablet_meta_pb); @@ -171,14 +166,16 @@ Status SnapshotManager::convert_rowset_ids(const std::string& clone_dir, int64_t std::unordered_map rs_version_map; std::unordered_map rowset_id_mapping; - for (auto& visible_rowset : cloned_tablet_meta_pb.rs_metas()) { + guards.reserve(cloned_tablet_meta_pb.rs_metas_size() + + cloned_tablet_meta_pb.stale_rs_metas_size()); + for (auto&& visible_rowset : cloned_tablet_meta_pb.rs_metas()) { RowsetMetaPB* rowset_meta = new_tablet_meta_pb.add_rs_metas(); - if (!visible_rowset.has_resource_id()) { // src be local rowset RowsetId rowset_id = StorageEngine::instance()->next_rowset_id(); - RETURN_IF_ERROR(_rename_rowset_id(visible_rowset, clone_dir, tablet_schema, rowset_id, - rowset_meta)); + guards.push_back(StorageEngine::instance()->pending_local_rowsets().add(rowset_id)); + RETURN_IF_ERROR_RESULT(_rename_rowset_id(visible_rowset, clone_dir, tablet_schema, + rowset_id, rowset_meta)); RowsetId src_rs_id; if (visible_rowset.rowset_id() > 0) { src_rs_id.init(visible_rowset.rowset_id()); @@ -194,7 +191,7 @@ Status SnapshotManager::convert_rowset_ids(const std::string& clone_dir, int64_t rs_version_map[rowset_version] = rowset_meta; } - for (auto& stale_rowset : cloned_tablet_meta_pb.stale_rs_metas()) { + for (auto&& stale_rowset : cloned_tablet_meta_pb.stale_rs_metas()) { Version rowset_version = {stale_rowset.start_version(), stale_rowset.end_version()}; auto exist_rs = rs_version_map.find(rowset_version); if (exist_rs != rs_version_map.end()) { @@ -205,8 +202,9 @@ Status SnapshotManager::convert_rowset_ids(const std::string& clone_dir, int64_t if (!stale_rowset.has_resource_id()) { // src be local rowset RowsetId rowset_id = StorageEngine::instance()->next_rowset_id(); - RETURN_IF_ERROR(_rename_rowset_id(stale_rowset, clone_dir, tablet_schema, rowset_id, - rowset_meta)); + guards.push_back(StorageEngine::instance()->pending_local_rowsets().add(rowset_id)); + RETURN_IF_ERROR_RESULT(_rename_rowset_id(stale_rowset, clone_dir, tablet_schema, + rowset_id, rowset_meta)); RowsetId src_rs_id; if (stale_rowset.rowset_id() > 0) { src_rs_id.init(stale_rowset.rowset_id()); @@ -221,7 +219,7 @@ Status SnapshotManager::convert_rowset_ids(const std::string& clone_dir, int64_t } if (!rowset_id_mapping.empty() && cloned_tablet_meta_pb.has_delete_bitmap()) { - auto& cloned_del_bitmap_pb = cloned_tablet_meta_pb.delete_bitmap(); + const auto& cloned_del_bitmap_pb = cloned_tablet_meta_pb.delete_bitmap(); DeleteBitmapPB* new_del_bitmap_pb = new_tablet_meta_pb.mutable_delete_bitmap(); int rst_ids_size = cloned_del_bitmap_pb.rowset_ids_size(); for (size_t i = 0; i < rst_ids_size; ++i) { @@ -235,13 +233,9 @@ Status SnapshotManager::convert_rowset_ids(const std::string& clone_dir, int64_t } } - res = TabletMeta::save(cloned_meta_file, new_tablet_meta_pb); - if (!res.ok()) { - LOG(WARNING) << "fail to save converted tablet meta to dir='" << clone_dir; - return res; - } + RETURN_IF_ERROR_RESULT(TabletMeta::save(cloned_meta_file, new_tablet_meta_pb)); - return Status::OK(); + return guards; } Status SnapshotManager::_rename_rowset_id(const RowsetMetaPB& rs_meta_pb, diff --git a/be/src/olap/snapshot_manager.h b/be/src/olap/snapshot_manager.h index 3548fd65e22734..20b11ce7b3317a 100644 --- a/be/src/olap/snapshot_manager.h +++ b/be/src/olap/snapshot_manager.h @@ -25,6 
+25,7 @@ #include #include "common/status.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset.h" #include "olap/tablet.h" #include "olap/tablet_schema.h" @@ -54,8 +55,11 @@ class SnapshotManager { static SnapshotManager* instance(); - Status convert_rowset_ids(const std::string& clone_dir, int64_t tablet_id, int64_t replica_id, - int64_t partition_id, const int32_t& schema_hash); + Result> convert_rowset_ids(const std::string& clone_dir, + int64_t tablet_id, + int64_t replica_id, + int64_t partition_id, + int32_t schema_hash); private: SnapshotManager() : _snapshot_base_id(0) { diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index 3bdc6967af19d8..d714420b3583b9 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -49,6 +49,7 @@ #include "common/logging.h" #include "common/status.h" #include "gutil/strings/substitute.h" +#include "io/fs/file_system.h" #include "io/fs/local_file_system.h" #include "olap/base_compaction.h" #include "olap/binlog.h" @@ -1058,39 +1059,38 @@ void StorageEngine::_parse_default_rowset_type() { } void StorageEngine::start_delete_unused_rowset() { - std::unordered_map unused_rowsets_copy; + std::vector unused_rowsets_copy; + unused_rowsets_copy.reserve(_unused_rowsets.size()); { std::lock_guard lock(_gc_mutex); for (auto it = _unused_rowsets.begin(); it != _unused_rowsets.end();) { uint64_t now = UnixSeconds(); - if (it->second.use_count() == 1 && it->second->need_delete_file() && + auto&& rs = it->second; + if (rs.use_count() == 1 && rs->need_delete_file() && // We delay the GC time of this rowset since it's maybe still needed, see #20732 - now > it->second->delayed_expired_timestamp()) { - if (it->second->is_local()) { - unused_rowsets_copy[it->first] = it->second; - } - // remote rowset data will be reclaimed by `remove_unused_remote_files` + now > rs->delayed_expired_timestamp()) { evict_querying_rowset(it->second->rowset_id()); + // remote rowset data will be reclaimed by `remove_unused_remote_files` + if (rs->is_local()) { + unused_rowsets_copy.push_back(std::move(rs)); + } it = _unused_rowsets.erase(it); } else { ++it; } } } - for (auto it = unused_rowsets_copy.begin(); it != unused_rowsets_copy.end(); ++it) { - VLOG_NOTICE << "start to remove rowset:" << it->second->rowset_id() - << ", version:" << it->second->version().first << "-" - << it->second->version().second; - auto tablet_id = it->second->rowset_meta()->tablet_id(); - auto tablet = _tablet_manager->get_tablet(tablet_id); + for (auto&& rs : unused_rowsets_copy) { + VLOG_NOTICE << "start to remove rowset:" << rs->rowset_id() + << ", version:" << rs->version(); // delete delete_bitmap of unused rowsets - if (tablet != nullptr && tablet->enable_unique_key_merge_on_write()) { - tablet->tablet_meta()->delete_bitmap().remove({it->second->rowset_id(), 0, 0}, - {it->second->rowset_id(), UINT32_MAX, 0}); + if (auto tablet = _tablet_manager->get_tablet(rs->rowset_meta()->tablet_id()); + tablet && tablet->enable_unique_key_merge_on_write()) { + tablet->tablet_meta()->delete_bitmap().remove({rs->rowset_id(), 0, 0}, + {rs->rowset_id(), UINT32_MAX, 0}); } - Status status = it->second->remove(); - VLOG_NOTICE << "remove rowset:" << it->second->rowset_id() - << " finished. status:" << status; + Status status = rs->remove(); + VLOG_NOTICE << "remove rowset:" << rs->rowset_id() << " finished. 
status:" << status; } } @@ -1098,19 +1098,14 @@ void StorageEngine::add_unused_rowset(RowsetSharedPtr rowset) { if (rowset == nullptr) { return; } - VLOG_NOTICE << "add unused rowset, rowset id:" << rowset->rowset_id() << ", version:" << rowset->version(); - - auto rowset_id = rowset->rowset_id().to_string(); - std::lock_guard lock(_gc_mutex); - auto it = _unused_rowsets.find(rowset_id); + auto it = _unused_rowsets.find(rowset->rowset_id()); if (it == _unused_rowsets.end()) { rowset->set_need_delete_file(); rowset->close(); - _unused_rowsets[rowset_id] = rowset; - release_rowset_id(rowset->rowset_id()); + _unused_rowsets[rowset->rowset_id()] = std::move(rowset); } } @@ -1157,20 +1152,14 @@ Status StorageEngine::obtain_shard_path(TStorageMedium::type storage_medium, int *store = stores[0]; } - Status res = Status::OK(); - uint64_t shard = 0; - res = (*store)->get_shard(&shard); - if (!res.ok()) { - LOG(WARNING) << "fail to get root path shard. res=" << res; - return res; - } + uint64_t shard = (*store)->get_shard(); std::stringstream root_path_stream; root_path_stream << (*store)->path() << "/" << DATA_PREFIX << "/" << shard; *shard_path = root_path_stream.str(); LOG(INFO) << "success to process obtain root path. path=" << shard_path; - return res; + return Status::OK(); } Status StorageEngine::load_header(const string& shard_path, const TCloneReq& request, @@ -1247,8 +1236,14 @@ Status StorageEngine::execute_task(EngineTask* task) { // check whether any unused rowsets's id equal to rowset_id bool StorageEngine::check_rowset_id_in_unused_rowsets(const RowsetId& rowset_id) { std::lock_guard lock(_gc_mutex); - auto search = _unused_rowsets.find(rowset_id.to_string()); - return search != _unused_rowsets.end(); + return _unused_rowsets.contains(rowset_id); +} + +PendingRowsetGuard StorageEngine::add_pending_rowset(const RowsetWriterContext& ctx) { + if (!ctx.fs || ctx.fs->type() == io::FileSystemType::LOCAL) { + return _pending_local_rowsets.add(ctx.rowset_id); + } + return _pending_remote_rowsets.add(ctx.rowset_id); } void StorageEngine::create_cumulative_compaction( diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index 6017354ef46bb4..95b11e0c66c3f6 100644 --- a/be/src/olap/storage_engine.h +++ b/be/src/olap/storage_engine.h @@ -42,12 +42,12 @@ #include "olap/compaction_permit_limiter.h" #include "olap/olap_common.h" #include "olap/options.h" -#include "olap/rowset/rowset.h" +#include "olap/rowset/pending_rowset_helper.h" +#include "olap/rowset/rowset_fwd.h" #include "olap/rowset/rowset_id_generator.h" #include "olap/rowset/segment_v2/segment.h" -#include "olap/tablet.h" +#include "olap/tablet_fwd.h" #include "olap/task/index_builder.h" -#include "runtime/exec_env.h" #include "runtime/heartbeat_flags.h" #include "util/countdown_latch.h" @@ -149,18 +149,14 @@ class StorageEngine { return _calc_delete_bitmap_executor.get(); } + // Rowset garbage collection helpers bool check_rowset_id_in_unused_rowsets(const RowsetId& rowset_id); + PendingRowsetSet& pending_local_rowsets() { return _pending_local_rowsets; } + PendingRowsetSet& pending_remote_rowsets() { return _pending_remote_rowsets; } + PendingRowsetGuard add_pending_rowset(const RowsetWriterContext& ctx); RowsetId next_rowset_id() { return _rowset_id_generator->next_id(); } - bool rowset_id_in_use(const RowsetId& rowset_id) { - return _rowset_id_generator->id_in_use(rowset_id); - } - - void release_rowset_id(const RowsetId& rowset_id) { - return _rowset_id_generator->release_id(rowset_id); - } - RowsetTypePB 
default_rowset_type() const { if (_heartbeat_flags != nullptr && _heartbeat_flags->is_set_default_rowset_type_to_beta()) { return BETA_ROWSET; @@ -289,12 +285,10 @@ class StorageEngine { // path gc process function void _path_gc_thread_callback(DataDir* data_dir); - void _path_scan_thread_callback(DataDir* data_dir); + void _tablet_path_check_callback(); void _tablet_checkpoint_callback(const std::vector& data_dirs); - void _tablet_path_check_callback(); - // parse the default rowset type config to RowsetTypePB void _parse_default_rowset_type(); @@ -388,8 +382,9 @@ class StorageEngine { std::atomic_bool _stopped {false}; std::mutex _gc_mutex; - // map, if we use RowsetId as the key, we need custom hash func - std::unordered_map _unused_rowsets; + std::unordered_map _unused_rowsets; + PendingRowsetSet _pending_local_rowsets; + PendingRowsetSet _pending_remote_rowsets; // Hold reference of quering rowsets std::mutex _quering_rowsets_mutex; diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 029c77e86b3248..cfd71abff14586 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -17,7 +17,6 @@ #include "olap/tablet.h" -#include #include #include #include @@ -32,15 +31,12 @@ #include #include #include -#include -#include #include #include #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep // IWYU pragma: no_include #include // IWYU pragma: keep @@ -97,6 +93,7 @@ #include "olap/rowset/segment_v2/column_reader.h" #include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/indexed_column_reader.h" +#include "olap/rowset/vertical_beta_rowset_writer.h" #include "olap/schema_change.h" #include "olap/single_replica_compaction.h" #include "olap/storage_engine.h" @@ -123,6 +120,7 @@ #include "util/work_thread_pool.hpp" #include "vec/columns/column.h" #include "vec/columns/column_string.h" +#include "vec/common/schema_util.h" #include "vec/common/string_ref.h" #include "vec/data_types/data_type.h" #include "vec/data_types/data_type_factory.hpp" @@ -237,11 +235,6 @@ void WriteCooldownMetaExecutors::WriteCooldownMetaExecutors::submit(TabletShared [task = std::move(async_write_task)]() { task(); }); } -TabletSharedPtr Tablet::create_tablet_from_meta(TabletMetaSharedPtr tablet_meta, - DataDir* data_dir) { - return std::make_shared(std::move(tablet_meta), data_dir); -} - Tablet::Tablet(TabletMetaSharedPtr tablet_meta, DataDir* data_dir, const std::string_view& cumulative_compaction_type) : BaseTablet(std::move(tablet_meta)), @@ -277,7 +270,7 @@ Tablet::Tablet(TabletMetaSharedPtr tablet_meta, DataDir* data_dir, _max_version_schema = _tablet_meta->tablet_schema(); } else { _max_version_schema = - rowset_meta_with_max_schema_version(_tablet_meta->all_rs_metas())->tablet_schema(); + tablet_schema_with_merged_max_schema_version(_tablet_meta->all_rs_metas()); } DCHECK(_max_version_schema); } @@ -305,7 +298,6 @@ Status Tablet::_init_once_action() { _tablet_meta->compaction_policy()); #endif - RowsetVector rowset_vec; for (const auto& rs_meta : _tablet_meta->all_rs_metas()) { Version version = rs_meta->version(); RowsetSharedPtr rowset; @@ -317,7 +309,6 @@ Status Tablet::_init_once_action() { << ", res=" << res; return res; } - rowset_vec.push_back(rowset); _rs_version_map[version] = std::move(rowset); } @@ -631,9 +622,9 @@ const RowsetSharedPtr Tablet::rowset_with_max_version() const { return iter->second; } -RowsetMetaSharedPtr Tablet::rowset_meta_with_max_schema_version( +TabletSchemaSPtr 
Tablet::tablet_schema_with_merged_max_schema_version( const std::vector& rowset_metas) { - return *std::max_element( + RowsetMetaSharedPtr max_schema_version_rs = *std::max_element( rowset_metas.begin(), rowset_metas.end(), [](const RowsetMetaSharedPtr& a, const RowsetMetaSharedPtr& b) { return !a->tablet_schema() @@ -643,6 +634,18 @@ RowsetMetaSharedPtr Tablet::rowset_meta_with_max_schema_version( : a->tablet_schema()->schema_version() < b->tablet_schema()->schema_version()); }); + TabletSchemaSPtr target_schema = max_schema_version_rs->tablet_schema(); + if (target_schema->num_variant_columns() > 0) { + // For variant columns tablet schema need to be the merged wide tablet schema + std::vector schemas; + std::transform(rowset_metas.begin(), rowset_metas.end(), std::back_inserter(schemas), + [](const RowsetMetaSharedPtr& rs_meta) { return rs_meta->tablet_schema(); }); + target_schema = std::make_shared(); + // TODO(lhy) maybe slow? + vectorized::schema_util::get_least_common_schema(schemas, target_schema); + VLOG_DEBUG << "dump schema: " << target_schema->dump_structure(); + } + return target_schema; } RowsetSharedPtr Tablet::_rowset_with_largest_size() { @@ -738,7 +741,7 @@ void Tablet::delete_expired_stale_rowset() { Version test_version = Version(0, lastest_delta->end_version()); stale_version_path_map[*path_id_iter] = version_path; - Status status = capture_consistent_versions(test_version, nullptr); + Status status = capture_consistent_versions(test_version, nullptr, false, false); // 1. When there is no consistent versions, we must reconstruct the tracker. if (!status.ok()) { // 2. fetch missing version after delete @@ -859,7 +862,8 @@ bool Tablet::_reconstruct_version_tracker_if_necessary() { } Status Tablet::capture_consistent_versions(const Version& spec_version, - std::vector* version_path, bool quiet) const { + std::vector* version_path, + bool skip_missing_version, bool quiet) const { Status status = _timestamped_version_tracker.capture_consistent_versions(spec_version, version_path); if (!status.ok() && !quiet) { @@ -870,14 +874,22 @@ Status Tablet::capture_consistent_versions(const Version& spec_version, // so to avoid print too many logs. if (version_path != nullptr) { LOG(WARNING) << "tablet:" << tablet_id() - << ", version already has been merged. spec_version: " << spec_version; + << ", version already has been merged. 
spec_version: " << spec_version + << ", max_version: " << max_version_unlocked(); } - status = Status::Error("missed_versions is empty"); + status = Status::Error( + "missed_versions is empty, spec_version " + "{}, max_version {}, tablet_id {}", + spec_version.second, max_version_unlocked().second, tablet_id()); } else { if (version_path != nullptr) { LOG(WARNING) << "status:" << status << ", tablet:" << tablet_id() << ", missed version for version:" << spec_version; _print_missed_versions(missed_versions); + if (skip_missing_version) { + LOG(WARNING) << "force skipping missing version for tablet:" << tablet_id(); + return Status::OK(); + } } } } @@ -886,7 +898,7 @@ Status Tablet::capture_consistent_versions(const Version& spec_version, Status Tablet::check_version_integrity(const Version& version, bool quiet) { std::shared_lock rdlock(_meta_lock); - return capture_consistent_versions(version, nullptr, quiet); + return capture_consistent_versions(version, nullptr, false, quiet); } bool Tablet::exceed_version_limit(int32_t limit) const { @@ -919,7 +931,7 @@ void Tablet::acquire_version_and_rowsets( Status Tablet::capture_consistent_rowsets(const Version& spec_version, std::vector* rowsets) const { std::vector version_path; - RETURN_IF_ERROR(capture_consistent_versions(spec_version, &version_path)); + RETURN_IF_ERROR(capture_consistent_versions(spec_version, &version_path, false, false)); RETURN_IF_ERROR(_capture_consistent_rowsets_unlocked(version_path, rowsets)); return Status::OK(); } @@ -955,10 +967,11 @@ Status Tablet::_capture_consistent_rowsets_unlocked(const std::vector& return Status::OK(); } -Status Tablet::capture_rs_readers(const Version& spec_version, - std::vector* rs_splits) const { +Status Tablet::capture_rs_readers(const Version& spec_version, std::vector* rs_splits, + bool skip_missing_version) const { std::vector version_path; - RETURN_IF_ERROR(capture_consistent_versions(spec_version, &version_path)); + RETURN_IF_ERROR( + capture_consistent_versions(spec_version, &version_path, skip_missing_version, false)); RETURN_IF_ERROR(capture_rs_readers(version_path, rs_splits)); return Status::OK(); } @@ -1238,34 +1251,6 @@ bool Tablet::check_path(const std::string& path_to_check) const { return false; } -// check rowset id in tablet-meta and in rowset-meta atomicly -// for example, during publish version stage, it will first add rowset meta to tablet meta and then -// remove it from rowset meta manager. If we check tablet meta first and then check rowset meta using 2 step unlocked -// the sequence maybe: 1. check in tablet meta [return false] 2. add to tablet meta 3. remove from rowset meta manager -// 4. check in rowset meta manager return false. so that the rowset maybe checked return false it means it is useless and -// will be treated as a garbage. 
-bool Tablet::check_rowset_id(const RowsetId& rowset_id) { - std::shared_lock rdlock(_meta_lock); - if (StorageEngine::instance()->rowset_id_in_use(rowset_id)) { - return true; - } - for (auto& version_rowset : _rs_version_map) { - if (version_rowset.second->rowset_id() == rowset_id) { - return true; - } - } - for (auto& stale_version_rowset : _stale_rs_version_map) { - if (stale_version_rowset.second->rowset_id() == rowset_id) { - return true; - } - } - Status s = RowsetMetaManager::exists(_data_dir->get_meta(), tablet_uid(), rowset_id); - if (!s.is()) { - return true; - } - return false; -} - void Tablet::_print_missed_versions(const std::vector& missed_versions) const { std::stringstream ss; ss << tablet_id() << " has " << missed_versions.size() << " missed version:"; @@ -1954,66 +1939,37 @@ void Tablet::reset_compaction(CompactionType compaction_type) { } Status Tablet::create_initial_rowset(const int64_t req_version) { - Status res = Status::OK(); if (req_version < 1) { return Status::Error( "init version of tablet should at least 1. req.ver={}", req_version); } Version version(0, req_version); RowsetSharedPtr new_rowset; - do { - // there is no data in init rowset, so overlapping info is unknown. - std::unique_ptr rs_writer; - RowsetWriterContext context; - context.version = version; - context.rowset_state = VISIBLE; - context.segments_overlap = OVERLAP_UNKNOWN; - context.tablet_schema = tablet_schema(); - context.newest_write_timestamp = UnixSeconds(); - - if (!(res = create_rowset_writer(context, &rs_writer)).ok()) { - LOG(WARNING) << "failed to init rowset writer for tablet " << tablet_id(); - break; - } - - if (!(res = rs_writer->flush()).ok()) { - LOG(WARNING) << "failed to flush rowset writer for tablet " << tablet_id(); - break; - } - - if (!(res = rs_writer->build(new_rowset)).ok()) { - LOG(WARNING) << "failed to build rowset for tablet " << tablet_id(); - break; - } - - if (!(res = add_rowset(new_rowset)).ok()) { - LOG(WARNING) << "failed to add rowset for tablet " << tablet_id(); - break; - } - } while (false); - - // Unregister index and delete files(index and data) if failed - if (!res.ok()) { - LOG(WARNING) << "fail to create initial rowset. res=" << res << " version=" << req_version; - StorageEngine::instance()->add_unused_rowset(new_rowset); - return res; - } + // there is no data in init rowset, so overlapping info is unknown. 
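This patch unwraps Result<T> values either through DORIS_TRY or by hand. The explicit form below mirrors the call sites in this diff, with Result<T> behaving like an expected<T, Status>; DORIS_TRY's own definition is not part of this patch:

    Result<std::unique_ptr<RowsetWriter>> make_writer(Tablet& tablet,
                                                      RowsetWriterContext& ctx) {
        auto result = tablet.create_rowset_writer(ctx, /*vertical=*/false);
        if (!result.has_value()) [[unlikely]] {
            // unexpected(...) wraps the Status as the error alternative.
            return unexpected(std::move(result).error());
        }
        return std::move(result).value();
    }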
+ RowsetWriterContext context; + context.version = version; + context.rowset_state = VISIBLE; + context.segments_overlap = OVERLAP_UNKNOWN; + context.tablet_schema = tablet_schema(); + context.newest_write_timestamp = UnixSeconds(); + auto rs_writer = DORIS_TRY(create_rowset_writer(context, false)); + RETURN_IF_ERROR(rs_writer->flush()); + RETURN_IF_ERROR(rs_writer->build(new_rowset)); + RETURN_IF_ERROR(add_rowset(std::move(new_rowset))); set_cumulative_layer_point(req_version + 1); - return res; -} - -Status Tablet::create_vertical_rowset_writer(RowsetWriterContext& context, - std::unique_ptr* rowset_writer) { - context.rowset_id = StorageEngine::instance()->next_rowset_id(); - _init_context_common_fields(context); - return RowsetFactory::create_rowset_writer(context, true, rowset_writer); + return Status::OK(); } -Status Tablet::create_rowset_writer(RowsetWriterContext& context, - std::unique_ptr* rowset_writer) { +Result> Tablet::create_rowset_writer(RowsetWriterContext& context, + bool vertical) { context.rowset_id = StorageEngine::instance()->next_rowset_id(); _init_context_common_fields(context); - return RowsetFactory::create_rowset_writer(context, false, rowset_writer); + std::unique_ptr rowset_writer; + if (auto st = RowsetFactory::create_rowset_writer(context, vertical, &rowset_writer); !st.ok()) + [[unlikely]] { + return unexpected(std::move(st)); + } + return rowset_writer; } // create a rowset writer with rowset_id and seg_id @@ -2135,11 +2091,10 @@ Status Tablet::_cooldown_data() { return Status::OK(); } RowsetId new_rowset_id = StorageEngine::instance()->next_rowset_id(); - add_pending_remote_rowset(new_rowset_id.to_string()); + auto pending_rs_guard = StorageEngine::instance()->pending_remote_rowsets().add(new_rowset_id); Status st; Defer defer {[&] { if (!st.ok()) { - erase_pending_remote_rowset(new_rowset_id.to_string()); // reclaim the incomplete rowset data in remote storage record_unused_remote_rowset(new_rowset_id, dest_fs->id(), old_rowset->num_segments()); } @@ -2176,7 +2131,6 @@ Status Tablet::_cooldown_data() { _tablet_meta->set_cooldown_meta_id(cooldown_meta_id); } } - erase_pending_remote_rowset(new_rowset_id.to_string()); { std::shared_lock meta_rlock(_meta_lock); SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); @@ -2504,18 +2458,6 @@ Status Tablet::remove_all_remote_rowsets() { gc_pb.SerializeAsString()); } -static std::unordered_set s_pending_remote_rowsets; -static std::mutex s_pending_remote_rowsets_mtx; - -void Tablet::add_pending_remote_rowset(std::string rowset_id) { - std::lock_guard lock(s_pending_remote_rowsets_mtx); - s_pending_remote_rowsets.insert(std::move(rowset_id)); -} -void Tablet::erase_pending_remote_rowset(const std::string& rowset_id) { - std::lock_guard lock(s_pending_remote_rowsets_mtx); - s_pending_remote_rowsets.erase(rowset_id); -} - void Tablet::remove_unused_remote_files() { auto tablets = StorageEngine::instance()->tablet_manager()->get_all_tablet([](Tablet* t) { return t->tablet_meta()->cooldown_meta_id().initialized() && t->is_used() && @@ -2569,17 +2511,13 @@ void Tablet::remove_unused_remote_files() { return; } // get all cooldowned rowsets - std::unordered_set cooldowned_rowsets; - { - std::lock_guard lock(s_pending_remote_rowsets_mtx); - cooldowned_rowsets = s_pending_remote_rowsets; - } + RowsetIdUnorderedSet cooldowned_rowsets; UniqueId cooldown_meta_id; { std::shared_lock rlock(t->_meta_lock); - for (auto& rs_meta : t->_tablet_meta->all_rs_metas()) { + for (auto&& rs_meta : t->_tablet_meta->all_rs_metas()) { if 
(!rs_meta->is_local()) { - cooldowned_rowsets.insert(rs_meta->rowset_id().to_string()); + cooldowned_rowsets.insert(rs_meta->rowset_id()); } } cooldown_meta_id = t->_tablet_meta->cooldown_meta_id(); @@ -2592,31 +2530,19 @@ void Tablet::remove_unused_remote_files() { std::string remote_meta_path = fmt::format("{}.{}.meta", cooldown_replica_id, cooldown_term); // filter out the paths that should be reserved - // clang-format off - files.erase(std::remove_if(files.begin(), files.end(), [&](io::FileInfo& info) { - const std::string& path_str = info.file_name; - if (StringPiece(path_str).ends_with(".meta")) { - return path_str == remote_meta_path; + auto filter = [&](io::FileInfo& info) { + std::string_view filename = info.file_name; + if (filename.ends_with(".meta")) { + return filename == remote_meta_path; } - if (StringPiece(path_str).ends_with(".dat")) { - // extract rowset id. filename format: {rowset_id}_{segment_num}.dat - auto end = path_str.rfind('_'); - if (UNLIKELY(end == std::string::npos)) { - return false; - } - return !!cooldowned_rowsets.count(path_str.substr(0, end)); - } - if (StringPiece(path_str).ends_with(".idx")) { - // extract rowset id. filename format: {rowset_id}_{segment_num}_{index_id}.idx - auto end = path_str.find('_'); - if (UNLIKELY(end == std::string::npos)) { - return false; - } - return !!cooldowned_rowsets.count(path_str.substr(0, end)); + auto rowset_id = extract_rowset_id(filename); + if (rowset_id.hi == 0) { + return false; } - return false; - }), files.end()); - // clang-format on + return cooldowned_rowsets.contains(rowset_id) || + StorageEngine::instance()->pending_remote_rowsets().contains(rowset_id); + }; + files.erase(std::remove_if(files.begin(), files.end(), std::move(filter)), files.end()); if (files.empty()) { return; } @@ -2700,14 +2626,15 @@ void Tablet::update_max_version_schema(const TabletSchemaSPtr& tablet_schema) { Status Tablet::_get_segment_column_iterator( const BetaRowsetSharedPtr& rowset, uint32_t segid, const TabletColumn& target_column, + SegmentCacheHandle* segment_cache_handle, std::unique_ptr* column_iterator, OlapReaderStatistics* stats) { - SegmentCacheHandle segment_cache; - RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(rowset, &segment_cache, true)); + RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(rowset, segment_cache_handle, true)); // find segment auto it = std::find_if( - segment_cache.get_segments().begin(), segment_cache.get_segments().end(), + segment_cache_handle->get_segments().begin(), + segment_cache_handle->get_segments().end(), [&segid](const segment_v2::SegmentSharedPtr& seg) { return seg->id() == segid; }); - if (it == segment_cache.get_segments().end()) { + if (it == segment_cache_handle->get_segments().end()) { return Status::NotFound(fmt::format("rowset {} 's segemnt not found, seg_id {}", rowset->rowset_id().to_string(), segid)); } @@ -2724,7 +2651,8 @@ Status Tablet::_get_segment_column_iterator( } // fetch value by row column -Status Tablet::fetch_value_through_row_column(RowsetSharedPtr input_rowset, uint32_t segid, +Status Tablet::fetch_value_through_row_column(RowsetSharedPtr input_rowset, + const TabletSchema& tablet_schema, uint32_t segid, const std::vector& rowids, const std::vector& cids, vectorized::Block& block) { @@ -2737,13 +2665,13 @@ Status Tablet::fetch_value_through_row_column(RowsetSharedPtr input_rowset, uint BetaRowsetSharedPtr rowset = std::static_pointer_cast(input_rowset); CHECK(rowset); - const TabletSchemaSPtr tablet_schema = rowset->tablet_schema(); - 
CHECK(tablet_schema->store_row_column()); + CHECK(tablet_schema.store_row_column()); + SegmentCacheHandle segment_cache_handle; std::unique_ptr column_iterator; OlapReaderStatistics stats; RETURN_IF_ERROR(_get_segment_column_iterator(rowset, segid, - tablet_schema->column(BeConsts::ROW_STORE_COL), - &column_iterator, &stats)); + tablet_schema.column(BeConsts::ROW_STORE_COL), + &segment_cache_handle, &column_iterator, &stats)); // get and parse tuple row vectorized::MutableColumnPtr column_ptr = vectorized::ColumnString::create(); RETURN_IF_ERROR(column_iterator->read_by_rowids(rowids.data(), rowids.size(), column_ptr)); @@ -2755,7 +2683,7 @@ Status Tablet::fetch_value_through_row_column(RowsetSharedPtr input_rowset, uint std::vector default_values; default_values.resize(cids.size()); for (int i = 0; i < cids.size(); ++i) { - const TabletColumn& column = tablet_schema->column(cids[i]); + const TabletColumn& column = tablet_schema.column(cids[i]); vectorized::DataTypePtr type = vectorized::DataTypeFactory::instance().create_data_type(column); col_uid_to_idx[column.unique_id()] = i; @@ -2781,10 +2709,11 @@ Status Tablet::fetch_value_by_rowids(RowsetSharedPtr input_rowset, uint32_t segi // read row data BetaRowsetSharedPtr rowset = std::static_pointer_cast(input_rowset); CHECK(rowset); + SegmentCacheHandle segment_cache_handle; std::unique_ptr column_iterator; OlapReaderStatistics stats; - RETURN_IF_ERROR( - _get_segment_column_iterator(rowset, segid, tablet_column, &column_iterator, &stats)); + RETURN_IF_ERROR(_get_segment_column_iterator(rowset, segid, tablet_column, + &segment_cache_handle, &column_iterator, &stats)); RETURN_IF_ERROR(column_iterator->read_by_rowids(rowids.data(), rowids.size(), dst)); return Status::OK(); } @@ -2805,10 +2734,11 @@ Status Tablet::lookup_row_data(const Slice& encoded_key, const RowLocation& row_ CHECK(rowset); const TabletSchemaSPtr tablet_schema = rowset->tablet_schema(); CHECK(tablet_schema->store_row_column()); + SegmentCacheHandle segment_cache_handle; std::unique_ptr column_iterator; RETURN_IF_ERROR(_get_segment_column_iterator(rowset, row_location.segment_id, tablet_schema->column(BeConsts::ROW_STORE_COL), - &column_iterator, &stats)); + &segment_cache_handle, &column_iterator, &stats)); // get and parse tuple row vectorized::MutableColumnPtr column_ptr = vectorized::ColumnString::create(); std::vector rowids {static_cast(row_location.row_id)}; @@ -3047,14 +2977,6 @@ Status Tablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, continue; } if (is_partial_update && rowset_writer != nullptr) { - if (delete_bitmap->contains( - {rowset_id, seg->id(), DeleteBitmap::TEMP_VERSION_FOR_DELETE_SIGN}, - row_id)) { - LOG(INFO) - << "DEBUG: skip a delete sign column while calc_segment_delete_bitmap " - << "processing confict for partial update"; - continue; - } // In publish version, record rows to be deleted for concurrent update // For example, if version 5 and 6 update a row, but version 6 only see // version 4 when write, and when publish version, version 5's value will @@ -3234,8 +3156,8 @@ Status Tablet::read_columns_by_plan(TabletSchemaSPtr tablet_schema, (*read_index)[id_and_pos.pos] = read_idx++; } if (has_row_column) { - auto st = fetch_value_through_row_column(rowset_iter->second, seg_it.first, rids, - cids_to_read, block); + auto st = fetch_value_through_row_column(rowset_iter->second, *tablet_schema, + seg_it.first, rids, cids_to_read, block); if (!st.ok()) { LOG(WARNING) << "failed to fetch value through row column"; return st; @@ -3572,7 +3494,8 
@@ Status Tablet::check_rowid_conversion( Status Tablet::all_rs_id(int64_t max_version, RowsetIdUnorderedSet* rowset_ids) const { // Ensure that the obtained versions of rowsets are continuous std::vector version_path; - RETURN_IF_ERROR(capture_consistent_versions(Version(0, max_version), &version_path)); + RETURN_IF_ERROR( + capture_consistent_versions(Version(0, max_version), &version_path, false, false)); for (auto& ver : version_path) { if (ver.second == 1) { // [0-1] rowset is empty for each tablet, skip it diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index 58a2c7f8f6c127..5efeb25dd7ca1d 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -64,7 +64,6 @@ class BaseCompaction; class FullCompaction; class SingleReplicaCompaction; class RowsetWriter; -struct TabletTxnInfo; struct RowsetWriterContext; class RowIdConversion; class TTabletInfo; @@ -90,9 +89,6 @@ extern const std::chrono::seconds TRACE_TABLET_LOCK_THRESHOLD; class Tablet final : public BaseTablet { public: - static TabletSharedPtr create_tablet_from_meta(TabletMetaSharedPtr tablet_meta, - DataDir* data_dir = nullptr); - Tablet(TabletMetaSharedPtr tablet_meta, DataDir* data_dir, const std::string_view& cumulative_compaction_type = ""); @@ -161,7 +157,7 @@ class Tablet final : public BaseTablet { const RowsetSharedPtr rowset_with_max_version() const; - static RowsetMetaSharedPtr rowset_meta_with_max_schema_version( + static TabletSchemaSPtr tablet_schema_with_merged_max_schema_version( const std::vector& rowset_metas); Status add_inc_rowset(const RowsetSharedPtr& rowset); @@ -173,9 +169,10 @@ class Tablet final : public BaseTablet { // Given spec_version, find a continuous version path and store it in version_path. // If quiet is true, then only "does this path exist" is returned. + // If skip_missing_version is true, return OK even if there are missing versions. Status capture_consistent_versions(const Version& spec_version, std::vector* version_path, - bool quiet = false) const; + bool skip_missing_version, bool quiet) const; // if quiet is true, no error log will be printed if there are missing versions Status check_version_integrity(const Version& version, bool quiet = false); bool check_version_exist(const Version& version) const; @@ -184,8 +181,9 @@ class Tablet final : public BaseTablet { Status capture_consistent_rowsets(const Version& spec_version, std::vector* rowsets) const; - Status capture_rs_readers(const Version& spec_version, - std::vector* rs_splits) const override; + // If skip_missing_version is true, skip versions if they are missing.
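+ //
+ // A minimal call sketch (illustrative only; the caller context and the flag value
+ // below are assumptions, not part of this change):
+ //
+ //   std::vector<RowSetSplits> splits;
+ //   // tolerate holes in the version chain for a best-effort read:
+ //   RETURN_IF_ERROR(tablet->capture_rs_readers(Version(0, spec_version), &splits,
+ //                                              /*skip_missing_version=*/true));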
+ Status capture_rs_readers(const Version& spec_version, std::vector* rs_splits, + bool skip_missing_version) const override; Status capture_rs_readers(const std::vector& version_path, std::vector* rs_splits) const; @@ -262,7 +260,6 @@ class Tablet final : public BaseTablet { void check_tablet_path_exists(); bool check_path(const std::string& check_path) const; - bool check_rowset_id(const RowsetId& rowset_id); TabletInfo get_tablet_info() const; @@ -339,8 +336,8 @@ class Tablet final : public BaseTablet { const TabletSchemaSPtr& tablet_schema_unlocked() const { return _max_version_schema; } - Status create_rowset_writer(RowsetWriterContext& context, - std::unique_ptr* rowset_writer) override; + Result> create_rowset_writer(RowsetWriterContext& context, + bool vertical) override; Status create_transient_rowset_writer(RowsetSharedPtr rowset_ptr, std::unique_ptr* rowset_writer, @@ -348,9 +345,6 @@ class Tablet final : public BaseTablet { Status create_transient_rowset_writer(RowsetWriterContext& context, const RowsetId& rowset_id, std::unique_ptr* rowset_writer); - Status create_vertical_rowset_writer(RowsetWriterContext& context, - std::unique_ptr* rowset_writer); - Status create_rowset(const RowsetMetaSharedPtr& rowset_meta, RowsetSharedPtr* rowset); // MUST hold EXCLUSIVE `_meta_lock` @@ -397,12 +391,6 @@ class Tablet final : public BaseTablet { static void remove_unused_remote_files(); - // If a rowset is to be written to remote filesystem, MUST add it to `pending_remote_rowsets` before uploading, - // and then erase it from `pending_remote_rowsets` after it has been insert to the Tablet. - // `remove_unused_remote_files` MUST NOT delete files of these pending rowsets. - static void add_pending_remote_rowset(std::string rowset_id); - static void erase_pending_remote_rowset(const std::string& rowset_id); - uint32_t calc_cold_data_compaction_score() const; std::mutex& get_cold_compaction_lock() { return _cold_compaction_lock; } @@ -436,7 +424,11 @@ class Tablet final : public BaseTablet { const TabletColumn& tablet_column, vectorized::MutableColumnPtr& dst); - Status fetch_value_through_row_column(RowsetSharedPtr input_rowset, uint32_t segid, + // We use the TabletSchema from the caller because the TabletSchema in the rowset's meta + // may be outdated due to schema change. Also note that the cids should indicate the indexes + // of the columns in the TabletSchema passed in.
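+ //
+ // A hedged usage sketch (variable names and values are assumptions, not part of
+ // this change); the schema is supplied by the caller, not read from rowset meta:
+ //
+ //   std::vector<uint32_t> rowids {10, 42};   // rows to fetch within the segment
+ //   std::vector<uint32_t> cids {0, 1};       // indexes into the passed-in schema
+ //   vectorized::Block block;                 // columns prepared to match cids
+ //   RETURN_IF_ERROR(tablet->fetch_value_through_row_column(
+ //           rowset, *tablet_schema, /*segid=*/0, rowids, cids, block));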
+ Status fetch_value_through_row_column(RowsetSharedPtr input_rowset, + const TabletSchema& tablet_schema, uint32_t segid, const std::vector& rowids, const std::vector& cids, vectorized::Block& block); @@ -516,11 +508,16 @@ class Tablet final : public BaseTablet { RowsetSharedPtr get_rowset(const RowsetId& rowset_id); - void traverse_rowsets(std::function visitor) { + void traverse_rowsets(std::function visitor, + bool include_stale = false) { std::shared_lock rlock(_meta_lock); for (auto& [v, rs] : _rs_version_map) { visitor(rs); } + if (!include_stale) return; + for (auto& [v, rs] : _stale_rs_version_map) { + visitor(rs); + } } std::vector get_binlog_filepath(std::string_view binlog_version) const; @@ -566,6 +563,7 @@ class Tablet final : public BaseTablet { std::vector* rowsets = nullptr); Status _get_segment_column_iterator( const BetaRowsetSharedPtr& rowset, uint32_t segid, const TabletColumn& target_column, + SegmentCacheHandle* segment_cache_handle, std::unique_ptr* column_iterator, OlapReaderStatistics* stats); diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp index fb5b759929d43d..a49deea3606654 100644 --- a/be/src/olap/tablet_manager.cpp +++ b/be/src/olap/tablet_manager.cpp @@ -32,7 +32,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/logging.h" @@ -499,10 +498,9 @@ TabletSharedPtr TabletManager::_create_tablet_meta_and_dir_unlocked( } } - TabletSharedPtr new_tablet = Tablet::create_tablet_from_meta(tablet_meta, data_dir); + TabletSharedPtr new_tablet = std::make_shared(std::move(tablet_meta), data_dir); COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "CreateTabletFromMeta", parent_timer_name), static_cast(watch.reset())); - DCHECK(new_tablet != nullptr); return new_tablet; } return nullptr; @@ -618,12 +616,24 @@ TabletSharedPtr TabletManager::_get_tablet_unlocked(TTabletId tablet_id, bool in TabletSharedPtr tablet; tablet = _get_tablet_unlocked(tablet_id); if (tablet == nullptr && include_deleted) { - std::shared_lock rdlock(_shutdown_tablets_lock); - for (auto& deleted_tablet : _shutdown_tablets) { - CHECK(deleted_tablet != nullptr) << "deleted tablet is nullptr"; - if (deleted_tablet->tablet_id() == tablet_id) { - tablet = deleted_tablet; - break; + { + std::shared_lock rdlock(_shutdown_tablets_lock); + for (auto& deleted_tablet : _shutdown_tablets) { + CHECK(deleted_tablet != nullptr) << "deleted tablet is nullptr"; + if (deleted_tablet->tablet_id() == tablet_id) { + tablet = deleted_tablet; + break; + } + } + } + if (tablet == nullptr) { + std::shared_lock rdlock(_shutdown_deleting_tablets_lock); + for (auto& deleted_tablet : _shutdown_deleting_tablets) { + CHECK(deleted_tablet != nullptr) << "deleted tablet is nullptr"; + if (deleted_tablet->tablet_id() == tablet_id) { + tablet = deleted_tablet; + break; + } } } } @@ -634,7 +644,7 @@ TabletSharedPtr TabletManager::_get_tablet_unlocked(TTabletId tablet_id, bool in } return nullptr; } - +#ifndef BE_TEST if (!tablet->is_used()) { LOG(WARNING) << "tablet cannot be used. 
tablet=" << tablet_id; if (err != nullptr) { @@ -642,6 +652,7 @@ TabletSharedPtr TabletManager::_get_tablet_unlocked(TTabletId tablet_id, bool in } return nullptr; } +#endif return tablet; } @@ -824,11 +835,7 @@ Status TabletManager::load_tablet_from_meta(DataDir* data_dir, TTabletId tablet_ tablet_meta->set_tablet_state(TABLET_RUNNING); } - TabletSharedPtr tablet = Tablet::create_tablet_from_meta(tablet_meta, data_dir); - if (tablet == nullptr) { - return Status::Error( - "fail to load tablet. tablet_id={}, schema_hash={}", tablet_id, schema_hash); - } + TabletSharedPtr tablet = std::make_shared(std::move(tablet_meta), data_dir); // NOTE: method load_tablet_from_meta could be called by two cases as below // case 1: BE start; @@ -847,7 +854,7 @@ Status TabletManager::load_tablet_from_meta(DataDir* data_dir, TTabletId tablet_ } } - if (tablet_meta->tablet_state() == TABLET_SHUTDOWN) { + if (tablet->tablet_meta()->tablet_state() == TABLET_SHUTDOWN) { { std::lock_guard shutdown_tablets_wrlock(_shutdown_tablets_lock); _shutdown_tablets.push_back(tablet); @@ -1045,9 +1052,17 @@ Status TabletManager::start_trash_sweep() { clean_num = 0; // should get write lock here, because it will remove tablet from shut_down_tablets // and get tablet will access shut_down_tablets - std::lock_guard wrlock(_shutdown_tablets_lock); - auto it = _shutdown_tablets.begin(); - while (it != _shutdown_tablets.end()) { + { + std::lock_guard wrlock1(_shutdown_tablets_lock); + std::lock_guard wrlock2(_shutdown_deleting_tablets_lock); + for (const auto& tablet : _shutdown_tablets) { + _shutdown_deleting_tablets.push_back(tablet); + } + _shutdown_tablets.clear(); + } + std::lock_guard wrlock(_shutdown_deleting_tablets_lock); + auto it = _shutdown_deleting_tablets.begin(); + while (it != _shutdown_deleting_tablets.end()) { // check if the meta has the tablet info and its state is shutdown if (it->use_count() > 1) { // it means current tablet is referenced by other thread @@ -1066,7 +1081,7 @@ Status TabletManager::start_trash_sweep() { << " old tablet_uid=" << (*it)->tablet_uid() << " cur tablet_uid=" << tablet_meta->tablet_uid(); // remove it from list - it = _shutdown_tablets.erase(it); + it = _shutdown_deleting_tablets.erase(it); continue; } // move data to trash @@ -1095,7 +1110,7 @@ Status TabletManager::start_trash_sweep() { << "tablet_id=" << (*it)->tablet_id() << ", schema_hash=" << (*it)->schema_hash() << ", tablet_path=" << tablet_path; - it = _shutdown_tablets.erase(it); + it = _shutdown_deleting_tablets.erase(it); ++clean_num; } else { // if could not find tablet info in meta store, then check if dir existed @@ -1115,7 +1130,7 @@ Status TabletManager::start_trash_sweep() { << "tablet_id=" << (*it)->tablet_id() << ", schema_hash=" << (*it)->schema_hash() << ", tablet_path=" << tablet_path; - it = _shutdown_tablets.erase(it); + it = _shutdown_deleting_tablets.erase(it); } } @@ -1262,11 +1277,9 @@ Status TabletManager::_create_tablet_meta_unlocked(const TCreateTabletReq& reque VLOG_NOTICE << "creating tablet meta. next_unique_id=" << next_unique_id; // We generate a new tablet_uid for this new tablet. 
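+ // (Presumably a freshly generated uid keeps a re-created tablet distinguishable
+ // from any earlier incarnation with the same tablet_id in the meta store.)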
- uint64_t shard_id = 0; - RETURN_NOT_OK_STATUS_WITH_WARN(store->get_shard(&shard_id), "fail to get root path shard"); - Status res = TabletMeta::create(request, TabletUid::gen_uid(), shard_id, next_unique_id, - col_idx_to_unique_id, tablet_meta); - RETURN_IF_ERROR(res); + uint64_t shard_id = store->get_shard(); + *tablet_meta = TabletMeta::create(request, TabletUid::gen_uid(), shard_id, next_unique_id, + col_idx_to_unique_id); if (request.__isset.storage_format) { if (request.storage_format == TStorageFormat::DEFAULT) { (*tablet_meta) @@ -1280,7 +1293,7 @@ Status TabletManager::_create_tablet_meta_unlocked(const TCreateTabletReq& reque request.storage_format); } } - return res; + return Status::OK(); } TabletSharedPtr TabletManager::_get_tablet_unlocked(TTabletId tablet_id) { diff --git a/be/src/olap/tablet_manager.h b/be/src/olap/tablet_manager.h index dc97ae3c90f483..0afdbb3c847802 100644 --- a/be/src/olap/tablet_manager.h +++ b/be/src/olap/tablet_manager.h @@ -239,6 +239,10 @@ class TabletManager { std::map> _partition_tablet_map; std::vector _shutdown_tablets; + // gc thread will move _shutdown_tablets to _shutdown_deleting_tablets + std::shared_mutex _shutdown_deleting_tablets_lock; + std::list _shutdown_deleting_tablets; + std::mutex _tablet_stat_cache_mutex; std::shared_ptr> _tablet_stat_list_cache = std::make_shared>(); diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index b5f3b4fd36d748..76cf14a75ba74d 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -48,15 +48,15 @@ using std::vector; namespace doris { using namespace ErrorCode; -Status TabletMeta::create(const TCreateTabletReq& request, const TabletUid& tablet_uid, - uint64_t shard_id, uint32_t next_unique_id, - const unordered_map& col_ordinal_to_unique_id, - TabletMetaSharedPtr* tablet_meta) { +TabletMetaSharedPtr TabletMeta::create( + const TCreateTabletReq& request, const TabletUid& tablet_uid, uint64_t shard_id, + uint32_t next_unique_id, + const unordered_map& col_ordinal_to_unique_id) { std::optional binlog_config; if (request.__isset.binlog_config) { binlog_config = request.binlog_config; } - *tablet_meta = std::make_shared( + return std::make_shared( request.table_id, request.partition_id, request.tablet_id, request.replica_id, request.tablet_schema.schema_hash, shard_id, request.tablet_schema, next_unique_id, col_ordinal_to_unique_id, tablet_uid, @@ -69,7 +69,6 @@ Status TabletMeta::create(const TCreateTabletReq& request, const TabletUid& tabl request.time_series_compaction_goal_size_mbytes, request.time_series_compaction_file_count_threshold, request.time_series_compaction_time_threshold_seconds); - return Status::OK(); } TabletMeta::TabletMeta() @@ -364,7 +363,8 @@ void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tco } for (size_t i = 0; i < tcolumn.children_column.size(); i++) { ColumnPB* children_column = column->add_children_columns(); - init_column_from_tcolumn(i, tcolumn.children_column[i], children_column); + init_column_from_tcolumn(tcolumn.children_column[i].col_unique_id, + tcolumn.children_column[i], children_column); } } @@ -1074,4 +1074,26 @@ std::shared_ptr DeleteBitmap::get_agg(const BitmapKey& bmk) co std::atomic DeleteBitmap::AggCache::s_repr {nullptr}; +std::string tablet_state_name(TabletState state) { + switch (state) { + case TABLET_NOTREADY: + return "TABLET_NOTREADY"; + + case TABLET_RUNNING: + return "TABLET_RUNNING"; + + case TABLET_TOMBSTONED: + return "TABLET_TOMBSTONED"; + + case TABLET_STOPPED: + return 
"TABLET_STOPPED"; + + case TABLET_SHUTDOWN: + return "TABLET_SHUTDOWN"; + + default: + return "TabletState(" + std::to_string(state) + ")"; + } +} + } // namespace doris diff --git a/be/src/olap/tablet_meta.h b/be/src/olap/tablet_meta.h index 2eed63e51599a3..60cc485882aee7 100644 --- a/be/src/olap/tablet_meta.h +++ b/be/src/olap/tablet_meta.h @@ -91,10 +91,10 @@ class TBinlogConfig; // The concurrency control is handled in Tablet Class, not in this class. class TabletMeta { public: - static Status create(const TCreateTabletReq& request, const TabletUid& tablet_uid, - uint64_t shard_id, uint32_t next_unique_id, - const std::unordered_map& col_ordinal_to_unique_id, - TabletMetaSharedPtr* tablet_meta); + static TabletMetaSharedPtr create( + const TCreateTabletReq& request, const TabletUid& tablet_uid, uint64_t shard_id, + uint32_t next_unique_id, + const std::unordered_map& col_ordinal_to_unique_id); TabletMeta(); TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id, int64_t replica_id, @@ -336,7 +336,6 @@ class DeleteBitmap { // tablet's delete bitmap we can use arbitary version number in BitmapKey. Here we define some version numbers // for specific usage during this periods to avoid conflicts constexpr static inline uint64_t TEMP_VERSION_COMMON = 0; - constexpr static inline uint64_t TEMP_VERSION_FOR_DELETE_SIGN = 1; /** * @@ -633,6 +632,8 @@ inline bool TabletMeta::all_beta() const { return true; } +std::string tablet_state_name(TabletState state); + // Only for unit test now. bool operator==(const TabletMeta& a, const TabletMeta& b); bool operator!=(const TabletMeta& a, const TabletMeta& b); diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index 7b7f5461b80ae3..d2b16a907a0b3b 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -27,7 +27,6 @@ #include // IWYU pragma: keep #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/consts.h" #include "common/status.h" @@ -721,7 +720,7 @@ void TabletSchema::clear_columns() { } void TabletSchema::init_from_pb(const TabletSchemaPB& schema) { - SCOPED_MEM_COUNT(&_mem_size); + SCOPED_MEM_COUNT_BY_HOOK(&_mem_size); _keys_type = schema.keys_type(); _num_columns = 0; _num_variant_columns = 0; diff --git a/be/src/olap/task/engine_clone_task.cpp b/be/src/olap/task/engine_clone_task.cpp index 6edc746da3afa2..31b834bdafc5dc 100644 --- a/be/src/olap/task/engine_clone_task.cpp +++ b/be/src/olap/task/engine_clone_task.cpp @@ -279,7 +279,7 @@ Status EngineCloneTask::_make_and_download_snapshots(DataDir& data_dir, TBackend* src_host, string* snapshot_path, const std::vector& missed_versions, bool* allow_incremental_clone) { - Status status = Status::OK(); + Status status; const auto& token = _master_info.token; @@ -288,21 +288,13 @@ Status EngineCloneTask::_make_and_download_snapshots(DataDir& data_dir, timeout_s = _clone_req.timeout_s; } - for (auto& src : _clone_req.src_backends) { + for (auto&& src : _clone_req.src_backends) { // Make snapshot in remote olap engine *src_host = src; // make snapshot status = _make_snapshot(src.host, src.be_port, _clone_req.tablet_id, _clone_req.schema_hash, timeout_s, missed_versions, snapshot_path, allow_incremental_clone); - if (status.ok()) { - LOG_INFO("successfully make snapshot in remote BE") - .tag("host", src.host) - .tag("port", src.be_port) - .tag("tablet", _clone_req.tablet_id) - .tag("snapshot_path", *snapshot_path) - .tag("signature", _signature) - .tag("missed_versions", missed_versions); 
- } else { + if (!status.ok()) [[unlikely]] { LOG_WARNING("failed to make snapshot in remote BE") .tag("host", src.host) .tag("port", src.be_port) @@ -310,13 +302,28 @@ Status EngineCloneTask::_make_and_download_snapshots(DataDir& data_dir, .tag("signature", _signature) .tag("missed_versions", missed_versions) .error(status); - continue; + continue; // Try another BE } - + LOG_INFO("successfully make snapshot in remote BE") + .tag("host", src.host) + .tag("port", src.be_port) + .tag("tablet", _clone_req.tablet_id) + .tag("snapshot_path", *snapshot_path) + .tag("signature", _signature) + .tag("missed_versions", missed_versions); + Defer defer {[host = src.host, port = src.be_port, &snapshot_path = *snapshot_path, this] { + // TODO(plat1ko): Async release snapshot + auto st = _release_snapshot(host, port, snapshot_path); + if (!st.ok()) [[unlikely]] { + LOG_WARNING("failed to release snapshot in remote BE") + .tag("host", host) + .tag("port", port) + .tag("snapshot_path", snapshot_path) + .error(st); + } + }}; std::string remote_url_prefix; { - // TODO(zc): if snapshot path has been returned from source, it is some strange to - // concat tablet_id and schema hash here. std::stringstream ss; if (snapshot_path->back() == '/') { ss << "http://" << get_host_port(src.host, src.http_port) << HTTP_REQUEST_PREFIX @@ -327,37 +334,21 @@ Status EngineCloneTask::_make_and_download_snapshots(DataDir& data_dir, << HTTP_REQUEST_TOKEN_PARAM << token << HTTP_REQUEST_FILE_PARAM << *snapshot_path << "/" << _clone_req.tablet_id << "/" << _clone_req.schema_hash << "/"; } - remote_url_prefix = ss.str(); } status = _download_files(&data_dir, remote_url_prefix, local_data_path); - // when there is an error, keep this program executing to release snapshot - - if (status.ok()) { - // change all rowset ids because they maybe its id same with local rowset - status = SnapshotManager::instance()->convert_rowset_ids( - local_data_path, _clone_req.tablet_id, _clone_req.replica_id, - _clone_req.partition_id, _clone_req.schema_hash); - } else { + if (!status.ok()) [[unlikely]] { LOG_WARNING("failed to download snapshot from remote BE") .tag("url", remote_url_prefix) .error(status); + continue; // Try another BE } - - // Release snapshot, if failed, ignore it. 
OLAP engine will drop useless snapshot - auto st = _release_snapshot(src.host, src.be_port, *snapshot_path); - if (!st.ok()) { - LOG_WARNING("failed to release snapshot in remote BE") - .tag("host", src.host) - .tag("port", src.be_port) - .tag("snapshot_path", *snapshot_path) - .error(status); - // DON'T change the status - } - if (status.ok()) { - break; - } + // No need to try again with another BE + _pending_rs_guards = DORIS_TRY(SnapshotManager::instance()->convert_rowset_ids( + local_data_path, _clone_req.tablet_id, _clone_req.replica_id, + _clone_req.partition_id, _clone_req.schema_hash)); + break; } // clone copy from one backend return status; } diff --git a/be/src/olap/task/engine_clone_task.h b/be/src/olap/task/engine_clone_task.h index b8263b0658992d..f98561148ed327 100644 --- a/be/src/olap/task/engine_clone_task.h +++ b/be/src/olap/task/engine_clone_task.h @@ -94,6 +94,7 @@ class EngineCloneTask : public EngineTask { int64_t _copy_size; int64_t _copy_time_ms; std::shared_ptr _mem_tracker; + std::vector _pending_rs_guards; }; // EngineTask } // namespace doris diff --git a/be/src/olap/task/engine_publish_version_task.cpp b/be/src/olap/task/engine_publish_version_task.cpp index d914816e0adcf3..040b5a7edea9f3 100644 --- a/be/src/olap/task/engine_publish_version_task.cpp +++ b/be/src/olap/task/engine_publish_version_task.cpp @@ -244,14 +244,17 @@ Status EnginePublishVersionTask::finish() { add_error_tablet_id(tablet_id); if (res.ok()) { res = Status::Error( - "tablet {} not exists version {}", tablet_id, + "tablet {} with state {} does not contain version {}", tablet_id, + tablet_state_name(tablet->tablet_state()), par_ver_info.version); } - LOG(WARNING) << "publish version failed on transaction, tablet version not " - "exists. " - << "transaction_id=" << transaction_id - << ", tablet_id=" << tablet_id - << ", version=" << par_ver_info.version; + LOG(WARNING) + << "publish version failed on transaction, tablet version does not " "exist. 
" + << "transaction_id=" << transaction_id + << ", tablet_id=" << tablet_id + << ", tablet_state=" << tablet_state_name(tablet->tablet_state()) + << ", version=" << par_ver_info.version; } } } diff --git a/be/src/olap/task/engine_storage_migration_task.cpp b/be/src/olap/task/engine_storage_migration_task.cpp index 82335c2e6d113a..e896b9f3a8f512 100644 --- a/be/src/olap/task/engine_storage_migration_task.cpp +++ b/be/src/olap/task/engine_storage_migration_task.cpp @@ -151,8 +151,9 @@ Status EngineStorageMigrationTask::_gen_and_write_header_to_hdr_file( // it will change rowset id and its create time // rowset create time is useful when load tablet from meta to check which tablet is the tablet to load - return SnapshotManager::instance()->convert_rowset_ids( - full_path, tablet_id, _tablet->replica_id(), _tablet->partition_id(), schema_hash); + _pending_rs_guards = DORIS_TRY(SnapshotManager::instance()->convert_rowset_ids( + full_path, tablet_id, _tablet->replica_id(), _tablet->partition_id(), schema_hash)); + return Status::OK(); } Status EngineStorageMigrationTask::_reload_tablet(const std::string& full_path) { @@ -213,7 +214,7 @@ Status EngineStorageMigrationTask::_migrate() { RETURN_IF_ERROR(_get_versions(start_version, &end_version, &consistent_rowsets)); // TODO(ygl): the tablet should not under schema change or rollup or load - RETURN_IF_ERROR(_dest_store->get_shard(&shard)); + shard = _dest_store->get_shard(); auto shard_path = fmt::format("{}/{}/{}", _dest_store->path(), DATA_PREFIX, shard); full_path = SnapshotManager::get_schema_hash_full_path(_tablet, shard_path); diff --git a/be/src/olap/task/engine_storage_migration_task.h b/be/src/olap/task/engine_storage_migration_task.h index d392de9cf2f7ee..ce80fbe4e85907 100644 --- a/be/src/olap/task/engine_storage_migration_task.h +++ b/be/src/olap/task/engine_storage_migration_task.h @@ -26,6 +26,7 @@ #include #include "common/status.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset.h" #include "olap/tablet.h" #include "olap/tablet_meta.h" @@ -79,6 +80,7 @@ class EngineStorageMigrationTask : public EngineTask { // destination data dir DataDir* _dest_store; int64_t _task_start_time; + std::vector _pending_rs_guards; }; // EngineTask } // namespace doris diff --git a/be/src/olap/task/index_builder.cpp b/be/src/olap/task/index_builder.cpp index e7b3234025fe96..6969edfe05c57b 100644 --- a/be/src/olap/task/index_builder.cpp +++ b/be/src/olap/task/index_builder.cpp @@ -58,10 +58,11 @@ Status IndexBuilder::update_inverted_index_info() { << ", is_drop_op=" << _is_drop_op; // index ids that will not be linked std::set without_index_uids; - for (auto i = 0; i < _input_rowsets.size(); ++i) { - auto input_rowset = _input_rowsets[i]; + _output_rowsets.reserve(_input_rowsets.size()); + _pending_rs_guards.reserve(_input_rowsets.size()); + for (auto&& input_rowset : _input_rowsets) { TabletSchemaSPtr output_rs_tablet_schema = std::make_shared(); - auto input_rs_tablet_schema = input_rowset->tablet_schema(); + const auto& input_rs_tablet_schema = input_rowset->tablet_schema(); output_rs_tablet_schema->copy_from(*input_rs_tablet_schema); if (_is_drop_op) { // base on input rowset's tablet_schema to build @@ -97,7 +98,6 @@ Status IndexBuilder::update_inverted_index_info() { RETURN_IF_ERROR(input_rowset->create_reader(&input_rs_reader)); // construct output rowset writer - std::unique_ptr output_rs_writer; RowsetWriterContext context; context.version = input_rs_reader->version(); context.rowset_state = VISIBLE; @@ -105,10 
+105,8 @@ Status IndexBuilder::update_inverted_index_info() { context.tablet_schema = output_rs_tablet_schema; context.newest_write_timestamp = input_rs_reader->newest_write_timestamp(); context.fs = input_rs_reader->rowset()->rowset_meta()->fs(); - Status status = _tablet->create_rowset_writer(context, &output_rs_writer); - if (!status.ok()) { - return Status::Error(status.to_string()); - } + auto output_rs_writer = DORIS_TRY(_tablet->create_rowset_writer(context, false)); + _pending_rs_guards.push_back(StorageEngine::instance()->add_pending_rowset(context)); // if without_index_uids is not empty, copy _alter_index_ids to it // else just use _alter_index_ids to avoid copy @@ -479,12 +477,13 @@ Status IndexBuilder::do_build_inverted_index() { } Status IndexBuilder::modify_rowsets(const Merger::Statistics* stats) { - for (auto rowset_ptr : _output_rowsets) { - auto rowset_id = rowset_ptr->rowset_id(); - if (StorageEngine::instance()->check_rowset_id_in_unused_rowsets(rowset_id)) { - DCHECK(false) << "output rowset: " << rowset_id.to_string() << " in unused rowsets"; + DCHECK(std::ranges::all_of(_output_rowsets.begin(), _output_rowsets.end(), [](auto&& rs) { + if (StorageEngine::instance()->check_rowset_id_in_unused_rowsets(rs->rowset_id())) { + LOG(ERROR) << "output rowset: " << rs->rowset_id() << " in unused rowsets"; + return false; } - } + return true; + })); if (_tablet->keys_type() == KeysType::UNIQUE_KEYS && _tablet->enable_unique_key_merge_on_write()) { @@ -523,9 +522,8 @@ Status IndexBuilder::modify_rowsets(const Merger::Statistics* stats) { } void IndexBuilder::gc_output_rowset() { - for (auto output_rowset : _output_rowsets) { + for (auto&& output_rowset : _output_rowsets) { if (!output_rowset->is_local()) { - Tablet::erase_pending_remote_rowset(output_rowset->rowset_id().to_string()); _tablet->record_unused_remote_rowset(output_rowset->rowset_id(), output_rowset->rowset_meta()->resource_id(), output_rowset->num_segments()); diff --git a/be/src/olap/task/index_builder.h b/be/src/olap/task/index_builder.h index 2ff266f5a770d8..7baefbc035a78e 100644 --- a/be/src/olap/task/index_builder.h +++ b/be/src/olap/task/index_builder.h @@ -20,6 +20,7 @@ #include "olap/merger.h" #include "olap/olap_common.h" #include "olap/olap_define.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" #include "olap/rowset/segment_v2/inverted_index_writer.h" #include "olap/tablet.h" @@ -67,6 +68,7 @@ class IndexBuilder { std::set _alter_index_ids; std::vector _input_rowsets; std::vector _output_rowsets; + std::vector _pending_rs_guards; std::vector _input_rs_readers; std::unique_ptr _olap_data_convertor; // "" -> InvertedIndexColumnWriter diff --git a/be/src/olap/txn_manager.cpp b/be/src/olap/txn_manager.cpp index 1ec7899b460b05..2d5ce73b867d99 100644 --- a/be/src/olap/txn_manager.cpp +++ b/be/src/olap/txn_manager.cpp @@ -36,6 +36,7 @@ #include "olap/data_dir.h" #include "olap/delta_writer.h" #include "olap/olap_common.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_meta_manager.h" #include "olap/schema_change.h" @@ -62,6 +63,46 @@ using std::vector; namespace doris { using namespace ErrorCode; +struct TabletTxnInfo { + PUniqueId load_id; + RowsetSharedPtr rowset; + PendingRowsetGuard pending_rs_guard; + bool unique_key_merge_on_write {false}; + DeleteBitmapPtr delete_bitmap; + // records rowsets calc in commit txn + RowsetIdUnorderedSet rowset_ids; + int64_t creation_time; + bool ingest 
{false}; + std::shared_ptr partial_update_info; + TxnState state {TxnState::PREPARED}; + + TabletTxnInfo() = default; + + TabletTxnInfo(PUniqueId load_id, RowsetSharedPtr rowset) + : load_id(load_id), rowset(rowset), creation_time(UnixSeconds()) {} + + TabletTxnInfo(PUniqueId load_id, RowsetSharedPtr rowset, bool ingest_arg) + : load_id(load_id), rowset(rowset), creation_time(UnixSeconds()), ingest(ingest_arg) {} + + TabletTxnInfo(PUniqueId load_id, RowsetSharedPtr rowset, bool merge_on_write, + DeleteBitmapPtr delete_bitmap, const RowsetIdUnorderedSet& ids) + : load_id(load_id), + rowset(rowset), + unique_key_merge_on_write(merge_on_write), + delete_bitmap(delete_bitmap), + rowset_ids(ids), + creation_time(UnixSeconds()) {} + + void prepare() { state = TxnState::PREPARED; } + void commit() { state = TxnState::COMMITTED; } + void rollback() { state = TxnState::ROLLEDBACK; } + void abort() { + if (state == TxnState::PREPARED) { + state = TxnState::ABORTED; + } + } +}; + TxnManager::TxnManager(int32_t txn_map_shard_size, int32_t txn_shard_size) : _txn_map_shard_size(txn_map_shard_size), _txn_shard_size(txn_shard_size) { DCHECK_GT(_txn_map_shard_size, 0); @@ -113,11 +154,12 @@ Status TxnManager::prepare_txn(TPartitionId partition_id, TTransactionId transac } // found load for txn,tablet - TabletTxnInfo& load_info = load_itr->second; + auto& load_info = load_itr->second; // case 1: user commit rowset, then the load id must be equal // check if load id is equal - if (load_info.load_id.hi() == load_id.hi() && load_info.load_id.lo() == load_id.lo() && - load_info.rowset != nullptr) { + if (load_info->load_id.hi() == load_id.hi() && load_info->load_id.lo() == load_id.lo() && + load_info->rowset != nullptr) { LOG(WARNING) << "find transaction exists when add to engine." << "partition_id: " << key.first << ", transaction_id: " << key.second << ", tablet: " << tablet_info.to_string(); @@ -139,8 +181,9 @@ Status TxnManager::prepare_txn(TPartitionId partition_id, TTransactionId transac // not found load id // case 1: user start a new txn, rowset = null // case 2: loading txn from meta env - TabletTxnInfo load_info(load_id, nullptr, ingest); - txn_tablet_map[key][tablet_info] = load_info; + auto load_info = std::make_shared(load_id, nullptr, ingest); + load_info->prepare(); + txn_tablet_map[key][tablet_info] = std::move(load_info); _insert_txn_partition_map_unlocked(transaction_id, partition_id); VLOG_NOTICE << "add transaction to engine successfully."
<< "partition_id: " << key.first << ", transaction_id: " << key.second @@ -150,9 +193,11 @@ Status TxnManager::prepare_txn(TPartitionId partition_id, TTransactionId transac Status TxnManager::commit_txn(TPartitionId partition_id, const Tablet& tablet, TTransactionId transaction_id, const PUniqueId& load_id, - const RowsetSharedPtr& rowset_ptr, bool is_recovery) { + const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, + bool is_recovery) { return commit_txn(tablet.data_dir()->get_meta(), partition_id, transaction_id, - tablet.tablet_id(), tablet.tablet_uid(), load_id, rowset_ptr, is_recovery); + tablet.tablet_id(), tablet.tablet_uid(), load_id, rowset_ptr, + std::move(guard), is_recovery); } Status TxnManager::publish_txn(TPartitionId partition_id, const TabletSharedPtr& tablet, @@ -162,6 +207,29 @@ Status TxnManager::publish_txn(TPartitionId partition_id, const TabletSharedPtr& tablet->tablet_id(), tablet->tablet_uid(), version, stats); } +void TxnManager::abort_txn(TPartitionId partition_id, TTransactionId transaction_id, + TTabletId tablet_id, TabletUid tablet_uid) { + pair key(partition_id, transaction_id); + TabletInfo tablet_info(tablet_id, tablet_uid); + + std::shared_lock txn_rdlock(_get_txn_map_lock(transaction_id)); + + auto& txn_tablet_map = _get_txn_tablet_map(transaction_id); + auto it = txn_tablet_map.find(key); + if (it == txn_tablet_map.end()) { + return; + } + + auto& tablet_txn_info_map = it->second; + auto tablet_txn_info_iter = tablet_txn_info_map.find(tablet_info); + if (tablet_txn_info_iter == tablet_txn_info_map.end()) { + return; + } + + auto& txn_info = tablet_txn_info_iter->second; + txn_info->abort(); +} + // delete the txn from manager if it is not committed(not have a valid rowset) Status TxnManager::rollback_txn(TPartitionId partition_id, const Tablet& tablet, TTransactionId transaction_id) { @@ -200,23 +268,25 @@ void TxnManager::set_txn_related_delete_bitmap( << " may be cleared"; return; } - TabletTxnInfo& load_info = load_itr->second; - load_info.unique_key_merge_on_write = unique_key_merge_on_write; - load_info.delete_bitmap = delete_bitmap; - load_info.rowset_ids = rowset_ids; - load_info.partial_update_info = partial_update_info; + auto& load_info = load_itr->second; + load_info->unique_key_merge_on_write = unique_key_merge_on_write; + load_info->delete_bitmap = delete_bitmap; + load_info->rowset_ids = rowset_ids; + load_info->partial_update_info = partial_update_info; } } Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, TTransactionId transaction_id, TTabletId tablet_id, TabletUid tablet_uid, const PUniqueId& load_id, - const RowsetSharedPtr& rowset_ptr, bool is_recovery) { + const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, + bool is_recovery) { if (partition_id < 1 || transaction_id < 1 || tablet_id < 1) { LOG(FATAL) << "invalid commit req " << " partition_id=" << partition_id << " transaction_id=" << transaction_id << " tablet_id=" << tablet_id; } + pair key(partition_id, transaction_id); TabletInfo tablet_info(tablet_id, tablet_uid); if (rowset_ptr == nullptr) { @@ -250,30 +320,34 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, // found load for txn,tablet // case 1: user commit rowset, then the load id must be equal - TabletTxnInfo& load_info = load_itr->second; + auto& load_info = load_itr->second; // check if load id is equal - if (load_info.load_id.hi() == load_id.hi() && load_info.load_id.lo() == load_id.lo() && - load_info.rowset != nullptr && - 
load_info.rowset->rowset_id() == rowset_ptr->rowset_id()) { - // find a rowset with same rowset id, then it means a duplicate call + if (load_info->rowset == nullptr) { + break; + } + + if (load_info->load_id.hi() != load_id.hi() || load_info->load_id.lo() != load_id.lo()) { + break; + } + + // find a rowset with same rowset id, then it means a duplicate call + if (load_info->rowset->rowset_id() == rowset_ptr->rowset_id()) { LOG(INFO) << "find rowset exists when commit transaction to engine." << "partition_id: " << key.first << ", transaction_id: " << key.second << ", tablet: " << tablet_info.to_string() - << ", rowset_id: " << load_info.rowset->rowset_id(); + << ", rowset_id: " << load_info->rowset->rowset_id(); + // Should not remove this rowset from pending rowsets + load_info->pending_rs_guard = std::move(guard); return Status::OK(); - } else if (load_info.load_id.hi() == load_id.hi() && - load_info.load_id.lo() == load_id.lo() && load_info.rowset != nullptr && - load_info.rowset->rowset_id() != rowset_ptr->rowset_id()) { - // find a rowset with different rowset id, then it should not happen, just return errors - return Status::Error( - "find rowset exists when commit transaction to engine. but rowset ids are not " - "same. partition_id: {}, transaction_id: {}, tablet: {}, exist rowset_id: {}, " - "new rowset_id: {}", - key.first, key.second, tablet_info.to_string(), - load_info.rowset->rowset_id().to_string(), rowset_ptr->rowset_id().to_string()); - } else { - break; } + + // find a rowset with different rowset id, then it should not happen, just return errors + return Status::Error( + "find rowset exists when commit transaction to engine. but rowset ids are not " + "same. partition_id: {}, transaction_id: {}, tablet: {}, exist rowset_id: {}, new " + "rowset_id: {}", + key.first, key.second, tablet_info.to_string(), + load_info->rowset->rowset_id().to_string(), rowset_ptr->rowset_id().to_string()); } while (false); // if not in recovery mode, then should persist the meta to meta env @@ -292,17 +366,20 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, { std::lock_guard wrlock(_get_txn_map_lock(transaction_id)); - TabletTxnInfo load_info(load_id, rowset_ptr); + auto load_info = std::make_shared(load_id, rowset_ptr); + load_info->pending_rs_guard = std::move(guard); if (is_recovery) { TabletSharedPtr tablet = StorageEngine::instance()->tablet_manager()->get_tablet( tablet_info.tablet_id, tablet_info.tablet_uid); if (tablet != nullptr && tablet->enable_unique_key_merge_on_write()) { - load_info.unique_key_merge_on_write = true; - load_info.delete_bitmap.reset(new DeleteBitmap(tablet->tablet_id())); + load_info->unique_key_merge_on_write = true; + load_info->delete_bitmap.reset(new DeleteBitmap(tablet->tablet_id())); } } + load_info->commit(); + txn_tablet_map_t& txn_tablet_map = _get_txn_tablet_map(transaction_id); - txn_tablet_map[key][tablet_info] = load_info; + txn_tablet_map[key][tablet_info] = std::move(load_info); _insert_txn_partition_map_unlocked(transaction_id, partition_id); VLOG_NOTICE << "commit transaction to engine successfully." 
<< " partition_id: " << key.first << ", transaction_id: " << key.second @@ -326,8 +403,8 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id, pair key(partition_id, transaction_id); TabletInfo tablet_info(tablet_id, tablet_uid); - RowsetSharedPtr rowset = nullptr; - TabletTxnInfo tablet_txn_info; + RowsetSharedPtr rowset; + std::shared_ptr tablet_txn_info; int64_t t1 = MonotonicMicros(); /// Step 1: get rowset, tablet_txn_info by key { @@ -343,7 +420,7 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id, // found load for txn,tablet // case 1: user commit rowset, then the load id must be equal tablet_txn_info = txn_info_iter->second; - rowset = tablet_txn_info.rowset; + rowset = tablet_txn_info->rowset; } } } @@ -361,19 +438,19 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id, // it maybe a fatal error rowset->make_visible(version); // update delete_bitmap - if (tablet_txn_info.unique_key_merge_on_write) { + if (tablet_txn_info->unique_key_merge_on_write) { std::unique_ptr rowset_writer; RETURN_IF_ERROR(tablet->create_transient_rowset_writer( - rowset, &rowset_writer, tablet_txn_info.partial_update_info)); + rowset, &rowset_writer, tablet_txn_info->partial_update_info)); int64_t t2 = MonotonicMicros(); - RETURN_IF_ERROR(tablet->update_delete_bitmap(rowset, tablet_txn_info.rowset_ids, - tablet_txn_info.delete_bitmap, transaction_id, + RETURN_IF_ERROR(tablet->update_delete_bitmap(rowset, tablet_txn_info->rowset_ids, + tablet_txn_info->delete_bitmap, transaction_id, rowset_writer.get())); int64_t t3 = MonotonicMicros(); stats->calc_delete_bitmap_time_us = t3 - t2; - if (tablet_txn_info.partial_update_info && - tablet_txn_info.partial_update_info->is_partial_update) { + if (tablet_txn_info->partial_update_info && + tablet_txn_info->partial_update_info->is_partial_update) { // build rowset writer and merge transient rowset RETURN_IF_ERROR(rowset_writer->flush()); RowsetSharedPtr transient_rowset; @@ -386,7 +463,7 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id, stats->partial_update_write_segment_us = MonotonicMicros() - t3; int64_t t4 = MonotonicMicros(); RETURN_IF_ERROR(TabletMetaManager::save_delete_bitmap( - tablet->data_dir(), tablet->tablet_id(), tablet_txn_info.delete_bitmap, + tablet->data_dir(), tablet->tablet_id(), tablet_txn_info->delete_bitmap, version.second)); stats->save_meta_time_us = MonotonicMicros() - t4; } @@ -453,30 +530,36 @@ Status TxnManager::rollback_txn(TPartitionId partition_id, TTransactionId transa TTabletId tablet_id, TabletUid tablet_uid) { pair key(partition_id, transaction_id); TabletInfo tablet_info(tablet_id, tablet_uid); + std::lock_guard wrlock(_get_txn_map_lock(transaction_id)); txn_tablet_map_t& txn_tablet_map = _get_txn_tablet_map(transaction_id); + auto it = txn_tablet_map.find(key); - if (it != txn_tablet_map.end()) { - auto load_itr = it->second.find(tablet_info); - if (load_itr != it->second.end()) { - // found load for txn,tablet - // case 1: user commit rowset, then the load id must be equal - TabletTxnInfo& load_info = load_itr->second; - if (load_info.rowset != nullptr) { - return Status::Error( - "if rowset is not null, it means other thread may commit the rowset should " - "not delete txn any more"); - } - } - it->second.erase(tablet_info); - LOG(INFO) << "rollback transaction from engine successfully." 
- << " partition_id: " << key.first << ", transaction_id: " << key.second - << ", tablet: " << tablet_info.to_string(); - if (it->second.empty()) { - txn_tablet_map.erase(it); - _clear_txn_partition_map_unlocked(transaction_id, partition_id); + if (it == txn_tablet_map.end()) { + return Status::OK(); + } + + auto& tablet_txn_info_map = it->second; + if (auto load_itr = tablet_txn_info_map.find(tablet_info); + load_itr != tablet_txn_info_map.end()) { + // found load for txn,tablet + // case 1: user commit rowset, then the load id must be equal + const auto& load_info = load_itr->second; + if (load_info->rowset != nullptr) { + return Status::Error( + "rowset is not null; another thread may have committed it, so the txn must " + "not be deleted"); } } + + tablet_txn_info_map.erase(tablet_info); + LOG(INFO) << "rollback transaction from engine successfully." + << " partition_id: " << key.first << ", transaction_id: " << key.second + << ", tablet: " << tablet_info.to_string(); + if (tablet_txn_info_map.empty()) { + txn_tablet_map.erase(it); + _clear_txn_partition_map_unlocked(transaction_id, partition_id); + } return Status::OK(); } @@ -497,32 +580,29 @@ Status TxnManager::delete_txn(OlapMeta* meta, TPartitionId partition_id, if (load_itr != it->second.end()) { // found load for txn,tablet // case 1: user commit rowset, then the load id must be equal - TabletTxnInfo& load_info = load_itr->second; - if (load_info.rowset != nullptr && meta != nullptr) { - if (load_info.rowset->version().first > 0) { + auto& load_info = load_itr->second; + auto& rowset = load_info->rowset; + if (rowset != nullptr && meta != nullptr) { + if (rowset->version().first > 0) { return Status::Error( "could not delete transaction from engine, just remove it from memory not " "delete from disk, because related rowset already published. partition_id: " "{}, transaction_id: {}, tablet: {}, rowset id: {}, version:{}", key.first, key.second, tablet_info.to_string(), - load_info.rowset->rowset_id().to_string(), - load_info.rowset->version().to_string()); + rowset->rowset_id().to_string(), rowset->version().to_string()); } else { - static_cast( - RowsetMetaManager::remove(meta, tablet_uid, load_info.rowset->rowset_id())); + static_cast(RowsetMetaManager::remove(meta, tablet_uid, rowset->rowset_id())); #ifndef BE_TEST - StorageEngine::instance()->add_unused_rowset(load_info.rowset); + StorageEngine::instance()->add_unused_rowset(rowset); #endif VLOG_NOTICE << "delete transaction from engine successfully." << " partition_id: " << key.first << ", transaction_id: " << key.second << ", tablet: " << tablet_info.to_string() << ", rowset: " - << (load_info.rowset != nullptr ?
rowset->rowset_id().to_string() : "0"); } } } - it->second.erase(tablet_info); + it->second.erase(load_itr); if (it->second.empty()) { txn_tablet_map.erase(it); _clear_txn_partition_map_unlocked(transaction_id, partition_id); @@ -566,22 +646,21 @@ void TxnManager::force_rollback_tablet_related_txns(OlapMeta* meta, TTabletId ta for (auto it = txn_tablet_map.begin(); it != txn_tablet_map.end();) { auto load_itr = it->second.find(tablet_info); if (load_itr != it->second.end()) { - TabletTxnInfo& load_info = load_itr->second; - if (load_info.rowset != nullptr && meta != nullptr) { + auto& load_info = load_itr->second; + auto& rowset = load_info->rowset; + if (rowset != nullptr && meta != nullptr) { LOG(INFO) << " delete transaction from engine " << ", tablet: " << tablet_info.to_string() - << ", rowset id: " << load_info.rowset->rowset_id(); - static_cast(RowsetMetaManager::remove(meta, tablet_uid, - load_info.rowset->rowset_id())); + << ", rowset id: " << rowset->rowset_id(); + static_cast( + RowsetMetaManager::remove(meta, tablet_uid, rowset->rowset_id())); } LOG(INFO) << "remove tablet related txn." << " partition_id: " << it->first.first << ", transaction_id: " << it->first.second << ", tablet: " << tablet_info.to_string() << ", rowset: " - << (load_info.rowset != nullptr - ? load_info.rowset->rowset_id().to_string() - : "0"); - it->second.erase(tablet_info); + << (rowset != nullptr ? rowset->rowset_id().to_string() : "0"); + it->second.erase(load_itr); } if (it->second.empty()) { _clear_txn_partition_map_unlocked(it->first.second, it->first.first); @@ -606,14 +685,14 @@ void TxnManager::get_txn_related_tablets(const TTransactionId transaction_id, << " partition_id=" << partition_id << ", transaction_id=" << transaction_id; return; } - std::map& load_info_map = it->second; + auto& load_info_map = it->second; // each tablet for (auto& load_info : load_info_map) { const TabletInfo& tablet_info = load_info.first; // must not check rowset == null here, because if rowset == null // publish version should failed - tablet_infos->emplace(tablet_info, load_info.second.rowset); + tablet_infos->emplace(tablet_info, load_info.second->rowset); } } @@ -632,52 +711,45 @@ void TxnManager::get_all_commit_tablet_txn_info_by_tablet( const TabletSharedPtr& tablet, CommitTabletTxnInfoVec* commit_tablet_txn_info_vec) { for (int32_t i = 0; i < _txn_map_shard_size; i++) { std::shared_lock txn_rdlock(_txn_map_locks[i]); - for (const auto& it : _txn_tablet_maps[i]) { - auto tablet_load_it = it.second.find(tablet->get_tablet_info()); - if (tablet_load_it != it.second.end()) { - TPartitionId partition_id = it.first.first; - TTransactionId transaction_id = it.first.second; - const RowsetSharedPtr& rowset = tablet_load_it->second.rowset; - const DeleteBitmapPtr& delete_bitmap = tablet_load_it->second.delete_bitmap; - const RowsetIdUnorderedSet& rowset_ids = tablet_load_it->second.rowset_ids; + for (const auto& [txn_key, load_info_map] : _txn_tablet_maps[i]) { + auto tablet_load_it = load_info_map.find(tablet->get_tablet_info()); + if (tablet_load_it != load_info_map.end()) { + const auto& [_, load_info] = *tablet_load_it; + const auto& rowset = load_info->rowset; + const auto& delete_bitmap = load_info->delete_bitmap; if (!rowset || !delete_bitmap) { continue; } - commit_tablet_txn_info_vec->push_back(CommitTabletTxnInfo( - partition_id, transaction_id, delete_bitmap, rowset_ids)); + commit_tablet_txn_info_vec->push_back({ + .transaction_id = txn_key.second, + .partition_id = txn_key.first, + .delete_bitmap = 
delete_bitmap, + .rowset_ids = load_info->rowset_ids, + .partial_update_info = load_info->partial_update_info, + }); } } } } -bool TxnManager::has_txn(TPartitionId partition_id, TTransactionId transaction_id, - TTabletId tablet_id, TabletUid tablet_uid) { - pair key(partition_id, transaction_id); - TabletInfo tablet_info(tablet_id, tablet_uid); - std::shared_lock txn_rdlock(_get_txn_map_lock(transaction_id)); - txn_tablet_map_t& txn_tablet_map = _get_txn_tablet_map(transaction_id); - auto it = txn_tablet_map.find(key); - bool found = it != txn_tablet_map.end() && it->second.find(tablet_info) != it->second.end(); - - return found; -} - void TxnManager::build_expire_txn_map(std::map>* expire_txn_map) { int64_t now = UnixSeconds(); // traverse the txn map, and get all expired txns for (int32_t i = 0; i < _txn_map_shard_size; i++) { std::shared_lock txn_rdlock(_txn_map_locks[i]); - for (auto& it : _txn_tablet_maps[i]) { - auto txn_id = it.first.second; - for (auto& t_map : it.second) { - double diff = difftime(now, t_map.second.creation_time); - if (diff >= config::pending_data_expire_time_sec) { - (*expire_txn_map)[t_map.first].push_back(txn_id); - if (VLOG_IS_ON(3)) { - VLOG_NOTICE << "find expired txn." - << " tablet=" << t_map.first.to_string() - << " transaction_id=" << txn_id << " exist_sec=" << diff; - } + for (auto&& [txn_key, tablet_txn_infos] : _txn_tablet_maps[i]) { + auto txn_id = txn_key.second; + for (auto&& [tablet_info, txn_info] : tablet_txn_infos) { + double diff = difftime(now, txn_info->creation_time); + if (diff < config::pending_data_expire_time_sec) { + continue; + } + + (*expire_txn_map)[tablet_info].push_back(txn_id); + if (VLOG_IS_ON(3)) { + VLOG_NOTICE << "find expired txn." + << " tablet=" << tablet_info.to_string() + << " transaction_id=" << txn_id << " exist_sec=" << diff; } } } @@ -796,4 +868,27 @@ void TxnManager::update_tablet_version_txn(int64_t tablet_id, int64_t version, i _tablet_version_cache->release(handle); } +TxnState TxnManager::get_txn_state(TPartitionId partition_id, TTransactionId transaction_id, + TTabletId tablet_id, TabletUid tablet_uid) { + pair key(partition_id, transaction_id); + TabletInfo tablet_info(tablet_id, tablet_uid); + + std::shared_lock txn_rdlock(_get_txn_map_lock(transaction_id)); + + auto& txn_tablet_map = _get_txn_tablet_map(transaction_id); + auto it = txn_tablet_map.find(key); + if (it == txn_tablet_map.end()) { + return TxnState::NOT_FOUND; + } + + auto& tablet_txn_info_map = it->second; + auto tablet_txn_info_iter = tablet_txn_info_map.find(tablet_info); + if (tablet_txn_info_iter == tablet_txn_info_map.end()) { + return TxnState::NOT_FOUND; + } + + const auto& txn_info = tablet_txn_info_iter->second; + return txn_info->state; +} + } // namespace doris diff --git a/be/src/olap/txn_manager.h b/be/src/olap/txn_manager.h index 6d2222a3ec57ba..d994596ecbe167 100644 --- a/be/src/olap/txn_manager.h +++ b/be/src/olap/txn_manager.h @@ -37,6 +37,7 @@ #include "common/status.h" #include "olap/olap_common.h" #include "olap/partial_update_info.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/segment_v2/segment.h" @@ -51,46 +52,23 @@ class DeltaWriter; class OlapMeta; struct TabletPublishStatistics; -struct TabletTxnInfo { - PUniqueId load_id; - RowsetSharedPtr rowset; - bool unique_key_merge_on_write {false}; - DeleteBitmapPtr delete_bitmap; - // records rowsets calc in commit txn - RowsetIdUnorderedSet rowset_ids; - int64_t creation_time; - 
bool ingest {false}; - std::shared_ptr partial_update_info; - - TabletTxnInfo(PUniqueId load_id, RowsetSharedPtr rowset) - : load_id(load_id), rowset(rowset), creation_time(UnixSeconds()) {} - - TabletTxnInfo(PUniqueId load_id, RowsetSharedPtr rowset, bool ingest_arg) - : load_id(load_id), rowset(rowset), creation_time(UnixSeconds()), ingest(ingest_arg) {} - - TabletTxnInfo(PUniqueId load_id, RowsetSharedPtr rowset, bool merge_on_write, - DeleteBitmapPtr delete_bitmap, const RowsetIdUnorderedSet& ids) - : load_id(load_id), - rowset(rowset), - unique_key_merge_on_write(merge_on_write), - delete_bitmap(delete_bitmap), - rowset_ids(ids), - creation_time(UnixSeconds()) {} - - TabletTxnInfo() {} +enum class TxnState { + NOT_FOUND = 0, + PREPARED = 1, + COMMITTED = 2, + ROLLEDBACK = 3, + ABORTED = 4, + DELETED = 5, }; +struct TabletTxnInfo; + struct CommitTabletTxnInfo { - CommitTabletTxnInfo(TPartitionId partition_id, TTransactionId transaction_id, - DeleteBitmapPtr delete_bitmap, RowsetIdUnorderedSet rowset_ids) - : transaction_id(transaction_id), - partition_id(partition_id), - delete_bitmap(delete_bitmap), - rowset_ids(rowset_ids) {} - TTransactionId transaction_id; - TPartitionId partition_id; + TTransactionId transaction_id {0}; + TPartitionId partition_id {0}; DeleteBitmapPtr delete_bitmap; RowsetIdUnorderedSet rowset_ids; + std::shared_ptr partial_update_info; }; using CommitTabletTxnInfoVec = std::vector; @@ -122,7 +100,8 @@ class TxnManager { Status commit_txn(TPartitionId partition_id, const Tablet& tablet, TTransactionId transaction_id, const PUniqueId& load_id, - const RowsetSharedPtr& rowset_ptr, bool is_recovery); + const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, + bool is_recovery); Status publish_txn(TPartitionId partition_id, const TabletSharedPtr& tablet, TTransactionId transaction_id, const Version& version, @@ -137,7 +116,8 @@ class TxnManager { Status commit_txn(OlapMeta* meta, TPartitionId partition_id, TTransactionId transaction_id, TTabletId tablet_id, TabletUid tablet_uid, const PUniqueId& load_id, - const RowsetSharedPtr& rowset_ptr, bool is_recovery); + const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, + bool is_recovery); // remove a txn from txn manager // not persist rowset meta because @@ -145,6 +125,10 @@ class TxnManager { TTabletId tablet_id, TabletUid tablet_uid, const Version& version, TabletPublishStatistics* stats); + // Only aborts a txn that has not yet been committed. + void abort_txn(TPartitionId partition_id, TTransactionId transaction_id, TTabletId tablet_id, + TabletUid tablet_uid); + // delete the txn from manager if it is not committed(not have a valid rowset) Status rollback_txn(TPartitionId partition_id, TTransactionId transaction_id, TTabletId tablet_id, TabletUid tablet_uid); @@ -163,10 +147,6 @@ class TxnManager { void get_all_related_tablets(std::set* tablet_infos); - // Just check if the txn exists. - bool has_txn(TPartitionId partition_id, TTransactionId transaction_id, TTabletId tablet_id, - TabletUid tablet_uid); - // Get all expired txns and save them in expire_txn_map. // This is currently called before reporting all tablet info, to avoid iterating txn map for every tablet.
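+ // An illustrative caller sketch (names are assumptions, not part of this change):
+ //
+ //   std::map<TabletInfo, std::vector<int64_t>> expire_txn_map;
+ //   txn_manager->build_expire_txn_map(&expire_txn_map);
+ //   for (auto& [tablet_info, txn_ids] : expire_txn_map) {
+ //       // attach the expired txn ids to this tablet's report
+ //   }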
void build_expire_txn_map(std::map>* expire_txn_map); @@ -195,6 +175,9 @@ class TxnManager { int64_t get_txn_by_tablet_version(int64_t tablet_id, int64_t version); void update_tablet_version_txn(int64_t tablet_id, int64_t version, int64_t txn_id); + TxnState get_txn_state(TPartitionId partition_id, TTransactionId transaction_id, + TTabletId tablet_id, TabletUid tablet_uid); + private: using TxnKey = std::pair; // partition_id, transaction_id; @@ -214,8 +197,9 @@ class TxnManager { } }; - using txn_tablet_map_t = std::unordered_map, - TxnKeyHash, TxnKeyEqual>; + using txn_tablet_map_t = + std::unordered_map>, + TxnKeyHash, TxnKeyEqual>; using txn_partition_map_t = std::unordered_map>; using txn_tablet_delta_writer_map_t = std::unordered_map>; diff --git a/be/src/olap/types.cpp b/be/src/olap/types.cpp index 5ed7984bcc6b7e..b9a072cc5dfab4 100644 --- a/be/src/olap/types.cpp +++ b/be/src/olap/types.cpp @@ -21,7 +21,6 @@ #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "olap/tablet_schema.h" diff --git a/be/src/olap/types.h b/be/src/olap/types.h index 40c68a3e369e3c..a0c38975f72978 100644 --- a/be/src/olap/types.h +++ b/be/src/olap/types.h @@ -693,7 +693,7 @@ struct CppTypeTraits { }; template <> struct CppTypeTraits { - using CppType = Int256; + using CppType = wide::Int256; using UnsignedCppType = wide::UInt256; }; template <> @@ -1210,17 +1210,12 @@ struct FieldTypeTraits return Status::Error( "FieldTypeTraits::from_string meet PARSE_FAILURE"); } - *reinterpret_cast(buf) = value; + *reinterpret_cast(buf) = value; return Status::OK(); } static std::string to_string(const void* src) { - // TODO: support decimal256 - DCHECK(false); - return ""; - // auto value = reinterpret_cast(src); - // fmt::memory_buffer buffer; - // fmt::format_to(buffer, "{}", *value); - // return std::string(buffer.data(), buffer.size()); + const auto* value = reinterpret_cast(src); + return wide::to_string(*value); } }; diff --git a/be/src/olap/wal_manager.cpp b/be/src/olap/wal_manager.cpp index cf84c4a9661f2e..1eb91d85e0a4b7 100644 --- a/be/src/olap/wal_manager.cpp +++ b/be/src/olap/wal_manager.cpp @@ -19,10 +19,15 @@ #include +#include #include +#include #include +#include +#include #include "io/fs/local_file_system.h" +#include "olap/wal_writer.h" #include "runtime/client_cache.h" #include "runtime/fragment_mgr.h" #include "runtime/plan_fragment_executor.h" @@ -35,11 +40,13 @@ namespace doris { WalManager::WalManager(ExecEnv* exec_env, const std::string& wal_dir_list) : _exec_env(exec_env), _stop_background_threads_latch(1) { doris::vectorized::WalReader::string_split(wal_dir_list, ",", _wal_dirs); + _all_wal_disk_bytes = std::make_shared(0); } WalManager::~WalManager() { LOG(INFO) << "WalManager is destroyed"; } + void WalManager::stop() { _stop = true; _stop_background_threads_latch.count_down(); @@ -117,8 +124,12 @@ Status WalManager::create_wal_writer(int64_t wal_id, std::shared_ptr& RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(base_path)); } LOG(INFO) << "create wal " << wal_path; - wal_writer = std::make_shared(wal_path); + wal_writer = std::make_shared(wal_path, _all_wal_disk_bytes); RETURN_IF_ERROR(wal_writer->init()); + { + std::lock_guard wrlock(_wal_lock); + _wal_id_to_writer_map.emplace(wal_id, wal_writer); + } return Status::OK(); } @@ -241,6 +252,22 @@ size_t WalManager::get_wal_table_size(const std::string& table_id) { Status WalManager::delete_wal(int64_t wal_id) { { std::lock_guard wrlock(_wal_lock); + if
(_wal_id_to_writer_map.find(wal_id) != _wal_id_to_writer_map.end()) { + _all_wal_disk_bytes->fetch_sub(_wal_id_to_writer_map[wal_id]->disk_bytes(), + std::memory_order_relaxed); + _wal_id_to_writer_map[wal_id]->cv.notify_one(); + std::string wal_path = _wal_path_map[wal_id]; + LOG(INFO) << "wal delete file=" << wal_path << ", this file disk usage is " + << _wal_id_to_writer_map[wal_id]->disk_bytes() + << ", after deleting it, all wals disk usage is " + << _all_wal_disk_bytes->load(std::memory_order_relaxed); + _wal_id_to_writer_map.erase(wal_id); + } + if (_wal_id_to_writer_map.empty()) { + CHECK_EQ(_all_wal_disk_bytes->load(std::memory_order_relaxed), 0); + } std::string wal_path = _wal_path_map[wal_id]; RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(wal_path)); LOG(INFO) << "delete file=" << wal_path; diff --git a/be/src/olap/wal_manager.h b/be/src/olap/wal_manager.h index f5b49f6ddaf2c2..d8fa4a705278d2 100644 --- a/be/src/olap/wal_manager.h +++ b/be/src/olap/wal_manager.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#include + #include "common/config.h" #include "gen_cpp/FrontendService.h" #include "gen_cpp/FrontendService_types.h" @@ -56,6 +58,8 @@ class WalManager { std::vector _wal_dirs; std::shared_mutex _wal_lock; std::unordered_map _wal_path_map; + std::unordered_map> _wal_id_to_writer_map; + std::shared_ptr _all_wal_disk_bytes; bool _stop = false; }; } // namespace doris \ No newline at end of file diff --git a/be/src/olap/wal_writer.cpp b/be/src/olap/wal_writer.cpp index 7cd427453bb361..b433c7aaaa2f13 100644 --- a/be/src/olap/wal_writer.cpp +++ b/be/src/olap/wal_writer.cpp @@ -17,6 +17,9 @@ #include "olap/wal_writer.h" +#include + +#include "common/config.h" #include "io/fs/file_writer.h" #include "io/fs/local_file_system.h" #include "io/fs/path.h" @@ -25,7 +28,12 @@ namespace doris { -WalWriter::WalWriter(const std::string& file_name) : _file_name(file_name) {} +WalWriter::WalWriter(const std::string& file_name, + const std::shared_ptr& all_wal_disk_bytes) + : _file_name(file_name), + _count(0), + _disk_bytes(0), + _all_wal_disk_bytes(all_wal_disk_bytes) {} WalWriter::~WalWriter() {} @@ -44,6 +52,12 @@ Status WalWriter::finalize() { } Status WalWriter::append_blocks(const PBlockArray& blocks) { + { + std::unique_lock l(_mutex); + while (_all_wal_disk_bytes->load(std::memory_order_relaxed) > config::wal_max_disk_size) { + cv.wait_for(l, std::chrono::milliseconds(WalWriter::MAX_WAL_WRITE_WAIT_TIME)); + } + } size_t total_size = 0; for (const auto& block : blocks) { total_size += LENGTH_SIZE + block->ByteSizeLong() + CHECKSUM_SIZE; @@ -62,6 +76,11 @@ Status WalWriter::append_blocks(const PBlockArray& blocks) { offset += CHECKSUM_SIZE; } DCHECK(offset == total_size); + _disk_bytes.fetch_add(total_size, std::memory_order_relaxed); + _all_wal_disk_bytes->fetch_add(total_size, std::memory_order_relaxed); // write rows RETURN_IF_ERROR(_file_writer->append({row_binary, offset})); _count++; diff --git a/be/src/olap/wal_writer.h b/be/src/olap/wal_writer.h index 12fd84f258fb7d..f11dbc20eb05cd 100644 --- a/be/src/olap/wal_writer.h +++ b/be/src/olap/wal_writer.h @@ -17,9 +17,13 @@ #pragma once +#include +#include + #include "common/status.h" #include "gen_cpp/internal_service.pb.h" #include "io/fs/file_reader_writer_fwd.h" +#include "util/lock.h" namespace doris
{ @@ -27,23 +31,30 @@ using PBlockArray = std::vector; class WalWriter { public: - explicit WalWriter(const std::string& file_name); + explicit WalWriter(const std::string& file_name, + const std::shared_ptr& all_wal_disk_bytes); ~WalWriter(); Status init(); Status finalize(); Status append_blocks(const PBlockArray& blocks); + size_t disk_bytes() const { return _disk_bytes.load(std::memory_order_relaxed); }; std::string file_name() { return _file_name; }; static const int64_t LENGTH_SIZE = 8; static const int64_t CHECKSUM_SIZE = 4; + doris::ConditionVariable cv; private: + static constexpr size_t MAX_WAL_WRITE_WAIT_TIME = 1000; std::string _file_name; io::FileWriterPtr _file_writer; int64_t _count; int64_t _batch; + std::atomic_size_t _disk_bytes; + std::shared_ptr _all_wal_disk_bytes; + doris::Mutex _mutex; }; } // namespace doris \ No newline at end of file diff --git a/be/src/pch/pch.h b/be/src/pch/pch.h index eb404a78c8c2e4..34428d739d02fc 100644 --- a/be/src/pch/pch.h +++ b/be/src/pch/pch.h @@ -268,39 +268,6 @@ #include #include -// opentelemetry headers -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - // parallel_hashmap headers #include #include diff --git a/be/src/pipeline/exec/aggregation_sink_operator.cpp b/be/src/pipeline/exec/aggregation_sink_operator.cpp index 93b2e2d968c375..9eb49d234f7733 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/aggregation_sink_operator.cpp @@ -67,7 +67,7 @@ template Status AggSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); - SCOPED_TIMER(Base::profile()->total_time_counter()); + SCOPED_TIMER(Base::exec_time_counter()); SCOPED_TIMER(Base::_open_timer); _agg_data = Base::_shared_state->agg_data.get(); _agg_arena_pool = Base::_shared_state->agg_arena_pool.get(); @@ -160,7 +160,7 @@ Status AggSinkLocalState::init(RuntimeState* state, template Status AggSinkLocalState::open(RuntimeState* state) { - SCOPED_TIMER(Base::profile()->total_time_counter()); + SCOPED_TIMER(Base::exec_time_counter()); SCOPED_TIMER(Base::_open_timer); RETURN_IF_ERROR(Base::open(state)); _agg_data = Base::_shared_state->agg_data.get(); @@ -784,7 +784,7 @@ Status AggSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block* in_block, SourceState source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); local_state._shared_state->input_num_rows += in_block->rows(); if (in_block->rows() > 0) { @@ -804,7 +804,7 @@ Status AggSinkOperatorX::sink(doris::RuntimeState* state, template Status AggSinkLocalState::close(RuntimeState* state, Status exec_status) { - SCOPED_TIMER(Base::profile()->total_time_counter()); + SCOPED_TIMER(Base::exec_time_counter()); SCOPED_TIMER(Base::_close_timer); if (Base::_closed) { return Status::OK(); diff --git a/be/src/pipeline/exec/aggregation_source_operator.cpp b/be/src/pipeline/exec/aggregation_source_operator.cpp index 482a263887b5d3..bb290cfbd2946a 100644 --- a/be/src/pipeline/exec/aggregation_source_operator.cpp +++ b/be/src/pipeline/exec/aggregation_source_operator.cpp @@ -40,7 +40,7 @@ 
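
Stepping back to the WAL changes above: `append_blocks()` now parks the writer while the global WAL footprint exceeds `config::wal_max_disk_size`, and `WalManager::delete_wal()` subtracts the freed bytes and signals the writer's condition variable. A compact sketch of that back-pressure loop, assuming plain `std::mutex`/`std::condition_variable` and a stand-in cap instead of Doris's `util/lock.h` wrappers and config knob:

```cpp
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <memory>
#include <mutex>

constexpr std::size_t kWalMaxDiskSize = 64UL << 30;            // stand-in cap
constexpr auto kMaxWaitTime = std::chrono::milliseconds(1000); // MAX_WAL_WRITE_WAIT_TIME

class ThrottledWalWriter {
public:
    explicit ThrottledWalWriter(std::shared_ptr<std::atomic<std::size_t>> all_bytes)
            : _all_bytes(std::move(all_bytes)) {}

    void append(std::size_t total_size) {
        {
            std::unique_lock<std::mutex> l(_mutex);
            // Re-check the shared counter on every wakeup; the timed wait
            // bounds the stall even if a notify is missed.
            while (_all_bytes->load(std::memory_order_relaxed) > kWalMaxDiskSize) {
                _cv.wait_for(l, kMaxWaitTime);
            }
        }
        // Plain fetch_add is sufficient; compare the store(fetch_add())
        // pattern corrected in wal_writer.cpp above.
        _disk_bytes.fetch_add(total_size, std::memory_order_relaxed);
        _all_bytes->fetch_add(total_size, std::memory_order_relaxed);
    }

    void on_deleted() {
        _all_bytes->fetch_sub(_disk_bytes.load(std::memory_order_relaxed),
                              std::memory_order_relaxed);
        _cv.notify_one(); // wake one parked writer
    }

private:
    std::mutex _mutex;
    std::condition_variable _cv;
    std::atomic<std::size_t> _disk_bytes{0};
    std::shared_ptr<std::atomic<std::size_t>> _all_bytes;
};
```
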
AggLocalState::AggLocalState(RuntimeState* state, OperatorXBase* parent) Status AggLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); _agg_data = _shared_state->agg_data.get(); _get_results_timer = ADD_TIMER(profile(), "GetResultsTime"); @@ -509,7 +509,7 @@ AggSourceOperatorX::AggSourceOperatorX(ObjectPool* pool, const TPlanNode& tnode, Status AggSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, SourceState& source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); RETURN_IF_ERROR(local_state._executor.get_result(state, block, source_state)); local_state.make_nullable_output_key(block); // dispose the having clause, should not be execute in prestreaming agg @@ -528,7 +528,7 @@ void AggLocalState::make_nullable_output_key(vectorized::Block* block) { } Status AggLocalState::close(RuntimeState* state) { - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_close_timer); if (_closed) { return Status::OK(); diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index 7109fa9deabdb5..23a4838dffb7aa 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -28,7 +28,7 @@ OPERATOR_CODE_GENERATOR(AnalyticSinkOperator, StreamingOperator) Status AnalyticSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); auto& p = _parent->cast(); _shared_state->partition_by_column_idxs.resize(p._partition_by_eq_expr_ctxs.size()); @@ -134,7 +134,7 @@ Status AnalyticSinkOperatorX::open(RuntimeState* state) { Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block* input_block, SourceState source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)input_block->rows()); local_state._shared_state->input_eos = source_state == SourceState::FINISHED; if (local_state._shared_state->input_eos && input_block->rows() == 0) { diff --git a/be/src/pipeline/exec/analytic_source_operator.cpp b/be/src/pipeline/exec/analytic_source_operator.cpp index 7c310db123737f..c705d849a3a660 100644 --- a/be/src/pipeline/exec/analytic_source_operator.cpp +++ b/be/src/pipeline/exec/analytic_source_operator.cpp @@ -38,7 +38,7 @@ AnalyticLocalState::AnalyticLocalState(RuntimeState* state, OperatorXBase* paren Status AnalyticLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(PipelineXLocalState::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); _agg_arena_pool = std::make_unique(); @@ -379,7 +379,7 @@ Status AnalyticSourceOperatorX::init(const TPlanNode& tnode, RuntimeState* state Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, SourceState& source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + 
SCOPED_TIMER(local_state.exec_time_counter()); if (local_state._shared_state->input_eos && (local_state._output_block_index == local_state._shared_state->input_blocks.size() || local_state._shared_state->input_total_rows == 0)) { @@ -415,7 +415,7 @@ Status AnalyticSourceOperatorX::get_block(RuntimeState* state, vectorized::Block } Status AnalyticLocalState::close(RuntimeState* state) { - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_close_timer); if (_closed) { return Status::OK(); diff --git a/be/src/pipeline/exec/assert_num_rows_operator.cpp b/be/src/pipeline/exec/assert_num_rows_operator.cpp index f58f2568cfca2e..baddce5a94d538 100644 --- a/be/src/pipeline/exec/assert_num_rows_operator.cpp +++ b/be/src/pipeline/exec/assert_num_rows_operator.cpp @@ -38,7 +38,7 @@ AssertNumRowsOperatorX::AssertNumRowsOperatorX(ObjectPool* pool, const TPlanNode Status AssertNumRowsOperatorX::pull(doris::RuntimeState* state, vectorized::Block* block, SourceState& source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); local_state.add_num_rows_returned(block->rows()); int64_t num_rows_returned = local_state.num_rows_returned(); bool assert_res = false; diff --git a/be/src/pipeline/exec/distinct_streaming_aggregation_sink_operator.cpp b/be/src/pipeline/exec/distinct_streaming_aggregation_sink_operator.cpp index ef2551124a39a6..8a240d6336141c 100644 --- a/be/src/pipeline/exec/distinct_streaming_aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/distinct_streaming_aggregation_sink_operator.cpp @@ -194,7 +194,7 @@ Status DistinctStreamingAggSinkOperatorX::init(const TPlanNode& tnode, RuntimeSt Status DistinctStreamingAggSinkOperatorX::sink(RuntimeState* state, vectorized::Block* in_block, SourceState source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); local_state._shared_state->input_num_rows += in_block->rows(); Status ret = Status::OK(); @@ -235,7 +235,7 @@ Status DistinctStreamingAggSinkLocalState::close(RuntimeState* state, Status exe if (_closed) { return Status::OK(); } - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_close_timer); if (_shared_state->data_queue && !_shared_state->data_queue->is_finish()) { // finish should be set, if not set here means error. 
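
A change repeated across all of the operator files in this patch swaps `SCOPED_TIMER(profile()->total_time_counter())` for `SCOPED_TIMER(exec_time_counter())`, so each pipeline local state charges its init/open/sink/close work to its own execution-time counter rather than the profile-wide total. For readers unfamiliar with the macro, the idiom is an RAII scope guard; the following is a simplified illustration, not Doris's actual `RuntimeProfile` code:

```cpp
#include <atomic>
#include <chrono>

// Stand-in for RuntimeProfile::Counter.
struct Counter {
    std::atomic<long long> value_ns{0};
};

// On destruction, adds the elapsed wall time of the enclosing scope to the
// counter, which is what SCOPED_TIMER(exec_time_counter()) arranges.
class ScopedTimer {
public:
    explicit ScopedTimer(Counter* c)
            : _counter(c), _start(std::chrono::steady_clock::now()) {}
    ~ScopedTimer() {
        auto elapsed = std::chrono::steady_clock::now() - _start;
        _counter->value_ns.fetch_add(
                std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count(),
                std::memory_order_relaxed);
    }

private:
    Counter* _counter;
    std::chrono::steady_clock::time_point _start;
};

// Usage sketch: time everything from construction to end of scope.
// Counter exec_time;
// { ScopedTimer t(&exec_time); /* do work */ }
```
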
diff --git a/be/src/pipeline/exec/distinct_streaming_aggregation_source_operator.cpp b/be/src/pipeline/exec/distinct_streaming_aggregation_source_operator.cpp index 43456f10e3e6ed..ffdbab0127d9c9 100644 --- a/be/src/pipeline/exec/distinct_streaming_aggregation_source_operator.cpp +++ b/be/src/pipeline/exec/distinct_streaming_aggregation_source_operator.cpp @@ -107,7 +107,7 @@ DistinctStreamingAggSourceOperatorX::DistinctStreamingAggSourceOperatorX(ObjectP Status DistinctStreamingAggSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, SourceState& source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); std::unique_ptr agg_block; RETURN_IF_ERROR(local_state._shared_state->data_queue->get_block_from_queue(&agg_block)); if (agg_block != nullptr) { diff --git a/be/src/pipeline/exec/exchange_sink_buffer.cpp b/be/src/pipeline/exec/exchange_sink_buffer.cpp index 577eb4a46224fe..46364528458506 100644 --- a/be/src/pipeline/exec/exchange_sink_buffer.cpp +++ b/be/src/pipeline/exec/exchange_sink_buffer.cpp @@ -62,7 +62,8 @@ namespace pipeline { template ExchangeSinkBuffer::ExchangeSinkBuffer(PUniqueId query_id, PlanNodeId dest_node_id, int send_id, int be_number, QueryContext* context) - : _is_finishing(false), + : _queue_capacity(0), + _is_finishing(false), _query_id(query_id), _dest_node_id(dest_node_id), _sender_id(send_id), @@ -106,7 +107,7 @@ bool ExchangeSinkBuffer::is_pending_finish() { if (need_cancel && _instance_to_rpc_ctx.find(id) != _instance_to_rpc_ctx.end()) { auto& rpc_ctx = _instance_to_rpc_ctx[id]; if (!rpc_ctx.is_cancelled) { - brpc::StartCancel(rpc_ctx._closure->cntl.call_id()); + brpc::StartCancel(rpc_ctx._send_callback->cntl_->call_id()); rpc_ctx.is_cancelled = true; } } @@ -177,20 +178,17 @@ Status ExchangeSinkBuffer::add_block(TransmitInfo&& request) { } template -Status ExchangeSinkBuffer::add_block(BroadcastTransmitInfo&& request, - [[maybe_unused]] bool* sent) { +Status ExchangeSinkBuffer::add_block(BroadcastTransmitInfo&& request) { if (_is_finishing) { + request.block_holder->unref(); return Status::OK(); } TUniqueId ins_id = request.channel->_fragment_instance_id; if (_is_receiver_eof(ins_id.lo)) { + request.block_holder->unref(); return Status::EndOfFile("receiver eof"); } bool send_now = false; - if (sent) { - *sent = true; - } - request.block_holder->ref(); { std::unique_lock lock(*_instance_to_package_queue_mutex[ins_id.lo]); // Do not have in process rpc, directly send @@ -247,21 +245,21 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { if (!request.exec_status.ok()) { request.exec_status.to_protobuf(brpc_request->mutable_exec_status()); } - auto* closure = request.channel->get_closure(id, request.eos, nullptr); + auto send_callback = request.channel->get_send_callback(id, request.eos, nullptr); - _instance_to_rpc_ctx[id]._closure = closure; + _instance_to_rpc_ctx[id]._send_callback = send_callback; _instance_to_rpc_ctx[id].is_cancelled = false; - closure->cntl.set_timeout_ms(request.channel->_brpc_timeout_ms); + send_callback->cntl_->set_timeout_ms(request.channel->_brpc_timeout_ms); if (config::exchange_sink_ignore_eovercrowded) { - closure->cntl.ignore_eovercrowded(); + send_callback->cntl_->ignore_eovercrowded(); } - closure->addFailedHandler( + send_callback->addFailedHandler( [&](const InstanceLoId& id, const std::string& err) { _failed(id, err); }); - closure->start_rpc_time = GetCurrentTimeNanos(); - 
closure->addSuccessHandler([&](const InstanceLoId& id, const bool& eos, - const PTransmitDataResult& result, - const int64_t& start_rpc_time) { + send_callback->start_rpc_time = GetCurrentTimeNanos(); + send_callback->addSuccessHandler([&](const InstanceLoId& id, const bool& eos, + const PTransmitDataResult& result, + const int64_t& start_rpc_time) { set_rpc_time(id, start_rpc_time, result.receive_time()); Status s(Status::create(result.status())); if (s.is()) { @@ -277,11 +275,17 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { }); { SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->orphan_mem_tracker()); + auto send_remote_block_closure = + AutoReleaseClosure>:: + create_unique(brpc_request, send_callback); if (enable_http_send_block(*brpc_request)) { - RETURN_IF_ERROR(transmit_block_http(_context->exec_env(), closure, *brpc_request, - request.channel->_brpc_dest_addr)); + RETURN_IF_ERROR(transmit_block_httpv2(_context->exec_env(), + std::move(send_remote_block_closure), + request.channel->_brpc_dest_addr)); } else { - transmit_block(*request.channel->_brpc_stub, closure, *brpc_request); + transmit_blockv2(*request.channel->_brpc_stub, + std::move(send_remote_block_closure)); } } if (request.block) { @@ -305,23 +309,24 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { auto statistic = brpc_request->mutable_query_statistics(); _statistics->to_pb(statistic); } - auto* closure = request.channel->get_closure(id, request.eos, request.block_holder); + auto send_callback = + request.channel->get_send_callback(id, request.eos, request.block_holder); ExchangeRpcContext rpc_ctx; - rpc_ctx._closure = closure; + rpc_ctx._send_callback = send_callback; rpc_ctx.is_cancelled = false; _instance_to_rpc_ctx[id] = rpc_ctx; - closure->cntl.set_timeout_ms(request.channel->_brpc_timeout_ms); + send_callback->cntl_->set_timeout_ms(request.channel->_brpc_timeout_ms); if (config::exchange_sink_ignore_eovercrowded) { - closure->cntl.ignore_eovercrowded(); + send_callback->cntl_->ignore_eovercrowded(); } - closure->addFailedHandler( + send_callback->addFailedHandler( [&](const InstanceLoId& id, const std::string& err) { _failed(id, err); }); - closure->start_rpc_time = GetCurrentTimeNanos(); - closure->addSuccessHandler([&](const InstanceLoId& id, const bool& eos, - const PTransmitDataResult& result, - const int64_t& start_rpc_time) { + send_callback->start_rpc_time = GetCurrentTimeNanos(); + send_callback->addSuccessHandler([&](const InstanceLoId& id, const bool& eos, + const PTransmitDataResult& result, + const int64_t& start_rpc_time) { set_rpc_time(id, start_rpc_time, result.receive_time()); Status s(Status::create(result.status())); if (s.is()) { @@ -337,11 +342,17 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { }); { SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(ExecEnv::GetInstance()->orphan_mem_tracker()); + auto send_remote_block_closure = + AutoReleaseClosure>:: + create_unique(brpc_request, send_callback); if (enable_http_send_block(*brpc_request)) { - RETURN_IF_ERROR(transmit_block_http(_context->exec_env(), closure, *brpc_request, - request.channel->_brpc_dest_addr)); + RETURN_IF_ERROR(transmit_block_httpv2(_context->exec_env(), + std::move(send_remote_block_closure), + request.channel->_brpc_dest_addr)); } else { - transmit_block(*request.channel->_brpc_stub, closure, *brpc_request); + transmit_blockv2(*request.channel->_brpc_stub, + std::move(send_remote_block_closure)); } } if (request.block_holder->get_block()) { @@ -361,7 +372,7 @@ Status 
ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { template void ExchangeSinkBuffer::_construct_request(InstanceLoId id, PUniqueId finst_id) { - _instance_to_request[id] = std::make_unique(); + _instance_to_request[id] = std::make_shared(); _instance_to_request[id]->mutable_finst_id()->CopyFrom(finst_id); _instance_to_request[id]->mutable_query_id()->CopyFrom(_query_id); @@ -372,12 +383,24 @@ template void ExchangeSinkBuffer::_ended(InstanceLoId id) { - std::unique_lock lock(*_instance_to_package_queue_mutex[id]); - if (!_rpc_channel_is_idle[id]) { - _busy_channels--; - _rpc_channel_is_idle[id] = true; - if (_finish_dependency && _busy_channels == 0) { - _finish_dependency->set_ready_to_finish(); + if (!_instance_to_package_queue_mutex.template contains(id)) { + std::stringstream ss; + ss << "failed to find the instance id:" << id + << " now mutex map size:" << _instance_to_package_queue_mutex.size(); + for (const auto& p : _instance_to_package_queue_mutex) { + ss << " key:" << p.first << " value:" << p.second << "\n"; + } + LOG(INFO) << ss.str(); + + LOG(FATAL) << "instance id not found"; + } else { + std::unique_lock lock(*_instance_to_package_queue_mutex[id]); + if (!_rpc_channel_is_idle[id]) { + _busy_channels--; + _rpc_channel_is_idle[id] = true; + if (_finish_dependency && _busy_channels == 0) { + _finish_dependency->set_ready_to_finish(); + } } } } diff --git a/be/src/pipeline/exec/exchange_sink_buffer.h b/be/src/pipeline/exec/exchange_sink_buffer.h index d63dd2a55f9bdb..9111f553b2799b 100644 --- a/be/src/pipeline/exec/exchange_sink_buffer.h +++ b/be/src/pipeline/exec/exchange_sink_buffer.h @@ -36,6 +36,7 @@ #include "runtime/query_statistics.h" #include "runtime/runtime_state.h" #include "service/backend_options.h" +#include "util/ref_count_closure.h" namespace doris { class PTransmitDataParams; @@ -76,15 +77,16 @@ class BroadcastPBlockHolder { BroadcastPBlockHolder(pipeline::BroadcastDependency* dep) : _ref_count(0), _dep(dep) {} ~BroadcastPBlockHolder() noexcept = default; + void ref(int delta) noexcept { _ref_count._value.fetch_add(delta); } void unref() noexcept; - void ref() noexcept { _ref_count._value.fetch_add(1); } + void ref() noexcept { ref(1); } bool available() { return _ref_count._value == 0; } PBlock* get_block() { return &pblock; } private: - AtomicWrapper _ref_count; + AtomicWrapper _ref_count; PBlock pblock; pipeline::BroadcastDependency* _dep; }; @@ -106,10 +108,12 @@ struct BroadcastTransmitInfo { bool eos; }; -template -class SelfDeleteClosure : public google::protobuf::Closure { +template +class ExchangeSendCallback : public ::doris::DummyBrpcCallback { + ENABLE_FACTORY_CREATOR(ExchangeSendCallback); + public: - SelfDeleteClosure() = default; + ExchangeSendCallback() = default; void init(InstanceLoId id, bool eos, vectorized::BroadcastPBlockHolder* data) { _id = id; @@ -117,32 +121,35 @@ class SelfDeleteClosure : public google::protobuf::Closure { _data = data; } - ~SelfDeleteClosure() override = default; - SelfDeleteClosure(const SelfDeleteClosure& other) = delete; - SelfDeleteClosure& operator=(const SelfDeleteClosure& other) = delete; + ~ExchangeSendCallback() override = default; + ExchangeSendCallback(const ExchangeSendCallback& other) = delete; + ExchangeSendCallback& operator=(const ExchangeSendCallback& other) = delete; void addFailedHandler( const std::function& fail_fn) { _fail_fn = fail_fn; } - void addSuccessHandler(const std::function& suc_fn) { + void addSuccessHandler(const
std::function& suc_fn) { _suc_fn = suc_fn; } - void Run() noexcept override { + void call() noexcept override { try { if (_data) { _data->unref(); } - if (cntl.Failed()) { + if (::doris::DummyBrpcCallback::cntl_->Failed()) { std::string err = fmt::format( "failed to send brpc when exchange, error={}, error_text={}, client: {}, " "latency = {}", - berror(cntl.ErrorCode()), cntl.ErrorText(), BackendOptions::get_localhost(), - cntl.latency_us()); + berror(::doris::DummyBrpcCallback::cntl_->ErrorCode()), + ::doris::DummyBrpcCallback::cntl_->ErrorText(), + BackendOptions::get_localhost(), + ::doris::DummyBrpcCallback::cntl_->latency_us()); _fail_fn(_id, err); } else { - _suc_fn(_id, _eos, result, start_rpc_time); + _suc_fn(_id, _eos, *(::doris::DummyBrpcCallback::response_), + start_rpc_time); } } catch (const std::exception& exp) { LOG(FATAL) << "brpc callback error: " << exp.what(); @@ -150,21 +157,18 @@ class SelfDeleteClosure : public google::protobuf::Closure { LOG(FATAL) << "brpc callback error."; } } - - brpc::Controller cntl; - T result; int64_t start_rpc_time; private: std::function _fail_fn; - std::function _suc_fn; + std::function _suc_fn; InstanceLoId _id; bool _eos; vectorized::BroadcastPBlockHolder* _data; }; struct ExchangeRpcContext { - SelfDeleteClosure* _closure = nullptr; + std::shared_ptr> _send_callback = nullptr; bool is_cancelled = false; }; @@ -177,7 +181,7 @@ class ExchangeSinkBuffer { void register_sink(TUniqueId); Status add_block(TransmitInfo&& request); - Status add_block(BroadcastTransmitInfo&& request, [[maybe_unused]] bool* sent); + Status add_block(BroadcastTransmitInfo&& request); bool can_write() const; bool is_pending_finish(); void close(); @@ -198,6 +202,7 @@ class ExchangeSinkBuffer { phmap::flat_hash_map, std::list>>> _instance_to_package_queue; + size_t _queue_capacity; // store data in broadcast shuffle phmap::flat_hash_map, std::list>>> @@ -206,7 +211,7 @@ class ExchangeSinkBuffer { // must init zero // TODO: make all flat_hash_map to a STRUT phmap::flat_hash_map _instance_to_seq; - phmap::flat_hash_map> _instance_to_request; + phmap::flat_hash_map> _instance_to_request; // One channel is corresponding to a downstream instance. 
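
The exchange_sink_buffer changes above retire the self-deleting `SelfDeleteClosure` in favor of a reusable `ExchangeSendCallback` (holding the controller and response as shared `cntl_`/`response_` members) that a separate `AutoReleaseClosure` wrapper invokes once and then releases. A rough sketch of that ownership split, using stand-in types rather than the real brpc/Doris classes:

```cpp
#include <iostream>
#include <memory>
#include <string>

// Stand-in for google::protobuf::Closure.
struct Closure {
    virtual ~Closure() = default;
    virtual void Run() = 0;
};

// Reusable callback: owns response state that must outlive a single RPC
// attempt, instead of keeping it by value inside a one-shot closure.
struct SendCallback {
    std::shared_ptr<std::string> response_ = std::make_shared<std::string>();
    void call() { std::cout << "rpc done: " << *response_ << "\n"; }
};

// One-shot adapter: the RPC layer runs it exactly once; it forwards to the
// shared callback and frees itself, which is the AutoReleaseClosure idea.
class AutoReleaseAdapter : public Closure {
public:
    explicit AutoReleaseAdapter(std::shared_ptr<SendCallback> cb) : _cb(std::move(cb)) {}
    void Run() override {
        _cb->call();
        delete this; // single use; the shared callback survives
    }

private:
    std::shared_ptr<SendCallback> _cb;
};

int main() {
    auto cb = std::make_shared<SendCallback>();
    *cb->response_ = "ok";
    Closure* done = new AutoReleaseAdapter(cb); // deleted inside Run()
    done->Run();
    // cb and its response_ remain valid here, which is what lets the buffer
    // still cancel via send_callback->cntl_ after the closure is gone.
}
```
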
phmap::flat_hash_map _rpc_channel_is_idle; // Number of busy channels; @@ -236,7 +241,6 @@ class ExchangeSinkBuffer { std::atomic _total_queue_size = 0; static constexpr int QUEUE_CAPACITY_FACTOR = 64; - int _queue_capacity = 0; std::shared_ptr _queue_dependency = nullptr; std::shared_ptr _finish_dependency = nullptr; QueryStatistics* _statistics = nullptr; diff --git a/be/src/pipeline/exec/exchange_sink_operator.cpp b/be/src/pipeline/exec/exchange_sink_operator.cpp index 0ca38d81c13093..25418492954c5b 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.cpp +++ b/be/src/pipeline/exec/exchange_sink_operator.cpp @@ -98,7 +98,7 @@ bool ExchangeSinkLocalState::transfer_large_data_by_brpc() const { Status ExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(PipelineXSinkLocalState<>::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); _sender_id = info.sender_id; @@ -192,17 +192,16 @@ Status ExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf ADD_CHILD_TIMER(_profile, "WaitForBroadcastBuffer", timer_name); } else if (local_size > 0) { size_t dep_id = 0; - _channels_dependency.resize(local_size); + _local_channels_dependency.resize(local_size); _wait_channel_timer.resize(local_size); auto deps_for_channels = AndDependency::create_shared(_parent->operator_id()); for (auto channel : channels) { if (channel->is_local()) { - _channels_dependency[dep_id] = - ChannelDependency::create_shared(_parent->operator_id()); - channel->set_dependency(_channels_dependency[dep_id]); - deps_for_channels->add_child(_channels_dependency[dep_id]); - _wait_channel_timer[dep_id] = - ADD_CHILD_TIMER(_profile, "WaitForLocalExchangeBuffer", timer_name); + _local_channels_dependency[dep_id] = channel->get_local_channel_dependency(); + DCHECK(_local_channels_dependency[dep_id] != nullptr); + deps_for_channels->add_child(_local_channels_dependency[dep_id]); + _wait_channel_timer[dep_id] = ADD_CHILD_TIMER( + _profile, fmt::format("WaitForLocalExchangeBuffer{}", dep_id), timer_name); dep_id++; } } @@ -214,12 +213,16 @@ Status ExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf new vectorized::XXHashPartitioner(channels.size())); RETURN_IF_ERROR(_partitioner->init(p._texprs)); RETURN_IF_ERROR(_partitioner->prepare(state, p._row_desc)); + _profile->add_info_string("Partitioner", + fmt::format("XXHashPartitioner({})", _partition_count)); } else if (p._part_type == TPartitionType::BUCKET_SHFFULE_HASH_PARTITIONED) { _partition_count = channel_shared_ptrs.size(); _partitioner.reset(new vectorized::Crc32HashPartitioner( channel_shared_ptrs.size())); RETURN_IF_ERROR(_partitioner->init(p._texprs)); RETURN_IF_ERROR(_partitioner->prepare(state, p._row_desc)); + _profile->add_info_string("Partitioner", + fmt::format("Crc32HashPartitioner({})", _partition_count)); } return Status::OK(); @@ -300,7 +303,7 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block SourceState source_state) { auto& local_state = get_local_state(state); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)block->rows()); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); local_state._peak_memory_usage_counter->set(_mem_tracker->peak_consumption()); bool all_receiver_eof = true; for (auto channel : local_state.channels) { @@ -345,23 +348,25 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, 
vectorized::Block* block } else { block_holder->get_block()->Clear(); } + local_state._broadcast_dependency->take_available_block(); + block_holder->ref(local_state.channels.size()); Status status; - bool sent = false; for (auto channel : local_state.channels) { if (!channel->is_receiver_eof()) { + Status status; if (channel->is_local()) { + block_holder->unref(); status = channel->send_local_block(&cur_block); } else { SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); status = channel->send_broadcast_block( - block_holder, &sent, source_state == SourceState::FINISHED); + block_holder, source_state == SourceState::FINISHED); } HANDLE_CHANNEL_STATUS(state, channel, status); + } else { + block_holder->unref(); } } - if (sent) { - local_state._broadcast_dependency->take_available_block(); - } cur_block.clear_column_data(); local_state._serializer.get_block()->set_muatable_columns( cur_block.mutate_columns()); @@ -392,9 +397,11 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block } else if (_part_type == TPartitionType::HASH_PARTITIONED || _part_type == TPartitionType::BUCKET_SHFFULE_HASH_PARTITIONED) { auto rows = block->rows(); - SCOPED_TIMER(local_state._split_block_hash_compute_timer); - RETURN_IF_ERROR( - local_state._partitioner->do_partitioning(state, block, _mem_tracker.get())); + { + SCOPED_TIMER(local_state._split_block_hash_compute_timer); + RETURN_IF_ERROR( + local_state._partitioner->do_partitioning(state, block, _mem_tracker.get())); + } if (_part_type == TPartitionType::HASH_PARTITIONED) { RETURN_IF_ERROR(channel_add_rows(state, local_state.channels, local_state._partition_count, @@ -419,7 +426,8 @@ Status ExchangeSinkOperatorX::serialize_block(ExchangeSinkLocalState& state, vec { SCOPED_TIMER(state.serialize_batch_timer()); dest->Clear(); - size_t uncompressed_bytes = 0, compressed_bytes = 0; + size_t uncompressed_bytes = 0; + size_t compressed_bytes = 0; RETURN_IF_ERROR(src->serialize(_state->be_exec_version(), dest, &uncompressed_bytes, &compressed_bytes, _compression_type, _transfer_large_data_by_brpc)); @@ -449,7 +457,10 @@ Status ExchangeSinkLocalState::get_next_available_buffer( return Status::OK(); } } - return Status::InternalError("No broadcast buffer left!"); + return Status::InternalError("No broadcast buffer left! 
Available blocks: " + + std::to_string(_broadcast_dependency->available_blocks()) + + " and number of buffer is " + + std::to_string(_broadcast_pb_blocks.size())); } template @@ -502,16 +513,16 @@ Status ExchangeSinkLocalState::close(RuntimeState* state, Status exec_status) { if (_closed) { return Status::OK(); } - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_close_timer); COUNTER_UPDATE(_wait_queue_timer, _queue_dependency->write_watcher_elapse_time()); if (_broadcast_dependency) { COUNTER_UPDATE(_wait_broadcast_buffer_timer, _broadcast_dependency->write_watcher_elapse_time()); } - for (size_t i = 0; i < _channels_dependency.size(); i++) { + for (size_t i = 0; i < _local_channels_dependency.size(); i++) { COUNTER_UPDATE(_wait_channel_timer[i], - _channels_dependency[i]->write_watcher_elapse_time()); + _local_channels_dependency[i]->write_watcher_elapse_time()); } _sink_buffer->update_profile(profile()); _sink_buffer->close(); diff --git a/be/src/pipeline/exec/exchange_sink_operator.h b/be/src/pipeline/exec/exchange_sink_operator.h index 05fa799e5cf2fb..6b9d3b5e4b17c1 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.h +++ b/be/src/pipeline/exec/exchange_sink_operator.h @@ -68,7 +68,7 @@ class ExchangeSinkQueueDependency final : public WriteDependency { public: ENABLE_FACTORY_CREATOR(ExchangeSinkQueueDependency); ExchangeSinkQueueDependency(int id) : WriteDependency(id, "ResultQueueDependency") {} - ~ExchangeSinkQueueDependency() = default; + ~ExchangeSinkQueueDependency() override = default; void* shared_state() override { return nullptr; } }; @@ -77,11 +77,11 @@ class BroadcastDependency final : public WriteDependency { public: ENABLE_FACTORY_CREATOR(BroadcastDependency); BroadcastDependency(int id) : WriteDependency(id, "BroadcastDependency"), _available_block(0) {} - virtual ~BroadcastDependency() = default; + ~BroadcastDependency() override = default; [[nodiscard]] WriteDependency* write_blocked_by() override { if (config::enable_fuzzy_mode && _available_block == 0 && - _write_dependency_watcher.elapsed_time() > SLOW_DEPENDENCY_THRESHOLD) { + _should_log(_write_dependency_watcher.elapsed_time())) { LOG(WARNING) << "========Dependency may be blocked by some reasons: " << name() << " " << id(); } @@ -107,17 +107,52 @@ class BroadcastDependency final : public WriteDependency { throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, "Should not reach here!"); } + int available_blocks() const { return _available_block; } + private: std::atomic _available_block; }; -class ChannelDependency final : public WriteDependency { +/** + * We use this to control the execution for local exchange. 
+ * +---------------+ +---------------+ +---------------+ + * | ExchangeSink1 | | ExchangeSink2 | | ExchangeSink3 | + * +---------------+ +---------------+ +---------------+ + * | | | + * | +----------------------------+----------------------------------+ | + * +----+------------------|------------------------------------------+ | | + * | | +------------------------|--------------------|------------+-----+ + * Dependency 1-1 | Dependency 2-1 | Dependency 3-1 | Dependency 1-2 | Dependency 2-2 | Dependency 3-2 | + * +----------------------------------------------+ +----------------------------------------------+ + * | queue1 queue2 queue3 | | queue1 queue2 queue3 | + * | LocalRecvr | | LocalRecvr | + * +----------------------------------------------+ +----------------------------------------------+ + * +-----------------+ +------------------+ + * | ExchangeSource1 | | ExchangeSource2 | + * +-----------------+ +------------------+ + */ +class LocalExchangeChannelDependency final : public WriteDependency { public: - ENABLE_FACTORY_CREATOR(ChannelDependency); - ChannelDependency(int id) : WriteDependency(id, "ChannelDependency") {} - ~ChannelDependency() override = default; + ENABLE_FACTORY_CREATOR(LocalExchangeChannelDependency); + LocalExchangeChannelDependency(int id, std::shared_ptr mem_available) + : WriteDependency(id, "LocalExchangeChannelDependency"), + _mem_available(mem_available) {} + ~LocalExchangeChannelDependency() override = default; + + WriteDependency* write_blocked_by() override { + if (config::enable_fuzzy_mode && !_is_runnable() && + _should_log(_write_dependency_watcher.elapsed_time())) { + LOG(WARNING) << "========Dependency may be blocked by some reasons: " << name() << " " + << id(); + } + return _is_runnable() ? nullptr : this; + } void* shared_state() override { return nullptr; } + +private: + bool _is_runnable() const { return _ready_for_write || *_mem_available; } + std::shared_ptr _mem_available; }; class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { @@ -201,14 +236,13 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { // Sender instance id, unique within a fragment. 
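
The diagram and class above replace the per-channel `ChannelDependency` with a `LocalExchangeChannelDependency` that is runnable when either the channel itself is ready or the shared receiver still has memory available. The `_mem_available` member's template argument was lost in extraction; the sketch below assumes a shared atomic flag, consistent with the `*_mem_available` dereference, and is a hypothetical reduction of the check rather than the Doris class:

```cpp
#include <atomic>
#include <memory>

class LocalChannelGate {
public:
    explicit LocalChannelGate(std::shared_ptr<std::atomic<bool>> mem_available)
            : _mem_available(std::move(mem_available)) {}

    void set_ready_for_write() { _ready.store(true, std::memory_order_release); }

    // The sink may proceed if this channel was explicitly unblocked, or if
    // the local receiver still reports free memory for its queues.
    bool runnable() const {
        return _ready.load(std::memory_order_acquire) ||
               _mem_available->load(std::memory_order_acquire);
    }

private:
    std::atomic<bool> _ready{false};
    std::shared_ptr<std::atomic<bool>> _mem_available; // shared across channels
};
```
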
int _sender_id; std::vector _broadcast_pb_blocks; - int _broadcast_pb_block_idx; vectorized::BlockSerializer _serializer; std::shared_ptr _queue_dependency = nullptr; std::shared_ptr _exchange_sink_dependency = nullptr; std::shared_ptr _broadcast_dependency = nullptr; - std::vector> _channels_dependency; + std::vector> _local_channels_dependency; std::unique_ptr _partitioner; int _partition_count; }; diff --git a/be/src/pipeline/exec/exchange_source_operator.cpp b/be/src/pipeline/exec/exchange_source_operator.cpp index 362853fa18eb61..3213ed55778684 100644 --- a/be/src/pipeline/exec/exchange_source_operator.cpp +++ b/be/src/pipeline/exec/exchange_source_operator.cpp @@ -45,7 +45,7 @@ ExchangeLocalState::ExchangeLocalState(RuntimeState* state, OperatorXBase* paren Status ExchangeLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(PipelineXLocalState<>::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); auto& p = _parent->cast(); stream_recvr = state->exec_env()->vstream_mgr()->create_recvr( @@ -60,11 +60,11 @@ Status ExchangeLocalState::init(RuntimeState* state, LocalStateInfo& info) { queues[i]->set_dependency(deps[i]); source_dependency->add_child(deps[i]); } + static const std::string timer_name = + "WaitForDependency[" + source_dependency->name() + "]Time"; + _wait_for_dependency_timer = ADD_TIMER(_runtime_profile, timer_name); for (size_t i = 0; i < queues.size(); i++) { - static const std::string timer_name = - "WaitForDependency[" + source_dependency->name() + "]Time"; - _wait_for_dependency_timer = ADD_TIMER(_runtime_profile, timer_name); - metrics[i] = ADD_CHILD_TIMER(_runtime_profile, "WaitForData", timer_name); + metrics[i] = ADD_CHILD_TIMER(_runtime_profile, fmt::format("WaitForData{}", i), timer_name); } RETURN_IF_ERROR(_parent->cast()._vsort_exec_exprs.clone( state, vsort_exec_exprs)); @@ -72,7 +72,7 @@ Status ExchangeLocalState::init(RuntimeState* state, LocalStateInfo& info) { } Status ExchangeLocalState::open(RuntimeState* state) { - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(PipelineXLocalState<>::open(state)); return Status::OK(); @@ -125,7 +125,7 @@ Status ExchangeSourceOperatorX::open(RuntimeState* state) { Status ExchangeSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, SourceState& source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); if (_is_merging && !local_state.is_ready) { RETURN_IF_ERROR(local_state.stream_recvr->create_merger( local_state.vsort_exec_exprs.lhs_ordering_expr_ctxs(), _is_asc_order, _nulls_first, @@ -168,7 +168,7 @@ Status ExchangeSourceOperatorX::get_block(RuntimeState* state, vectorized::Block } Status ExchangeLocalState::close(RuntimeState* state) { - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_close_timer); if (_closed) { return Status::OK(); diff --git a/be/src/pipeline/exec/exchange_source_operator.h b/be/src/pipeline/exec/exchange_source_operator.h index 12c6c38e4bc0cc..c41268f8eace7c 100644 --- a/be/src/pipeline/exec/exchange_source_operator.h +++ b/be/src/pipeline/exec/exchange_source_operator.h @@ -54,16 +54,8 @@ struct ExchangeDataDependency final : public Dependency { public: ENABLE_FACTORY_CREATOR(ExchangeDataDependency); ExchangeDataDependency(int id, 
vectorized::VDataStreamRecvr::SenderQueue* sender_queue) - : Dependency(id, "DataDependency"), _sender_queue(sender_queue), _always_done(false) {} + : Dependency(id, "DataDependency"), _always_done(false) {} void* shared_state() override { return nullptr; } - [[nodiscard]] Dependency* read_blocked_by() override { - if (config::enable_fuzzy_mode && _sender_queue->should_wait() && - _read_dependency_watcher.elapsed_time() > SLOW_DEPENDENCY_THRESHOLD) { - LOG(WARNING) << "========Dependency may be blocked by some reasons: " << name() << " " - << id(); - } - return _sender_queue->should_wait() ? this : nullptr; - } void set_always_done() { _always_done = true; @@ -74,17 +66,14 @@ struct ExchangeDataDependency final : public Dependency { _ready_for_read = true; } - void set_ready_for_read() override { - if (_always_done || !_ready_for_read) { + void block_reading() override { + if (_always_done) { return; } _ready_for_read = false; - // ScannerContext is set done outside this function now and only stop watcher here. - _read_dependency_watcher.start(); } private: - vectorized::VDataStreamRecvr::SenderQueue* _sender_queue; std::atomic _always_done; }; diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index 1af98cd77aae98..cb63f64ab4269b 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -46,7 +46,7 @@ HashJoinBuildSinkLocalState::HashJoinBuildSinkLocalState(DataSinkOperatorXBase* Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(JoinBuildSinkLocalState::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); _shared_hash_table_dependency = SharedHashTableDependency::create_shared(_parent->operator_id()); @@ -132,7 +132,7 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo } Status HashJoinBuildSinkLocalState::open(RuntimeState* state) { - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(JoinBuildSinkLocalState::open(state)); auto& p = _parent->cast(); @@ -402,7 +402,7 @@ Status HashJoinBuildSinkOperatorX::open(RuntimeState* state) { Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* in_block, SourceState source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); // make one block for each 4 gigabytes diff --git a/be/src/pipeline/exec/hashjoin_build_sink.h b/be/src/pipeline/exec/hashjoin_build_sink.h index 8ba6e2fba3b9b3..10056a30e72cf4 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.h +++ b/be/src/pipeline/exec/hashjoin_build_sink.h @@ -68,7 +68,6 @@ class HashJoinBuildSinkLocalState final Status process_build_block(RuntimeState* state, vectorized::Block& block, uint8_t offset); void init_short_circuit_for_probe(); - HashJoinBuildSinkOperatorX* join_build() { return (HashJoinBuildSinkOperatorX*)_parent; } bool build_unique() const; std::vector& runtime_filter_descs() const; diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index 61f244c8decde4..b459d8eddd7de0 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ 
b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -32,7 +32,7 @@ HashJoinProbeLocalState::HashJoinProbeLocalState(RuntimeState* state, OperatorXB Status HashJoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(JoinProbeLocalState::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); auto& p = _parent->cast(); _shared_state->probe_ignore_null = p._probe_ignore_null; @@ -60,7 +60,7 @@ Status HashJoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& info) } Status HashJoinProbeLocalState::open(RuntimeState* state) { - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(JoinProbeLocalState::open(state)); @@ -122,7 +122,7 @@ vectorized::DataTypes HashJoinProbeLocalState::left_table_data_types() { } Status HashJoinProbeLocalState::close(RuntimeState* state) { - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_close_timer); if (_closed) { return Status::OK(); @@ -537,20 +537,18 @@ Status HashJoinProbeOperatorX::init(const TPlanNode& tnode, RuntimeState* state) DCHECK(_have_other_join_conjunct); } - std::vector hash_output_slot_ids; // init left/right output slots flags, only column of slot_id in _hash_output_slot_ids need // insert to output block of hash join. // _left_output_slots_flags : column of left table need to output set flag = true // _rgiht_output_slots_flags : column of right table need to output set flag = true // if _hash_output_slot_ids is empty, means all column of left/right table need to output. - auto init_output_slots_flags = [&hash_output_slot_ids](auto& tuple_descs, - auto& output_slot_flags) { + auto init_output_slots_flags = [&](auto& tuple_descs, auto& output_slot_flags) { for (const auto& tuple_desc : tuple_descs) { for (const auto& slot_desc : tuple_desc->slots()) { output_slot_flags.emplace_back( - hash_output_slot_ids.empty() || - std::find(hash_output_slot_ids.begin(), hash_output_slot_ids.end(), - slot_desc->id()) != hash_output_slot_ids.end()); + _hash_output_slot_ids.empty() || + std::find(_hash_output_slot_ids.begin(), _hash_output_slot_ids.end(), + slot_desc->id()) != _hash_output_slot_ids.end()); } } }; diff --git a/be/src/pipeline/exec/jdbc_table_sink_operator.cpp b/be/src/pipeline/exec/jdbc_table_sink_operator.cpp index ae8fcba3d1cb6d..fc90af3591b18a 100644 --- a/be/src/pipeline/exec/jdbc_table_sink_operator.cpp +++ b/be/src/pipeline/exec/jdbc_table_sink_operator.cpp @@ -58,7 +58,7 @@ Status JdbcTableSinkOperatorX::open(RuntimeState* state) { Status JdbcTableSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block, SourceState source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); RETURN_IF_ERROR(local_state.sink(state, block, source_state)); return Status::OK(); } diff --git a/be/src/pipeline/exec/multi_cast_data_stream_sink.h b/be/src/pipeline/exec/multi_cast_data_stream_sink.h index 83c5390a2fb59f..008238577f03a8 100644 --- a/be/src/pipeline/exec/multi_cast_data_stream_sink.h +++ b/be/src/pipeline/exec/multi_cast_data_stream_sink.h @@ -77,7 +77,7 @@ class MultiCastDataStreamSinkOperatorX final Status sink(RuntimeState* state, vectorized::Block* in_block, SourceState source_state) override { auto& local_state = get_local_state(state); - 
SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); if (in_block->rows() > 0 || source_state == SourceState::FINISHED) { COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); diff --git a/be/src/pipeline/exec/multi_cast_data_stream_source.cpp b/be/src/pipeline/exec/multi_cast_data_stream_source.cpp index dd1c1e46d21113..4c6e21c5c6da25 100644 --- a/be/src/pipeline/exec/multi_cast_data_stream_source.cpp +++ b/be/src/pipeline/exec/multi_cast_data_stream_source.cpp @@ -107,7 +107,7 @@ Status MultiCastDataStreamerSourceOperator::get_block(RuntimeState* state, vecto if (!_output_expr_contexts.empty() && output_block->rows() > 0) { RETURN_IF_ERROR(vectorized::VExprContext::get_output_block_after_execute_exprs( - _output_expr_contexts, *output_block, block)); + _output_expr_contexts, *output_block, block, true)); materialize_block_inplace(*block); } if (eos) { @@ -135,7 +135,7 @@ MultiCastDataStreamSourceLocalState::MultiCastDataStreamSourceLocalState(Runtime Status MultiCastDataStreamSourceLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); RETURN_IF_ERROR(RuntimeFilterConsumer::init(state)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); auto& p = _parent->cast(); static_cast(_dependency)->set_consumer_id(p._consumer_id); @@ -145,8 +145,7 @@ Status MultiCastDataStreamSourceLocalState::init(RuntimeState* state, LocalState } // init profile for runtime filter RuntimeFilterConsumer::_init_profile(profile()); - _filter_dependency->set_filter_blocked_by_fn( - [this]() { return this->runtime_filters_are_ready_or_timeout(); }); + init_runtime_filter_dependency(_filter_dependency.get()); return Status::OK(); } @@ -155,7 +154,7 @@ Status MultiCastDataStreamerSourceOperatorX::get_block(RuntimeState* state, SourceState& source_state) { //auto& local_state = get_local_state(state); auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); bool eos = false; vectorized::Block tmp_block; vectorized::Block* output_block = block; @@ -171,7 +170,7 @@ Status MultiCastDataStreamerSourceOperatorX::get_block(RuntimeState* state, if (!local_state._output_expr_contexts.empty() && output_block->rows() > 0) { RETURN_IF_ERROR(vectorized::VExprContext::get_output_block_after_execute_exprs( - local_state._output_expr_contexts, *output_block, block)); + local_state._output_expr_contexts, *output_block, block, true)); materialize_block_inplace(*block); } COUNTER_UPDATE(local_state._rows_returned_counter, block->rows()); diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp index 152bec5d7d61f6..f525a1abf1c78f 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp @@ -32,7 +32,7 @@ NestedLoopJoinBuildSinkLocalState::NestedLoopJoinBuildSinkLocalState(DataSinkOpe Status NestedLoopJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(JoinBuildSinkLocalState::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); auto& p = _parent->cast(); _shared_state->join_op_variants = p._join_op_variants; @@ 
-93,7 +93,7 @@ Status NestedLoopJoinBuildSinkOperatorX::open(RuntimeState* state) { Status NestedLoopJoinBuildSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block* block, SourceState source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)block->rows()); auto rows = block->rows(); auto mem_usage = block->allocated_bytes(); diff --git a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp index ca51d0f2714328..f73b84c538e1da 100644 --- a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp @@ -52,7 +52,7 @@ NestedLoopJoinProbeLocalState::NestedLoopJoinProbeLocalState(RuntimeState* state Status NestedLoopJoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(JoinProbeLocalState::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); auto& p = _parent->cast(); _join_conjuncts.resize(p._join_conjuncts.size()); @@ -66,7 +66,7 @@ Status NestedLoopJoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& } Status NestedLoopJoinProbeLocalState::close(RuntimeState* state) { - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_close_timer); if (_closed) { return Status::OK(); diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp b/be/src/pipeline/exec/olap_scan_operator.cpp index 5ea76ddd0d615c..a77dba4c00a468 100644 --- a/be/src/pipeline/exec/olap_scan_operator.cpp +++ b/be/src/pipeline/exec/olap_scan_operator.cpp @@ -217,7 +217,6 @@ Status OlapScanLocalState::_init_scanners(std::list* s return Status::OK(); } SCOPED_TIMER(_scanner_init_timer); - auto span = opentelemetry::trace::Tracer::GetCurrentSpan(); if (!_conjuncts.empty()) { std::string message; @@ -247,13 +246,19 @@ Status OlapScanLocalState::_init_scanners(std::list* s } int scanners_per_tablet = std::max(1, 64 / (int)_scan_ranges.size()); - auto build_new_scanner = [&](const TPaloScanRange& scan_range, + auto build_new_scanner = [&](BaseTabletSPtr tablet, int64_t version, const std::vector& key_ranges) { - std::shared_ptr scanner = - vectorized::NewOlapScanner::create_shared( - state(), this, p._limit_per_scanner, p._olap_scan_node.is_preaggregation, - scan_range, key_ranges, _scanner_profile.get()); - + auto scanner = vectorized::NewOlapScanner::create_shared( + this, vectorized::NewOlapScanner::Params { + state(), + _scanner_profile.get(), + key_ranges, + std::move(tablet), + version, + {}, + p._limit_per_scanner, + p._olap_scan_node.is_preaggregation, + }); RETURN_IF_ERROR(scanner->prepare(state(), _conjuncts)); scanner->set_compound_filters(_compound_filters); scanners->push_back(scanner); @@ -261,7 +266,9 @@ Status OlapScanLocalState::_init_scanners(std::list* s }; for (auto& scan_range : _scan_ranges) { auto tablet = DORIS_TRY(ExecEnv::get_tablet(scan_range->tablet_id)); - + int64_t version = 0; + std::from_chars(scan_range->version.data(), + scan_range->version.data() + scan_range->version.size(), version); std::vector>* ranges = &_cond_ranges; int size_based_scanners_per_tablet = 1; @@ -282,7 +289,7 @@ Status OlapScanLocalState::_init_scanners(std::list* s ++j, ++i) { scanner_ranges.push_back((*ranges)[i].get()); } - 
RETURN_IF_ERROR(build_new_scanner(*scan_range, scanner_ranges)); + RETURN_IF_ERROR(build_new_scanner(tablet, version, scanner_ranges)); } } @@ -300,7 +307,6 @@ void OlapScanLocalState::set_scan_ranges(RuntimeState* state, _scan_ranges.emplace_back(new TPaloScanRange(scan_range.scan_range.palo_scan_range)); // COUNTER_UPDATE(_tablet_counter, 1); } - // telemetry::set_current_span_attribute(_tablet_counter); } static std::string olap_filter_to_string(const doris::TCondition& condition) { diff --git a/be/src/pipeline/exec/olap_table_sink_operator.cpp b/be/src/pipeline/exec/olap_table_sink_operator.cpp index 999bf921d54606..f5f9da081364d9 100644 --- a/be/src/pipeline/exec/olap_table_sink_operator.cpp +++ b/be/src/pipeline/exec/olap_table_sink_operator.cpp @@ -31,7 +31,7 @@ OperatorPtr OlapTableSinkOperatorBuilder::build_operator() { Status OlapTableSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); auto& p = _parent->cast(); RETURN_IF_ERROR(_writer->init_properties(p._pool, p._group_commit)); @@ -43,7 +43,7 @@ Status OlapTableSinkLocalState::close(RuntimeState* state, Status exec_status) { return Status::OK(); } SCOPED_TIMER(_close_timer); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); if (_closed) { return _close_status; } diff --git a/be/src/pipeline/exec/olap_table_sink_operator.h b/be/src/pipeline/exec/olap_table_sink_operator.h index 877a42937da48b..ad35f7972864c4 100644 --- a/be/src/pipeline/exec/olap_table_sink_operator.h +++ b/be/src/pipeline/exec/olap_table_sink_operator.h @@ -54,7 +54,7 @@ class OlapTableSinkLocalState final : Base(parent, state) {}; Status init(RuntimeState* state, LocalSinkStateInfo& info) override; Status open(RuntimeState* state) override { - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); return Base::open(state); } @@ -95,7 +95,7 @@ class OlapTableSinkOperatorX final : public DataSinkOperatorXtotal_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); return local_state.sink(state, in_block, source_state); } diff --git a/be/src/pipeline/exec/partition_sort_sink_operator.cpp b/be/src/pipeline/exec/partition_sort_sink_operator.cpp index 5abbecbef89d58..d0c3774a58bb00 100644 --- a/be/src/pipeline/exec/partition_sort_sink_operator.cpp +++ b/be/src/pipeline/exec/partition_sort_sink_operator.cpp @@ -30,7 +30,7 @@ OperatorPtr PartitionSortSinkOperatorBuilder::build_operator() { Status PartitionSortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); auto& p = _parent->cast(); RETURN_IF_ERROR(p._vsort_exec_exprs.clone(state, _vsort_exec_exprs)); _partition_expr_ctxs.resize(p._partition_expr_ctxs.size()); @@ -98,7 +98,7 @@ Status PartitionSortSinkOperatorX::sink(RuntimeState* state, vectorized::Block* SourceState source_state) { auto& local_state = get_local_state(state); auto current_rows = input_block->rows(); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); if (current_rows > 0) { local_state.child_input_rows = local_state.child_input_rows + current_rows; if (UNLIKELY(_partition_exprs_num 
== 0)) { diff --git a/be/src/pipeline/exec/partition_sort_source_operator.cpp b/be/src/pipeline/exec/partition_sort_source_operator.cpp index 84a20a2f640b58..dfbe1fd40f53b0 100644 --- a/be/src/pipeline/exec/partition_sort_source_operator.cpp +++ b/be/src/pipeline/exec/partition_sort_source_operator.cpp @@ -31,7 +31,7 @@ OperatorPtr PartitionSortSourceOperatorBuilder::build_operator() { Status PartitionSortSourceLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(PipelineXLocalState::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); _get_next_timer = ADD_TIMER(profile(), "GetResultTime"); _get_sorted_timer = ADD_TIMER(profile(), "GetSortedTime"); @@ -43,7 +43,7 @@ Status PartitionSortSourceOperatorX::get_block(RuntimeState* state, vectorized:: SourceState& source_state) { RETURN_IF_CANCELLED(state); auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); SCOPED_TIMER(local_state._get_next_timer); output_block->clear_column_data(); { diff --git a/be/src/pipeline/exec/repeat_operator.cpp b/be/src/pipeline/exec/repeat_operator.cpp index c9e0a38ec4ccbe..40f6f5d7b21fb4 100644 --- a/be/src/pipeline/exec/repeat_operator.cpp +++ b/be/src/pipeline/exec/repeat_operator.cpp @@ -52,7 +52,7 @@ RepeatLocalState::RepeatLocalState(RuntimeState* state, OperatorXBase* parent) Status RepeatLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); auto& p = _parent->cast(); _expr_ctxs.resize(p._expr_ctxs.size()); @@ -211,7 +211,6 @@ Status RepeatOperatorX::push(RuntimeState* state, vectorized::Block* input_block Status RepeatOperatorX::pull(doris::RuntimeState* state, vectorized::Block* output_block, SourceState& source_state) const { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); auto& _repeat_id_idx = local_state._repeat_id_idx; auto& _child_block = *local_state._child_block; auto& _child_eos = local_state._child_eos; diff --git a/be/src/pipeline/exec/repeat_operator.h b/be/src/pipeline/exec/repeat_operator.h index f6c52a0be8d5a9..18d373b77dfa6b 100644 --- a/be/src/pipeline/exec/repeat_operator.h +++ b/be/src/pipeline/exec/repeat_operator.h @@ -70,6 +70,7 @@ class RepeatLocalState final : public PipelineXLocalState { std::unique_ptr _intermediate_block {}; vectorized::VExprContextSPtrs _expr_ctxs; }; + class RepeatOperatorX final : public StatefulOperatorX { public: using Base = StatefulOperatorX; diff --git a/be/src/pipeline/exec/result_file_sink_operator.cpp b/be/src/pipeline/exec/result_file_sink_operator.cpp index ea765055cdadca..3193c1b07c4921 100644 --- a/be/src/pipeline/exec/result_file_sink_operator.cpp +++ b/be/src/pipeline/exec/result_file_sink_operator.cpp @@ -101,11 +101,16 @@ Status ResultFileSinkOperatorX::open(RuntimeState* state) { Status ResultFileSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); _sender_id = info.sender_id; _brpc_wait_timer = ADD_TIMER(_profile, "BrpcSendTime.Wait"); + _local_send_timer = ADD_TIMER(_profile, "LocalSendTime"); + _brpc_send_timer = ADD_TIMER(_profile, "BrpcSendTime"); + 
_split_block_distribute_by_channel_timer = + ADD_TIMER(_profile, "SplitBlockDistributeByChannelTime"); + _merge_block_timer = ADD_TIMER(_profile, "MergeBlockTime"); auto& p = _parent->cast(); CHECK(p._file_opts.get() != nullptr); if (p._is_top_sink) { @@ -153,7 +158,7 @@ Status ResultFileSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& i } Status ResultFileSinkLocalState::open(RuntimeState* state) { - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); return Base::open(state); } @@ -163,7 +168,7 @@ Status ResultFileSinkLocalState::close(RuntimeState* state, Status exec_status) return Status::OK(); } SCOPED_TIMER(_close_timer); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); auto& p = _parent->cast(); if (_closed) { @@ -236,7 +241,7 @@ Status ResultFileSinkLocalState::close(RuntimeState* state, Status exec_status) } else { SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); status = channel->send_broadcast_block(_block_holder.get(), - nullptr, true); + true); } HANDLE_CHANNEL_STATUS(state, channel, status); } @@ -264,7 +269,7 @@ void ResultFileSinkLocalState::_handle_eof_channel(RuntimeState* state, ChannelP Status ResultFileSinkOperatorX::sink(RuntimeState* state, vectorized::Block* in_block, SourceState source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); return local_state.sink(state, in_block, source_state); } diff --git a/be/src/pipeline/exec/result_file_sink_operator.h b/be/src/pipeline/exec/result_file_sink_operator.h index 3a98401de60738..63217aad04781b 100644 --- a/be/src/pipeline/exec/result_file_sink_operator.h +++ b/be/src/pipeline/exec/result_file_sink_operator.h @@ -58,6 +58,12 @@ class ResultFileSinkLocalState final [[nodiscard]] int sender_id() const { return _sender_id; } RuntimeProfile::Counter* brpc_wait_timer() { return _brpc_wait_timer; } + RuntimeProfile::Counter* local_send_timer() { return _local_send_timer; } + RuntimeProfile::Counter* brpc_send_timer() { return _brpc_send_timer; } + RuntimeProfile::Counter* merge_block_timer() { return _merge_block_timer; } + RuntimeProfile::Counter* split_block_distribute_by_channel_timer() { + return _split_block_distribute_by_channel_timer; + } private: friend class ResultFileSinkOperatorX; @@ -73,6 +79,10 @@ class ResultFileSinkLocalState final vectorized::BlockSerializer _serializer; std::unique_ptr _block_holder; RuntimeProfile::Counter* _brpc_wait_timer; + RuntimeProfile::Counter* _local_send_timer; + RuntimeProfile::Counter* _brpc_send_timer; + RuntimeProfile::Counter* _merge_block_timer; + RuntimeProfile::Counter* _split_block_distribute_by_channel_timer; int _sender_id; }; diff --git a/be/src/pipeline/exec/result_sink_operator.cpp b/be/src/pipeline/exec/result_sink_operator.cpp index 12c06794a952f6..b65a5ba1b86afe 100644 --- a/be/src/pipeline/exec/result_sink_operator.cpp +++ b/be/src/pipeline/exec/result_sink_operator.cpp @@ -52,7 +52,7 @@ bool ResultSinkOperator::can_write() { Status ResultSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(PipelineXSinkLocalState<>::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); static const std::string timer_name = "WaitForDependencyTime"; _wait_for_dependency_timer =
ADD_TIMER(_profile, timer_name); @@ -79,7 +79,7 @@ Status ResultSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) } Status ResultSinkLocalState::open(RuntimeState* state) { - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(PipelineXSinkLocalState<>::open(state)); auto& p = _parent->cast(); @@ -139,7 +139,7 @@ Status ResultSinkOperatorX::open(RuntimeState* state) { Status ResultSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block, SourceState source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); if (_fetch_option.use_two_phase_fetch && block->rows() > 0) { RETURN_IF_ERROR(_second_phase_fetch_data(state, block)); } @@ -173,14 +173,12 @@ Status ResultSinkLocalState::close(RuntimeState* state, Status exec_status) { } SCOPED_TIMER(_close_timer); COUNTER_UPDATE(_wait_for_queue_timer, _queue_dependency->write_watcher_elapse_time()); - COUNTER_UPDATE(profile()->total_time_counter(), _queue_dependency->write_watcher_elapse_time()); + COUNTER_UPDATE(exec_time_counter(), _queue_dependency->write_watcher_elapse_time()); COUNTER_SET(_wait_for_buffer_timer, _buffer_dependency->write_watcher_elapse_time()); - COUNTER_UPDATE(profile()->total_time_counter(), - _buffer_dependency->write_watcher_elapse_time()); + COUNTER_UPDATE(exec_time_counter(), _buffer_dependency->write_watcher_elapse_time()); COUNTER_SET(_wait_for_cancel_timer, _cancel_dependency->write_watcher_elapse_time()); - COUNTER_UPDATE(profile()->total_time_counter(), - _cancel_dependency->write_watcher_elapse_time()); - SCOPED_TIMER(profile()->total_time_counter()); + COUNTER_UPDATE(exec_time_counter(), _cancel_dependency->write_watcher_elapse_time()); + SCOPED_TIMER(exec_time_counter()); Status final_status = exec_status; if (_writer) { // close the writer diff --git a/be/src/pipeline/exec/scan_operator.cpp b/be/src/pipeline/exec/scan_operator.cpp index 71876045257780..601bf5b8b9f340 100644 --- a/be/src/pipeline/exec/scan_operator.cpp +++ b/be/src/pipeline/exec/scan_operator.cpp @@ -118,7 +118,7 @@ bool ScanLocalState::should_run_serial() const { template Status ScanLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(PipelineXLocalState<>::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(RuntimeFilterConsumer::init(state)); @@ -128,8 +128,6 @@ Status ScanLocalState::init(RuntimeState* state, LocalStateInfo& info) _source_dependency->add_child(_open_dependency); _eos_dependency = EosDependency::create_shared(PipelineXLocalState<>::_parent->operator_id()); _source_dependency->add_child(_eos_dependency); - _filter_dependency->set_filter_blocked_by_fn( - [this]() { return this->runtime_filters_are_ready_or_timeout(); }); auto& p = _parent->cast(); set_scan_ranges(state, info.scan_ranges); _common_expr_ctxs_push_down.resize(p._common_expr_ctxs_push_down.size()); @@ -143,6 +141,7 @@ Status ScanLocalState::init(RuntimeState* state, LocalStateInfo& info) } // init profile for runtime filter RuntimeFilterConsumer::_init_profile(profile()); + init_runtime_filter_dependency(_filter_dependency.get()); // 1: running at not pipeline mode will init profile. // 2: the scan node should create scanner at pipeline mode will init profile. 
@@ -167,7 +166,7 @@ Status ScanLocalState::init(RuntimeState* state, LocalStateInfo& info) template Status ScanLocalState::open(RuntimeState* state) { - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); if (_open_dependency == nullptr) { return Status::OK(); @@ -1344,21 +1343,18 @@ Status ScanLocalState::close(RuntimeState* state) { SCOPED_TIMER(_close_timer); if (_data_ready_dependency) { COUNTER_UPDATE(_wait_for_data_timer, _data_ready_dependency->read_watcher_elapse_time()); - COUNTER_UPDATE(profile()->total_time_counter(), - _data_ready_dependency->read_watcher_elapse_time()); + COUNTER_UPDATE(exec_time_counter(), _data_ready_dependency->read_watcher_elapse_time()); } if (_eos_dependency) { COUNTER_SET(_wait_for_eos_timer, _eos_dependency->read_watcher_elapse_time()); - COUNTER_UPDATE(profile()->total_time_counter(), - _eos_dependency->read_watcher_elapse_time()); + COUNTER_UPDATE(exec_time_counter(), _eos_dependency->read_watcher_elapse_time()); } if (_scanner_done_dependency) { COUNTER_SET(_wait_for_scanner_done_timer, _scanner_done_dependency->read_watcher_elapse_time()); - COUNTER_UPDATE(profile()->total_time_counter(), - _scanner_done_dependency->read_watcher_elapse_time()); + COUNTER_UPDATE(exec_time_counter(), _scanner_done_dependency->read_watcher_elapse_time()); } - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); if (_scanner_ctx.get()) { _scanner_ctx->clear_and_join(reinterpret_cast(this), state); } @@ -1371,7 +1367,7 @@ Status ScanOperatorX::get_block(RuntimeState* state, vectorized: SourceState& source_state) { auto& local_state = get_local_state(state); SCOPED_TIMER(local_state._get_next_timer); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); // in inverted index apply logic, in order to optimize query performance, // we built some temporary columns into block, these columns only used in scan node level, // remove them when query leave scan node to avoid other nodes use block->columns() to make a wrong decision diff --git a/be/src/pipeline/exec/scan_operator.h b/be/src/pipeline/exec/scan_operator.h index c71811c831d141..68d006006f6989 100644 --- a/be/src/pipeline/exec/scan_operator.h +++ b/be/src/pipeline/exec/scan_operator.h @@ -103,7 +103,7 @@ class DataReadyDependency final : public Dependency { _scanner_ctx->reschedule_scanner_ctx(); } if (config::enable_fuzzy_mode && !_ready_for_read && - _read_dependency_watcher.elapsed_time() > SLOW_DEPENDENCY_THRESHOLD) { + _should_log(_read_dependency_watcher.elapsed_time())) { LOG(WARNING) << "========Dependency may be blocked by some reasons: " << name() << " " << id(); } diff --git a/be/src/pipeline/exec/schema_scan_operator.cpp b/be/src/pipeline/exec/schema_scan_operator.cpp index 430315b99872af..da16453c90f6dd 100644 --- a/be/src/pipeline/exec/schema_scan_operator.cpp +++ b/be/src/pipeline/exec/schema_scan_operator.cpp @@ -47,7 +47,7 @@ Status SchemaScanOperator::close(RuntimeState* state) { Status SchemaScanLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(PipelineXLocalState<>::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); auto& p = _parent->cast(); _scanner_param.common_param = p._common_scanner_param; @@ -212,7 +212,7 @@ Status SchemaScanOperatorX::prepare(RuntimeState* state) { Status SchemaScanOperatorX::get_block(RuntimeState* state, 
vectorized::Block* block, SourceState& source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); RETURN_IF_CANCELLED(state); bool schema_eos = false; diff --git a/be/src/pipeline/exec/select_operator.h b/be/src/pipeline/exec/select_operator.h index 6c998c8ab3cd0a..d75c0a78481645 100644 --- a/be/src/pipeline/exec/select_operator.h +++ b/be/src/pipeline/exec/select_operator.h @@ -61,7 +61,7 @@ class SelectOperatorX final : public StreamingOperatorX { Status pull(RuntimeState* state, vectorized::Block* block, SourceState& source_state) override { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); RETURN_IF_CANCELLED(state); RETURN_IF_ERROR(vectorized::VExprContext::filter_block(local_state._conjuncts, block, block->columns())); diff --git a/be/src/pipeline/exec/set_probe_sink_operator.cpp b/be/src/pipeline/exec/set_probe_sink_operator.cpp index 81f0c986b744c2..36503df2db8a9a 100644 --- a/be/src/pipeline/exec/set_probe_sink_operator.cpp +++ b/be/src/pipeline/exec/set_probe_sink_operator.cpp @@ -106,7 +106,7 @@ Status SetProbeSinkOperatorX::sink(RuntimeState* state, vectorized SourceState source_state) { RETURN_IF_CANCELLED(state); auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); if (_cur_child_id > 1) { @@ -141,7 +141,7 @@ Status SetProbeSinkOperatorX::sink(RuntimeState* state, vectorized template Status SetProbeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); Parent& parent = _parent->cast(); static_cast(_dependency)->set_cur_child_id(parent._cur_child_id); diff --git a/be/src/pipeline/exec/set_sink_operator.cpp b/be/src/pipeline/exec/set_sink_operator.cpp index 07205e6c8a29be..90cc792d471c62 100644 --- a/be/src/pipeline/exec/set_sink_operator.cpp +++ b/be/src/pipeline/exec/set_sink_operator.cpp @@ -56,7 +56,7 @@ Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Blo RETURN_IF_CANCELLED(state); auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); auto& mem_used = local_state._shared_state->mem_used; @@ -160,7 +160,7 @@ Status SetSinkOperatorX::_extract_build_column( template Status SetSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); _build_timer = ADD_TIMER(_profile, "BuildTime"); diff --git a/be/src/pipeline/exec/set_source_operator.cpp b/be/src/pipeline/exec/set_source_operator.cpp index e679274d92c6ea..d3840285c38d36 100644 --- a/be/src/pipeline/exec/set_source_operator.cpp +++ b/be/src/pipeline/exec/set_source_operator.cpp @@ -63,7 +63,7 @@ Status SetSourceLocalState::init(RuntimeState* state, LocalStateIn template Status SetSourceLocalState::open(RuntimeState* state) { - SCOPED_TIMER(profile()->total_time_counter()); + 
SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(PipelineXLocalState::open(state)); auto& child_exprs_lists = _shared_state->child_exprs_lists; @@ -94,7 +94,7 @@ Status SetSourceOperatorX::get_block(RuntimeState* state, vectoriz SourceState& source_state) { RETURN_IF_CANCELLED(state); auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); _create_mutable_cols(local_state, block); auto st = std::visit( [&](auto&& arg) -> Status { diff --git a/be/src/pipeline/exec/sort_sink_operator.cpp b/be/src/pipeline/exec/sort_sink_operator.cpp index cc41fbc4bff132..5471be54e5585c 100644 --- a/be/src/pipeline/exec/sort_sink_operator.cpp +++ b/be/src/pipeline/exec/sort_sink_operator.cpp @@ -30,7 +30,7 @@ OPERATOR_CODE_GENERATOR(SortSinkOperator, StreamingOperator) Status SortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); auto& p = _parent->cast(); @@ -61,7 +61,6 @@ Status SortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { _shared_state->sorter->init_profile(_profile); - SCOPED_TIMER(_profile->total_time_counter()); _profile->add_info_string("TOP-N", p._limit == -1 ? "false" : "true"); _memory_usage_counter = ADD_LABEL_COUNTER(_profile, "MemoryUsage"); @@ -139,7 +138,7 @@ Status SortSinkOperatorX::open(RuntimeState* state) { Status SortSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block* in_block, SourceState source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); if (in_block->rows() > 0) { RETURN_IF_ERROR(local_state._shared_state->sorter->append_block(in_block)); diff --git a/be/src/pipeline/exec/sort_source_operator.cpp b/be/src/pipeline/exec/sort_source_operator.cpp index 6732094272aaf4..e13e518f768c46 100644 --- a/be/src/pipeline/exec/sort_source_operator.cpp +++ b/be/src/pipeline/exec/sort_source_operator.cpp @@ -35,7 +35,7 @@ SortSourceOperatorX::SortSourceOperatorX(ObjectPool* pool, const TPlanNode& tnod Status SortSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, SourceState& source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); bool eos = false; RETURN_IF_ERROR_OR_CATCH_EXCEPTION( local_state._shared_state->sorter->get_next(state, block, &eos)); diff --git a/be/src/pipeline/exec/streaming_aggregation_sink_operator.cpp b/be/src/pipeline/exec/streaming_aggregation_sink_operator.cpp index 3d246ec9d9c6f1..3052222e2475cb 100644 --- a/be/src/pipeline/exec/streaming_aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/streaming_aggregation_sink_operator.cpp @@ -21,7 +21,6 @@ #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "pipeline/exec/data_queue.h" #include "pipeline/exec/operator.h" @@ -354,7 +353,7 @@ Status StreamingAggSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* sta Status StreamingAggSinkOperatorX::sink(RuntimeState* state, vectorized::Block* in_block, SourceState source_state) { auto& local_state = get_local_state(state); - 
SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); local_state._shared_state->input_num_rows += in_block->rows(); Status ret = Status::OK(); diff --git a/be/src/pipeline/exec/streaming_aggregation_source_operator.cpp b/be/src/pipeline/exec/streaming_aggregation_source_operator.cpp index 45d2e1902f6dd7..353f997ce76629 100644 --- a/be/src/pipeline/exec/streaming_aggregation_source_operator.cpp +++ b/be/src/pipeline/exec/streaming_aggregation_source_operator.cpp @@ -81,7 +81,7 @@ StreamingAggSourceOperatorX::StreamingAggSourceOperatorX(ObjectPool* pool, const Status StreamingAggSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, SourceState& source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); if (!local_state._shared_state->data_queue->data_exhausted()) { std::unique_ptr agg_block; DCHECK(local_state._dependency->read_blocked_by() == nullptr); diff --git a/be/src/pipeline/exec/union_sink_operator.cpp b/be/src/pipeline/exec/union_sink_operator.cpp index 9b07fb6a83595d..2a235123bdaf15 100644 --- a/be/src/pipeline/exec/union_sink_operator.cpp +++ b/be/src/pipeline/exec/union_sink_operator.cpp @@ -19,7 +19,6 @@ #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "pipeline/exec/data_queue.h" @@ -96,7 +95,7 @@ Status UnionSinkOperator::close(RuntimeState* state) { Status UnionSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); auto& p = _parent->cast(); _child_expr.resize(p._child_expr.size()); @@ -146,7 +145,7 @@ Status UnionSinkOperatorX::open(RuntimeState* state) { Status UnionSinkOperatorX::sink(RuntimeState* state, vectorized::Block* in_block, SourceState source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); if (local_state._output_block == nullptr) { local_state._output_block = diff --git a/be/src/pipeline/exec/union_source_operator.cpp b/be/src/pipeline/exec/union_source_operator.cpp index 1f27ab920c8547..8e5179734cb071 100644 --- a/be/src/pipeline/exec/union_source_operator.cpp +++ b/be/src/pipeline/exec/union_source_operator.cpp @@ -119,7 +119,7 @@ Status UnionSourceLocalState::init(RuntimeState* state, LocalStateInfo& info) { } RETURN_IF_ERROR(Base::init(state, info)); ss->data_queue.set_dependency(_dependency); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); // Const exprs materialized by this node. These exprs don't refer to any children. // Only materialized by the first fragment instance to avoid duplication. 
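The dominant change across the operator hunks above is replacing SCOPED_TIMER(profile()->total_time_counter()) with SCOPED_TIMER(exec_time_counter()), backed by a per-operator "ExecTime" counter that this diff registers in operator.cpp via ADD_TIMER_WITH_LEVEL(_runtime_profile, "ExecTime", 1). The sketch below is a minimal standalone model of that accounting pattern, not Doris's actual RuntimeProfile API; Counter, ScopedTimer, and LocalState are simplified stand-ins.

#include <atomic>
#include <chrono>
#include <cstdint>
#include <iostream>

// Simplified stand-ins for RuntimeProfile::Counter and SCOPED_TIMER; the real
// Doris types carry units, levels, and a profile tree, which this sketch omits.
struct Counter {
    std::atomic<int64_t> ns {0};
};

class ScopedTimer {
public:
    explicit ScopedTimer(Counter* c) : _c(c), _start(std::chrono::steady_clock::now()) {}
    ~ScopedTimer() {
        _c->ns += std::chrono::duration_cast<std::chrono::nanoseconds>(
                          std::chrono::steady_clock::now() - _start)
                          .count();
    }

private:
    Counter* _c;
    std::chrono::steady_clock::time_point _start;
};

struct LocalState {
    Counter exec_time; // the "ExecTime" counter owned by one operator's local state
    Counter* exec_time_counter() { return &exec_time; }
};

// Every entry point (init/open/sink/get_block/close) opens a scope on the same
// per-operator counter, so ExecTime sums only this operator's own wall time.
void sink_one_block(LocalState& local_state) {
    ScopedTimer timer(local_state.exec_time_counter());
    // ... the operator's real work would run here ...
}

int main() {
    LocalState state;
    sink_one_block(state);
    std::cout << "ExecTime(ns)=" << state.exec_time.ns << std::endl;
}

Because each operator scopes only its own entry points, the per-operator numbers no longer double-count time spent waiting on dependencies, which is why the close() hunks above also move the dependency wait times onto the same counter explicitly.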
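A second recurring change, visible in the dependency.h hunks further down, replaces the raw `elapsed_time > SLOW_DEPENDENCY_THRESHOLD` checks with a `_should_log` helper that also rate-limits repeated "Dependency may be blocked" warnings. Here is a self-contained sketch of the same throttling rule; LogThrottle and the simulated elapsed times are illustrative, only the two constants and the comparison logic come from the diff.

#include <cstdint>
#include <iostream>

// Stay quiet until a dependency has been blocked for SLOW_DEPENDENCY_THRESHOLD,
// then warn at most once per TIME_UNIT_DEPENDENCY_LOG. Values are nanoseconds.
static constexpr int64_t SLOW_DEPENDENCY_THRESHOLD = 10LL * 1000 * 1000 * 1000;
static constexpr int64_t TIME_UNIT_DEPENDENCY_LOG = 5LL * 1000 * 1000 * 1000;
static_assert(TIME_UNIT_DEPENDENCY_LOG < SLOW_DEPENDENCY_THRESHOLD);

struct LogThrottle {
    int64_t last_log_time = 0;

    bool should_log(int64_t cur_time) {
        if (cur_time < SLOW_DEPENDENCY_THRESHOLD) {
            return false;
        }
        if ((cur_time - last_log_time) < TIME_UNIT_DEPENDENCY_LOG) {
            return false;
        }
        last_log_time = cur_time;
        return true;
    }
};

int main() {
    LogThrottle throttle;
    // Simulated elapsed block times for one dependency, in seconds.
    for (int64_t s : {1LL, 11LL, 12LL, 17LL}) {
        if (throttle.should_log(s * 1000 * 1000 * 1000)) {
            std::cout << "dependency blocked for " << s << "s" << std::endl;
        }
    }
    // Prints at 11s and 17s; the 12s probe is suppressed by the 5s interval.
}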
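The FilterDependency-to-RuntimeFilterDependency rework later in this diff drops the `_runtime_filters_are_ready_or_timeout` callback in favor of per-filter RuntimeFilterTimer objects drained by a background RuntimeFilterTimerQueue thread. The sketch below models that lifecycle (register, poll, mark ready or time out, unblock the dependency) under stated assumptions: FilterDep and FilterTimer are stubs standing in for RuntimeFilterDependency and IRuntimeFilter, the 100 ms wait budget is invented, and only the 50 ms poll interval mirrors the `interval` constant in the diff.

#include <atomic>
#include <chrono>
#include <iostream>
#include <list>
#include <memory>
#include <thread>

// Stub for the dependency side: once every registered filter has reported
// ready or timed out, the task is no longer blocked by runtime filters.
struct FilterDep {
    std::atomic<int> pending {0};
    std::atomic<bool> blocked_by_rf {true};
    void sub_filter() {
        if (--pending == 0) {
            blocked_by_rf = false;
        }
    }
};

// Stub for RuntimeFilterTimer: resolved either by the filter arriving (ready)
// or by the wait budget expiring, whichever the poll loop observes first.
struct FilterTimer {
    std::shared_ptr<FilterDep> parent;
    std::chrono::steady_clock::time_point registration;
    std::chrono::milliseconds wait_time {0};
    std::atomic<bool> ready {false};
    bool fired = false; // only touched by the polling thread

    bool poll() { // returns true once resolved, so the queue can drop it
        if (fired) {
            return true;
        }
        if (ready || std::chrono::steady_clock::now() - registration >= wait_time) {
            fired = true;
            parent->sub_filter();
        }
        return fired;
    }
};

int main() {
    auto dep = std::make_shared<FilterDep>();
    dep->pending = 1;

    auto timer = std::make_shared<FilterTimer>();
    timer->parent = dep;
    timer->registration = std::chrono::steady_clock::now();
    timer->wait_time = std::chrono::milliseconds(100); // assumed RF wait budget

    std::list<std::shared_ptr<FilterTimer>> que {timer};
    std::thread poller([&que] {
        while (!que.empty()) {
            que.remove_if([](const std::shared_ptr<FilterTimer>& t) { return t->poll(); });
            // 50 ms poll interval, mirroring `interval` in RuntimeFilterTimerQueue.
            std::this_thread::sleep_for(std::chrono::milliseconds(50));
        }
    });
    poller.join();
    std::cout << "blocked_by_rf=" << dep->blocked_by_rf << std::endl; // 0: unblocked
}

The polling design trades a small fixed latency (one 50 ms tick) for not needing a wakeup path from every runtime filter producer, which is the same trade the diff makes.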
@@ -152,7 +152,7 @@ std::shared_ptr UnionSourceLocalState::create_shared_state() { Status UnionSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, SourceState& source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); if (local_state._need_read_for_const_expr) { if (has_more_const(state)) { static_cast(get_next_const(state, block)); diff --git a/be/src/pipeline/pipeline_fragment_context.cpp b/be/src/pipeline/pipeline_fragment_context.cpp index 4616ff2fb49995..9610122bc026c4 100644 --- a/be/src/pipeline/pipeline_fragment_context.cpp +++ b/be/src/pipeline/pipeline_fragment_context.cpp @@ -21,10 +21,6 @@ #include #include #include -#include -#include -#include -#include #include #include // IWYU pragma: no_include @@ -99,7 +95,6 @@ #include "task_scheduler.h" #include "util/container_util.hpp" #include "util/debug_util.h" -#include "util/telemetry/telemetry.h" #include "util/uid_util.h" #include "vec/common/assert_cast.h" #include "vec/exec/join/vhash_join_node.h" @@ -211,22 +206,20 @@ Status PipelineFragmentContext::prepare(const doris::TPipelineFragmentParams& re SCOPED_TIMER(_prepare_timer); auto* fragment_context = this; - OpentelemetryTracer tracer = telemetry::get_noop_tracer(); - if (opentelemetry::trace::Tracer::GetCurrentSpan()->GetContext().IsValid()) { - tracer = telemetry::get_tracer(print_id(_query_id)); - } LOG_INFO("Preparing instance {}, backend_num {}", PrintInstanceStandardInfo(_query_id, local_params.fragment_instance_id), local_params.backend_num); // 1. init _runtime_state - _runtime_state = RuntimeState::create_unique(local_params, request.query_id, - request.fragment_id, request.query_options, - _query_ctx->query_globals, _exec_env); + _runtime_state = RuntimeState::create_unique( + local_params.fragment_instance_id, request.query_id, request.fragment_id, + request.query_options, _query_ctx->query_globals, _exec_env); + if (local_params.__isset.runtime_filter_params) { + _runtime_state->set_runtime_filter_params(local_params.runtime_filter_params); + } _runtime_state->set_query_ctx(_query_ctx.get()); _runtime_state->set_query_mem_tracker(_query_ctx->query_mem_tracker); - _runtime_state->set_tracer(std::move(tracer)); // TODO should be combined with plan_fragment_executor.prepare function SCOPED_ATTACH_TASK(_runtime_state.get()); @@ -315,6 +308,9 @@ Status PipelineFragmentContext::prepare(const doris::TPipelineFragmentParams& re _runtime_state->set_per_fragment_instance_idx(local_params.sender_id); _runtime_state->set_num_per_fragment_instances(request.num_senders); + _runtime_state->set_load_stream_per_node(request.load_stream_per_node); + _runtime_state->set_total_load_streams(request.total_load_streams); + _runtime_state->set_num_local_sink(request.num_local_sink); if (request.fragment.__isset.output_sink) { RETURN_IF_ERROR_OR_CATCH_EXCEPTION(DataSink::create_data_sink( diff --git a/be/src/pipeline/pipeline_task.cpp b/be/src/pipeline/pipeline_task.cpp index 111259062cf88a..72552b484686ca 100644 --- a/be/src/pipeline/pipeline_task.cpp +++ b/be/src/pipeline/pipeline_task.cpp @@ -256,6 +256,18 @@ Status PipelineTask::execute(bool* eos) { } } + auto status = Status::OK(); + auto handle_group_commit = [&]() { + if (UNLIKELY(_fragment_context->is_group_commit() && !status.ok() && _block != nullptr)) { + auto* future_block = dynamic_cast(_block.get()); + std::unique_lock l(*(future_block->lock)); + if (!future_block->is_handled()) {
future_block->set_result(status, 0, 0); + future_block->cv->notify_all(); + } + } + }; + this->set_begin_execute_time(); while (!_fragment_context->is_canceled()) { if (_data_state != SourceState::MORE_DATA && !source_can_read()) { @@ -279,7 +291,11 @@ Status PipelineTask::execute(bool* eos) { { SCOPED_TIMER(_get_block_timer); _get_block_counter->update(1); - RETURN_IF_ERROR(_root->get_block(_state, block, _data_state)); + status = _root->get_block(_state, block, _data_state); + if (UNLIKELY(!status.ok())) { + handle_group_commit(); + return status; + } } *eos = _data_state == SourceState::FINISHED; @@ -289,17 +305,8 @@ Status PipelineTask::execute(bool* eos) { _collect_query_statistics_with_every_batch) { RETURN_IF_ERROR(_collect_query_statistics()); } - auto status = _sink->sink(_state, block, _data_state); - if (UNLIKELY(!status.ok() || block->rows() == 0)) { - if (_fragment_context->is_group_commit()) { - auto* future_block = dynamic_cast(block); - std::unique_lock l(*(future_block->lock)); - if (!future_block->is_handled()) { - future_block->set_result(status, 0, 0); - future_block->cv->notify_all(); - } - } - } + status = _sink->sink(_state, block, _data_state); + handle_group_commit(); if (!status.is()) { RETURN_IF_ERROR(status); } diff --git a/be/src/pipeline/pipeline_task.h b/be/src/pipeline/pipeline_task.h index ef923868a51017..690c4e3419d9f4 100644 --- a/be/src/pipeline/pipeline_task.h +++ b/be/src/pipeline/pipeline_task.h @@ -249,6 +249,8 @@ class PipelineTask { TUniqueId instance_id() const { return _state->fragment_instance_id(); } + void set_parent_profile(RuntimeProfile* profile) { _parent_profile = profile; } + protected: void _finish_p_dependency() { for (const auto& p : _pipeline->_parents) { diff --git a/be/src/pipeline/pipeline_x/dependency.cpp b/be/src/pipeline/pipeline_x/dependency.cpp index d56679f32a24b8..32bd06f5983728 100644 --- a/be/src/pipeline/pipeline_x/dependency.cpp +++ b/be/src/pipeline/pipeline_x/dependency.cpp @@ -17,6 +17,10 @@ #include "dependency.h" +#include +#include + +#include "common/logging.h" #include "runtime/memory/mem_tracker.h" namespace doris::pipeline { @@ -326,4 +330,113 @@ Status HashJoinDependency::extract_join_column(vectorized::Block& block, return Status::OK(); } +bool RuntimeFilterTimer::has_ready() { + std::unique_lock lc(_lock); + return _runtime_filter->is_ready(); +} + +void RuntimeFilterTimer::call_timeout() { + std::unique_lock lc(_lock); + if (_call_ready) { + return; + } + _call_timeout = true; + if (_parent) { + _parent->sub_filters(); + } +} + +void RuntimeFilterTimer::call_ready() { + std::unique_lock lc(_lock); + if (_call_timeout) { + return; + } + _call_ready = true; + if (_parent) { + _parent->sub_filters(); + } +} + +void RuntimeFilterTimer::call_has_ready() { + std::unique_lock lc(_lock); + DCHECK(!_call_timeout); + if (!_call_ready) { + _parent->sub_filters(); + } +} + +void RuntimeFilterTimer::call_has_release() { + // When the use count is equal to 1, only the timer queue still holds ownership, + // so there is no need to take any action. 
+} + +struct RuntimeFilterTimerQueue { + constexpr static int64_t interval = 50; + void start() { + while (true) { + std::unique_lock lk(cv_m); + + cv.wait(lk, [this] { return !_que.empty(); }); + { + std::unique_lock lc(_que_lock); + std::list> new_que; + for (auto& it : _que) { + if (it.use_count() == 1) { + it->call_has_release(); + } else if (it->has_ready()) { + it->call_has_ready(); + } else { + int64_t ms_since_registration = MonotonicMillis() - it->registration_time(); + if (ms_since_registration > it->wait_time_ms()) { + it->call_timeout(); + } else { + new_que.push_back(std::move(it)); + } + } + } + new_que.swap(_que); + } + std::this_thread::sleep_for(std::chrono::milliseconds(interval)); + } + } + ~RuntimeFilterTimerQueue() { _thread.detach(); } + RuntimeFilterTimerQueue() { _thread = std::thread(&RuntimeFilterTimerQueue::start, this); } + static void push_filter_timer(std::shared_ptr filter) { + static RuntimeFilterTimerQueue timer_que; + + timer_que.push(filter); + } + + void push(std::shared_ptr filter) { + std::unique_lock lc(_que_lock); + _que.push_back(filter); + cv.notify_all(); + } + + std::thread _thread; + std::condition_variable cv; + std::mutex cv_m; + std::mutex _que_lock; + + std::list> _que; +}; + +void RuntimeFilterDependency::add_filters(IRuntimeFilter* runtime_filter) { + _filters++; + int64_t registration_time = runtime_filter->registration_time(); + int32 wait_time_ms = runtime_filter->wait_time_ms(); + auto filter_timer = std::make_shared( + registration_time, wait_time_ms, + std::dynamic_pointer_cast(shared_from_this()), runtime_filter); + runtime_filter->set_filter_timer(filter_timer); + RuntimeFilterTimerQueue::push_filter_timer(filter_timer); +} + +void RuntimeFilterDependency::sub_filters() { + _filters--; + if (_filters == 0) { + *_blocked_by_rf = false; + } +} + } // namespace doris::pipeline diff --git a/be/src/pipeline/pipeline_x/dependency.h b/be/src/pipeline/pipeline_x/dependency.h index 131198c449693a..1d575690f8e0ef 100644 --- a/be/src/pipeline/pipeline_x/dependency.h +++ b/be/src/pipeline/pipeline_x/dependency.h @@ -19,11 +19,16 @@ #include +#include #include #include #include +#include +#include +#include "common/logging.h" #include "concurrentqueue.h" +#include "gutil/integral_types.h" #include "pipeline/exec/data_queue.h" #include "pipeline/exec/multi_cast_data_streamer.h" #include "vec/common/hash_table/hash_map_context_creator.h" @@ -41,7 +46,8 @@ class Dependency; using DependencySPtr = std::shared_ptr; static constexpr auto SLOW_DEPENDENCY_THRESHOLD = 10 * 1000L * 1000L * 1000L; - +static constexpr auto TIME_UNIT_DEPENDENCY_LOG = 5 * 1000L * 1000L * 1000L; +static_assert(TIME_UNIT_DEPENDENCY_LOG < SLOW_DEPENDENCY_THRESHOLD); class Dependency : public std::enable_shared_from_this { public: Dependency(int id, std::string name) : _id(id), _name(name), _ready_for_read(false) {} @@ -68,7 +74,7 @@ class Dependency : public std::enable_shared_from_this { // Which dependency current pipeline task is blocked by. `nullptr` if this dependency is ready. 
[[nodiscard]] virtual Dependency* read_blocked_by() { if (config::enable_fuzzy_mode && !_ready_for_read && - _read_dependency_watcher.elapsed_time() > SLOW_DEPENDENCY_THRESHOLD) { + _should_log(_read_dependency_watcher.elapsed_time())) { LOG(WARNING) << "========Dependency may be blocked by some reasons: " << name() << " " << id(); } @@ -97,6 +103,17 @@ class Dependency : public std::enable_shared_from_this { void remove_first_child() { _children.erase(_children.begin()); } protected: + bool _should_log(uint64_t cur_time) { + if (cur_time < SLOW_DEPENDENCY_THRESHOLD) { + return false; + } + if ((cur_time - _last_log_time) < TIME_UNIT_DEPENDENCY_LOG) { + return false; + } + _last_log_time = cur_time; + return true; + } + int _id; std::string _name; std::atomic _ready_for_read; @@ -105,6 +122,8 @@ class Dependency : public std::enable_shared_from_this { std::weak_ptr _parent; std::list> _children; + + uint64_t _last_log_time = 0; }; class WriteDependency : public Dependency { @@ -128,7 +147,7 @@ class WriteDependency : public Dependency { [[nodiscard]] virtual WriteDependency* write_blocked_by() { if (config::enable_fuzzy_mode && !_ready_for_write && - _write_dependency_watcher.elapsed_time() > SLOW_DEPENDENCY_THRESHOLD) { + _should_log(_write_dependency_watcher.elapsed_time())) { LOG(WARNING) << "========Dependency may be blocked by some reasons: " << name() << " " << id(); } @@ -169,7 +188,7 @@ class FinishDependency final : public Dependency { [[nodiscard]] FinishDependency* finish_blocked_by() { if (config::enable_fuzzy_mode && !_ready_to_finish && - _finish_dependency_watcher.elapsed_time() > SLOW_DEPENDENCY_THRESHOLD) { + _should_log(_finish_dependency_watcher.elapsed_time())) { LOG(WARNING) << "========Dependency may be blocked by some reasons: " << name() << " " << _node_id; } @@ -194,30 +213,64 @@ class FinishDependency final : public Dependency { const int _node_id; }; -class FilterDependency final : public Dependency { +class RuntimeFilterDependency; +class RuntimeFilterTimer { +public: + RuntimeFilterTimer(int64_t registration_time, int32_t wait_time_ms, + std::shared_ptr parent, + IRuntimeFilter* runtime_filter) + : _parent(std::move(parent)), + _registration_time(registration_time), + _wait_time_ms(wait_time_ms), + _runtime_filter(runtime_filter) {} + + void call_ready(); + + void call_timeout(); + + void call_has_ready(); + + void call_has_release(); + + bool has_ready(); + + int64_t registration_time() const { return _registration_time; } + int32_t wait_time_ms() const { return _wait_time_ms; } + +private: + bool _call_ready {}; + bool _call_timeout {}; + std::shared_ptr _parent; + std::mutex _lock; + const int64_t _registration_time; + const int32_t _wait_time_ms; + IRuntimeFilter* _runtime_filter; +}; +class RuntimeFilterDependency final : public Dependency { public: - FilterDependency(int id, int node_id, std::string name) - : Dependency(id, name), - _runtime_filters_are_ready_or_timeout(nullptr), - _node_id(node_id) {} + RuntimeFilterDependency(int id, int node_id, std::string name) + : Dependency(id, name), _node_id(node_id) {} - FilterDependency* filter_blocked_by() { - if (!_runtime_filters_are_ready_or_timeout) { + RuntimeFilterDependency* filter_blocked_by() { + if (!_blocked_by_rf) { return nullptr; } - if (!_runtime_filters_are_ready_or_timeout()) { + if (*_blocked_by_rf) { return this; } return nullptr; } void* shared_state() override { return nullptr; } - void set_filter_blocked_by_fn(std::function call_fn) { - _runtime_filters_are_ready_or_timeout = call_fn; + 
void add_filters(IRuntimeFilter* runtime_filter); + void sub_filters(); + void set_blocked_by_rf(std::shared_ptr blocked_by_rf) { + _blocked_by_rf = blocked_by_rf; } protected: - std::function _runtime_filters_are_ready_or_timeout; const int _node_id; + std::atomic_int _filters; + std::shared_ptr _blocked_by_rf; }; class AndDependency final : public WriteDependency { @@ -687,7 +740,7 @@ class PartitionSortDependency final : public WriteDependency { [[nodiscard]] Dependency* read_blocked_by() override { if (config::enable_fuzzy_mode && !(_ready_for_read || _eos) && - _read_dependency_watcher.elapsed_time() > SLOW_DEPENDENCY_THRESHOLD) { + _should_log(_read_dependency_watcher.elapsed_time())) { LOG(WARNING) << "========Dependency may be blocked by some reasons: " << name() << " " << id(); } @@ -806,7 +859,7 @@ class SetDependency final : public WriteDependency { // Which dependency current pipeline task is blocked by. `nullptr` if this dependency is ready. [[nodiscard]] Dependency* read_blocked_by() override { if (config::enable_fuzzy_mode && !_set_state->ready_for_read && - _read_dependency_watcher.elapsed_time() > SLOW_DEPENDENCY_THRESHOLD) { + _should_log(_read_dependency_watcher.elapsed_time())) { LOG(WARNING) << "========Dependency may be blocked by some reasons: " << name() << " " << id(); } @@ -866,7 +919,7 @@ struct LocalExchangeDependency final : public WriteDependency { Dependency* read_blocked_by() override { if (config::enable_fuzzy_mode && !_should_run() && - _read_dependency_watcher.elapsed_time() > SLOW_DEPENDENCY_THRESHOLD) { + _should_log(_read_dependency_watcher.elapsed_time())) { LOG(WARNING) << "========Dependency may be blocked by some reasons: " << name() << " " << id(); } diff --git a/be/src/pipeline/pipeline_x/local_exchange/local_exchange_sink_operator.cpp b/be/src/pipeline/pipeline_x/local_exchange/local_exchange_sink_operator.cpp index b69150ba512386..eba75c0fc1f6cd 100644 --- a/be/src/pipeline/pipeline_x/local_exchange/local_exchange_sink_operator.cpp +++ b/be/src/pipeline/pipeline_x/local_exchange/local_exchange_sink_operator.cpp @@ -21,7 +21,7 @@ namespace doris::pipeline { Status LocalExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); _compute_hash_value_timer = ADD_TIMER(profile(), "ComputeHashValueTime"); _distribute_timer = ADD_TIMER(profile(), "DistributeDataTime"); @@ -68,7 +68,7 @@ Status LocalExchangeSinkLocalState::split_rows(RuntimeState* state, Status LocalExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* in_block, SourceState source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); { SCOPED_TIMER(local_state._compute_hash_value_timer); diff --git a/be/src/pipeline/pipeline_x/local_exchange/local_exchange_source_operator.cpp b/be/src/pipeline/pipeline_x/local_exchange/local_exchange_source_operator.cpp index 75d9b4ac471d79..127e14dba6b789 100644 --- a/be/src/pipeline/pipeline_x/local_exchange/local_exchange_source_operator.cpp +++ b/be/src/pipeline/pipeline_x/local_exchange/local_exchange_source_operator.cpp @@ -21,7 +21,7 @@ namespace doris::pipeline { Status LocalExchangeSourceLocalState::init(RuntimeState* state, LocalStateInfo& info) { 
RETURN_IF_ERROR(Base::init(state, info)); - SCOPED_TIMER(profile()->total_time_counter()); + SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); _dependency->set_shared_state(info.local_exchange_state); _shared_state = (LocalExchangeSharedState*)_dependency->shared_state(); @@ -37,7 +37,7 @@ Status LocalExchangeSourceLocalState::init(RuntimeState* state, LocalStateInfo& Status LocalExchangeSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, SourceState& source_state) { auto& local_state = get_local_state(state); - SCOPED_TIMER(local_state.profile()->total_time_counter()); + SCOPED_TIMER(local_state.exec_time_counter()); PartitionedBlock partitioned_block; std::unique_ptr mutable_block = nullptr; diff --git a/be/src/pipeline/pipeline_x/operator.cpp b/be/src/pipeline/pipeline_x/operator.cpp index c29e1b8af87749..f8430a571598fa 100644 --- a/be/src/pipeline/pipeline_x/operator.cpp +++ b/be/src/pipeline/pipeline_x/operator.cpp @@ -175,7 +175,7 @@ void PipelineXLocalStateBase::clear_origin_block() { Status OperatorXBase::do_projections(RuntimeState* state, vectorized::Block* origin_block, vectorized::Block* output_block) const { auto local_state = state->get_local_state(operator_id()); - SCOPED_TIMER(local_state->profile()->total_time_counter()); + SCOPED_TIMER(local_state->exec_time_counter()); SCOPED_TIMER(local_state->_projection_timer); using namespace vectorized; vectorized::MutableBlock mutable_block = @@ -229,9 +229,11 @@ void PipelineXLocalStateBase::reached_limit(vectorized::Block* block, SourceStat source_state = SourceState::FINISHED; } - _num_rows_returned += block->rows(); - COUNTER_UPDATE(_blocks_returned_counter, 1); - COUNTER_SET(_rows_returned_counter, _num_rows_returned); + if (auto rows = block->rows()) { + _num_rows_returned += rows; + COUNTER_UPDATE(_blocks_returned_counter, 1); + COUNTER_SET(_rows_returned_counter, _num_rows_returned); + } } std::string DataSinkOperatorXBase::debug_string(int indentation_level) const { @@ -315,7 +317,7 @@ PipelineXLocalStateBase::PipelineXLocalStateBase(RuntimeState* state, OperatorXB _state(state), _finish_dependency(new FinishDependency(parent->operator_id(), parent->node_id(), parent->get_name() + "_FINISH_DEPENDENCY")) { - _filter_dependency = std::make_unique( + _filter_dependency = std::make_shared( parent->operator_id(), parent->node_id(), parent->get_name() + "_FILTER_DEPENDENCY"); } @@ -357,6 +359,7 @@ Status PipelineXLocalState::init(RuntimeState* state, LocalState _projection_timer = ADD_TIMER_WITH_LEVEL(_runtime_profile, "ProjectionTime", 1); _open_timer = ADD_TIMER_WITH_LEVEL(_runtime_profile, "OpenTime", 1); _close_timer = ADD_TIMER_WITH_LEVEL(_runtime_profile, "CloseTime", 1); + _exec_timer = ADD_TIMER_WITH_LEVEL(_runtime_profile, "ExecTime", 1); _mem_tracker = std::make_unique("PipelineXLocalState:" + _runtime_profile->name()); _memory_used_counter = ADD_LABEL_COUNTER(_runtime_profile, "MemoryUsage"); _peak_memory_usage_counter = _runtime_profile->AddHighWaterMarkCounter( @@ -377,7 +380,6 @@ Status PipelineXLocalState::close(RuntimeState* state) { if (_rows_returned_counter != nullptr) { COUNTER_SET(_rows_returned_counter, _num_rows_returned); } - profile()->add_to_span(_span); _closed = true; return Status::OK(); } @@ -406,6 +408,7 @@ Status PipelineXSinkLocalState::init(RuntimeState* state, _rows_input_counter = ADD_COUNTER_WITH_LEVEL(_profile, "InputRows", TUnit::UNIT, 1); _open_timer = ADD_TIMER_WITH_LEVEL(_profile, "OpenTime", 1); _close_timer = ADD_TIMER_WITH_LEVEL(_profile, "CloseTime", 
1); + _exec_timer = ADD_TIMER_WITH_LEVEL(_profile, "ExecTime", 1); info.parent_profile->add_child(_profile, true, nullptr); _mem_tracker = std::make_unique(_parent->get_name()); return Status::OK(); @@ -447,11 +450,15 @@ Status StatefulOperatorX::get_block(RuntimeState* state, vectori local_state._child_source_state != SourceState::FINISHED) { return Status::OK(); } - RETURN_IF_ERROR( - push(state, local_state._child_block.get(), local_state._child_source_state)); + { + SCOPED_TIMER(local_state.exec_time_counter()); + RETURN_IF_ERROR( + push(state, local_state._child_block.get(), local_state._child_source_state)); + } } if (!need_more_input_data(state)) { + SCOPED_TIMER(local_state.exec_time_counter()); SourceState new_state = SourceState::DEPEND_ON_SOURCE; RETURN_IF_ERROR(pull(state, block, new_state)); if (new_state == SourceState::FINISHED) { diff --git a/be/src/pipeline/pipeline_x/operator.h b/be/src/pipeline/pipeline_x/operator.h index 79762e6ee194d8..265579bfaab0e0 100644 --- a/be/src/pipeline/pipeline_x/operator.h +++ b/be/src/pipeline/pipeline_x/operator.h @@ -86,7 +86,7 @@ class PipelineXLocalStateBase { MemTracker* mem_tracker() { return _mem_tracker.get(); } RuntimeProfile::Counter* rows_returned_counter() { return _rows_returned_counter; } RuntimeProfile::Counter* blocks_returned_counter() { return _blocks_returned_counter; } - + RuntimeProfile::Counter* exec_time_counter() { return _exec_timer; } OperatorXBase* parent() { return _parent; } RuntimeState* state() { return _state; } vectorized::VExprContextSPtrs& conjuncts() { return _conjuncts; } @@ -100,7 +100,7 @@ class PipelineXLocalStateBase { virtual Dependency* dependency() { return nullptr; } FinishDependency* finishdependency() { return _finish_dependency.get(); } - FilterDependency* filterdependency() { return _filter_dependency.get(); } + RuntimeFilterDependency* filterdependency() { return _filter_dependency.get(); } protected: friend class OperatorXBase; @@ -120,12 +120,12 @@ class PipelineXLocalStateBase { RuntimeProfile::Counter* _memory_used_counter; RuntimeProfile::Counter* _wait_for_finish_dependency_timer; RuntimeProfile::Counter* _projection_timer; + RuntimeProfile::Counter* _exec_timer; // Account for peak memory used by this node RuntimeProfile::Counter* _peak_memory_usage_counter; RuntimeProfile::Counter* _open_timer = nullptr; RuntimeProfile::Counter* _close_timer = nullptr; - OpentelemetrySpan _span; OperatorXBase* _parent; RuntimeState* _state; vectorized::VExprContextSPtrs _conjuncts; @@ -133,7 +133,7 @@ class PipelineXLocalStateBase { bool _closed = false; vectorized::Block _origin_block; std::shared_ptr _finish_dependency; - std::unique_ptr _filter_dependency; + std::shared_ptr _filter_dependency; }; class OperatorXBase : public OperatorBase { @@ -369,7 +369,7 @@ class PipelineXSinkLocalStateBase { } RuntimeProfile::Counter* rows_input_counter() { return _rows_input_counter; } - + RuntimeProfile::Counter* exec_time_counter() { return _exec_timer; } virtual WriteDependency* dependency() { return nullptr; } FinishDependency* finishdependency() { return _finish_dependency.get(); } @@ -396,7 +396,7 @@ class PipelineXSinkLocalStateBase { RuntimeProfile::Counter* _close_timer = nullptr; RuntimeProfile::Counter* _wait_for_dependency_timer; RuntimeProfile::Counter* _wait_for_finish_dependency_timer; - + RuntimeProfile::Counter* _exec_timer; std::shared_ptr _finish_dependency; }; diff --git a/be/src/pipeline/pipeline_x/pipeline_x_fragment_context.cpp b/be/src/pipeline/pipeline_x/pipeline_x_fragment_context.cpp 
index 5ca5829e1a8d16..200ac23504e3cb 100644 --- a/be/src/pipeline/pipeline_x/pipeline_x_fragment_context.cpp +++ b/be/src/pipeline/pipeline_x/pipeline_x_fragment_context.cpp @@ -21,10 +21,6 @@ #include #include #include -#include -#include -#include -#include #include #include #include @@ -97,7 +93,6 @@ #include "service/backend_options.h" #include "util/container_util.hpp" #include "util/debug_util.h" -#include "util/telemetry/telemetry.h" #include "util/uid_util.h" #include "vec/common/assert_cast.h" #include "vec/runtime/vdata_stream_mgr.h" @@ -166,10 +161,6 @@ Status PipelineXFragmentContext::prepare(const doris::TPipelineFragmentParams& r SCOPED_TIMER(_prepare_timer); auto* fragment_context = this; - OpentelemetryTracer tracer = telemetry::get_noop_tracer(); - if (opentelemetry::trace::Tracer::GetCurrentSpan()->GetContext().IsValid()) { - tracer = telemetry::get_tracer(print_id(_query_id)); - } LOG_INFO("PipelineXFragmentContext::prepare") .tag("query_id", print_id(_query_id)) @@ -186,7 +177,6 @@ Status PipelineXFragmentContext::prepare(const doris::TPipelineFragmentParams& r _exec_env); _runtime_state->set_query_ctx(_query_ctx.get()); _runtime_state->set_query_mem_tracker(_query_ctx->query_mem_tracker); - _runtime_state->set_tracer(std::move(tracer)); SCOPED_ATTACH_TASK(_runtime_state.get()); if (request.__isset.backend_id) { @@ -211,6 +201,9 @@ Status PipelineXFragmentContext::prepare(const doris::TPipelineFragmentParams& r } _runtime_state->set_desc_tbl(_desc_tbl); _runtime_state->set_num_per_fragment_instances(request.num_senders); + _runtime_state->set_load_stream_per_node(request.load_stream_per_node); + _runtime_state->set_total_load_streams(request.total_load_streams); + _runtime_state->set_num_local_sink(request.num_local_sink); // 2. Build pipelines with operators in this fragment. auto root_pipeline = add_pipeline(); @@ -390,12 +383,14 @@ Status PipelineXFragmentContext::_build_pipeline_tasks( for (size_t i = 0; i < target_size; i++) { const auto& local_params = request.local_params[i]; - _runtime_states[i] = RuntimeState::create_unique(local_params, request.query_id, - request.fragment_id, request.query_options, - _query_ctx->query_globals, _exec_env); + _runtime_states[i] = RuntimeState::create_unique( + local_params.fragment_instance_id, request.query_id, request.fragment_id, + request.query_options, _query_ctx->query_globals, _exec_env); + if (local_params.__isset.runtime_filter_params) { + _runtime_states[i]->set_runtime_filter_params(local_params.runtime_filter_params); + } _runtime_states[i]->set_query_ctx(_query_ctx.get()); _runtime_states[i]->set_query_mem_tracker(_query_ctx->query_mem_tracker); - _runtime_states[i]->set_tracer(_runtime_state->get_tracer()); static_cast(_runtime_states[i]->runtime_filter_mgr()->init()); _runtime_states[i]->set_be_number(local_params.backend_num); @@ -450,6 +445,29 @@ Status PipelineXFragmentContext::_build_pipeline_tasks( * and JoinProbeOperator2. */ + // First, set up the parent profile, + // then prepare the task profile and add it to operator_id_to_task_profile. 
+ std::vector operator_id_to_task_profile( + max_operator_id(), _runtime_states[i]->runtime_profile()); + auto prepare_and_set_parent_profile = [&](PipelineXTask* task) { + auto sink = task->sink(); + const auto& dests_id = sink->dests_id(); + int dest_id = dests_id.front(); + DCHECK(dest_id < operator_id_to_task_profile.size()); + task->set_parent_profile(operator_id_to_task_profile[dest_id]); + + RETURN_IF_ERROR(task->prepare(_runtime_states[i].get(), local_params, + request.fragment.output_sink)); + + for (auto o : task->operatorXs()) { + int id = o->operator_id(); + DCHECK(id < operator_id_to_task_profile.size()); + auto* op_local_state = _runtime_states[i].get()->get_local_state(o->operator_id()); + operator_id_to_task_profile[id] = op_local_state->profile(); + } + return Status::OK(); + }; + for (size_t pip_idx = 0; pip_idx < _pipelines.size(); pip_idx++) { auto task = pipeline_id_to_task[_pipelines[pip_idx]->id()]; DCHECK(task != nullptr); @@ -462,8 +480,7 @@ Status PipelineXFragmentContext::_build_pipeline_tasks( pipeline_id_to_task[dep]->get_downstream_dependency()); } } - RETURN_IF_ERROR(task->prepare(_runtime_states[i].get(), local_params, - request.fragment.output_sink)); + RETURN_IF_ERROR(prepare_and_set_parent_profile(task)); } { @@ -595,8 +612,14 @@ Status PipelineXFragmentContext::_create_operator(ObjectPool* pool, const TPlanN break; } case doris::TPlanNodeType::JDBC_SCAN_NODE: { - op.reset(new JDBCScanOperatorX(pool, tnode, next_operator_id(), descs)); - RETURN_IF_ERROR(cur_pipe->add_operator(op)); + if (config::enable_java_support) { + op.reset(new JDBCScanOperatorX(pool, tnode, next_operator_id(), descs)); + RETURN_IF_ERROR(cur_pipe->add_operator(op)); + } else { + return Status::InternalError( + "Jdbc scan node is disabled, you can change be config enable_java_support " + "to true and restart be."); + } break; } case doris::TPlanNodeType::FILE_SCAN_NODE: { diff --git a/be/src/pipeline/pipeline_x/pipeline_x_task.cpp b/be/src/pipeline/pipeline_x/pipeline_x_task.cpp index 9ad91341ce5e23..140fecbb1d5bcf 100644 --- a/be/src/pipeline/pipeline_x/pipeline_x_task.cpp +++ b/be/src/pipeline/pipeline_x/pipeline_x_task.cpp @@ -135,7 +135,7 @@ Status PipelineXTask::extract_dependencies() { void PipelineXTask::_init_profile() { std::stringstream ss; - ss << "PipelineTask" + ss << "PipelineXTask" << " (index=" << _index << ")"; auto* task_profile = new RuntimeProfile(ss.str()); _parent_profile->add_child(task_profile, true, nullptr); diff --git a/be/src/pipeline/pipeline_x/pipeline_x_task.h b/be/src/pipeline/pipeline_x/pipeline_x_task.h index 5155c2fe7667cf..90fdda921f0620 100644 --- a/be/src/pipeline/pipeline_x/pipeline_x_task.h +++ b/be/src/pipeline/pipeline_x/pipeline_x_task.h @@ -151,6 +151,12 @@ class PipelineXTask : public PipelineTask { void push_blocked_task_to_dependency(Dependency* dep) {} + DataSinkOperatorXPtr sink() const { return _sink; } + + OperatorXPtr source() const { return _source; } + + OperatorXs operatorXs() { return _operators; } + private: void set_close_pipeline_time() override {} void _init_profile() override; @@ -166,7 +172,7 @@ class PipelineXTask : public PipelineTask { std::vector _read_dependencies; WriteDependency* _write_dependencies; std::vector _finish_dependencies; - FilterDependency* _filter_dependency; + RuntimeFilterDependency* _filter_dependency; DependencyMap _upstream_dependency; diff --git a/be/src/runtime/buffer_control_block.cpp b/be/src/runtime/buffer_control_block.cpp index 5e8efda14f858f..17f90889022ae3 100644 --- 
a/be/src/runtime/buffer_control_block.cpp +++ b/be/src/runtime/buffer_control_block.cpp @@ -43,7 +43,6 @@ void GetResultBatchCtx::on_failure(const Status& status) { status.to_protobuf(result->mutable_status()); { // call by result sink - SCOPED_TRACK_MEMORY_TO_UNKNOWN(); done->Run(); } delete this; @@ -57,10 +56,7 @@ void GetResultBatchCtx::on_close(int64_t packet_seq, QueryStatistics* statistics } result->set_packet_seq(packet_seq); result->set_eos(true); - { - SCOPED_TRACK_MEMORY_TO_UNKNOWN(); - done->Run(); - } + { done->Run(); } delete this; } @@ -85,10 +81,7 @@ void GetResultBatchCtx::on_data(const std::unique_ptr& t_resul result->set_eos(eos); } st.to_protobuf(result->mutable_status()); - { - SCOPED_TRACK_MEMORY_TO_UNKNOWN(); - done->Run(); - } + { done->Run(); } delete this; } diff --git a/be/src/runtime/client_cache.h b/be/src/runtime/client_cache.h index 7ba5cafabea81a..ff45055ca6543b 100644 --- a/be/src/runtime/client_cache.h +++ b/be/src/runtime/client_cache.h @@ -187,6 +187,8 @@ class ClientConnection { Status reopen() { return _client_cache->reopen_client(&_client, 0); } + inline bool is_alive() { return _client != nullptr; } + T* operator->() const { return _client; } private: diff --git a/be/src/runtime/descriptors.h b/be/src/runtime/descriptors.h index f77f5fec3b99be..fbd233a18d1d7b 100644 --- a/be/src/runtime/descriptors.h +++ b/be/src/runtime/descriptors.h @@ -32,7 +32,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/global_types.h" #include "common/status.h" diff --git a/be/src/runtime/fold_constant_executor.cpp b/be/src/runtime/fold_constant_executor.cpp index e5fd16418d6ff7..43319507aba78f 100644 --- a/be/src/runtime/fold_constant_executor.cpp +++ b/be/src/runtime/fold_constant_executor.cpp @@ -32,7 +32,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "runtime/decimalv2_value.h" @@ -195,6 +194,11 @@ string FoldConstantExecutor::_get_result(void* src, size_t size, const TypeDescr double val = *reinterpret_cast(src); return fmt::format("{}", val); } + case TYPE_TIMEV2: { + constexpr static auto ratio_to_time = (1000 * 1000); + double val = *reinterpret_cast(src); + return fmt::format("{}", val / ratio_to_time); + } case TYPE_CHAR: case TYPE_VARCHAR: case TYPE_STRING: diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index 83b57c28fd7957..e4b703e792f47c 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -31,9 +31,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -58,7 +55,6 @@ #include "common/utils.h" #include "gutil/strings/substitute.h" #include "io/fs/stream_load_pipe.h" -#include "opentelemetry/trace/scope.h" #include "pipeline/pipeline_fragment_context.h" #include "runtime/client_cache.h" #include "runtime/descriptors.h" @@ -85,7 +81,6 @@ #include "util/network_util.h" #include "util/pretty_printer.h" #include "util/runtime_profile.h" -#include "util/telemetry/telemetry.h" #include "util/thread.h" #include "util/threadpool.h" #include "util/thrift_util.h" @@ -671,18 +666,31 @@ Status FragmentMgr::_get_query_ctx(const Params& params, TUniqueId query_id, boo LOG(INFO) << "Query/load id: " << print_id(query_ctx->query_id()) << " use task group: " << tg->debug_string() << " cpu_hard_limit: " << task_group_info.cpu_hard_limit - << " cpu_share:" << task_group_info.cpu_share; + << " cpu_share:" << 
task_group_info.cpu_share + << " enable cgroup soft cpu:" << config::enable_cgroup_cpu_soft_limit; if (task_group_info.cpu_hard_limit > 0) { Status ret = _exec_env->task_group_manager()->create_and_get_task_scheduler( - tg_id, tg_name, task_group_info.cpu_hard_limit, _exec_env, - query_ctx.get()); + tg_id, tg_name, task_group_info.cpu_hard_limit, + task_group_info.cpu_share, _exec_env, query_ctx.get()); if (!ret.ok()) { LOG(INFO) << "workload group init failed " << ", name=" << tg_name << ", id=" << tg_id << ", reason=" << ret.to_string(); } } else { - query_ctx->set_task_group(tg); + if (!config::enable_cgroup_cpu_soft_limit) { + query_ctx->set_task_group(tg); + } else { + Status ret = + _exec_env->task_group_manager()->create_and_get_task_scheduler( + tg_id, tg_name, task_group_info.cpu_hard_limit, + task_group_info.cpu_share, _exec_env, query_ctx.get()); + if (!ret.ok()) { + LOG(INFO) << "workload group cpu soft limit init failed " + << ", name=" << tg_name << ", id=" << tg_id + << ", reason=" << ret.to_string(); + } + } } } } else { @@ -712,12 +720,6 @@ Status FragmentMgr::_get_query_ctx(const Params& params, TUniqueId query_id, boo Status FragmentMgr::exec_plan_fragment(const TExecPlanFragmentParams& params, const FinishCallback& cb) { - auto tracer = telemetry::is_current_span_valid() ? telemetry::get_tracer("tracer") - : telemetry::get_noop_tracer(); - auto cur_span = opentelemetry::trace::Tracer::GetCurrentSpan(); - cur_span->SetAttribute("query_id", print_id(params.params.query_id)); - cur_span->SetAttribute("instance_id", print_id(params.params.fragment_instance_id)); - VLOG_ROW << "exec_plan_fragment params is " << apache::thrift::ThriftDebugString(params).c_str(); // sometimes TExecPlanFragmentParams debug string is too long and glog @@ -777,11 +779,7 @@ Status FragmentMgr::exec_plan_fragment(const TExecPlanFragmentParams& params, _cv.notify_all(); } auto st = _thread_pool->submit_func( - [this, fragment_executor, cb, - parent_span = opentelemetry::trace::Tracer::GetCurrentSpan()] { - OpentelemetryScope scope {parent_span}; - _exec_actual(fragment_executor, cb); - }); + [this, fragment_executor, cb] { _exec_actual(fragment_executor, cb); }); if (!st.ok()) { { // Remove the exec state added @@ -801,11 +799,6 @@ Status FragmentMgr::exec_plan_fragment(const TExecPlanFragmentParams& params, Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params, const FinishCallback& cb) { - auto tracer = telemetry::is_current_span_valid() ? 
telemetry::get_tracer("tracer") - : telemetry::get_noop_tracer(); - auto cur_span = opentelemetry::trace::Tracer::GetCurrentSpan(); - cur_span->SetAttribute("query_id", print_id(params.query_id)); - VLOG_ROW << "query: " << print_id(params.query_id) << " exec_plan_fragment params is " << apache::thrift::ThriftDebugString(params).c_str(); // sometimes TExecPlanFragmentParams debug string is too long and glog @@ -854,8 +847,6 @@ Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params, } query_ctx->fragment_instance_ids.push_back(fragment_instance_id); } - START_AND_SCOPE_SPAN(tracer, span, "exec_instance"); - span->SetAttribute("instance_id", print_id(fragment_instance_id)); if (!params.__isset.need_wait_execution_trigger || !params.need_wait_execution_trigger) { @@ -891,8 +882,6 @@ Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params, } query_ctx->fragment_instance_ids.push_back(fragment_instance_id); } - START_AND_SCOPE_SPAN(tracer, span, "exec_instance"); - span->SetAttribute("instance_id", print_id(fragment_instance_id)); int64_t duration_ns = 0; if (!params.__isset.need_wait_execution_trigger || diff --git a/be/src/runtime/group_commit_mgr.cpp b/be/src/runtime/group_commit_mgr.cpp index 1bdb5b586f05ab..77995da8d43072 100644 --- a/be/src/runtime/group_commit_mgr.cpp +++ b/be/src/runtime/group_commit_mgr.cpp @@ -17,45 +17,39 @@ #include "runtime/group_commit_mgr.h" -#include -#include -#include -#include #include #include "client_cache.h" -#include "common/object_pool.h" -#include "exec/data_sink.h" -#include "io/fs/stream_load_pipe.h" +#include "common/config.h" #include "olap/wal_manager.h" #include "runtime/exec_env.h" #include "runtime/fragment_mgr.h" #include "runtime/runtime_state.h" -#include "runtime/stream_load/new_load_stream_mgr.h" -#include "runtime/stream_load/stream_load_context.h" #include "util/thrift_rpc_helper.h" -#include "vec/exec/scan/new_file_scan_node.h" -#include "vec/sink/group_commit_block_sink.h" namespace doris { -class TPlan; - Status LoadBlockQueue::add_block(std::shared_ptr block) { DCHECK(block->get_schema_version() == schema_version); - std::unique_lock l(*_mutex); + std::unique_lock l(mutex); RETURN_IF_ERROR(_status); + while (*_all_block_queues_bytes > config::group_commit_max_queue_size) { + _put_cond.wait_for( + l, std::chrono::milliseconds(LoadBlockQueue::MAX_BLOCK_QUEUE_ADD_WAIT_TIME)); + } if (block->rows() > 0) { _block_queue.push_back(block); + *_all_block_queues_bytes += block->bytes(); + *_single_block_queue_bytes += block->bytes(); } - _cv->notify_one(); + _get_cond.notify_all(); return Status::OK(); } Status LoadBlockQueue::get_block(vectorized::Block* block, bool* find_block, bool* eos) { *find_block = false; *eos = false; - std::unique_lock l(*_mutex); + std::unique_lock l(mutex); if (!need_commit) { auto left_milliseconds = config::group_commit_interval_ms - std::chrono::duration_cast( @@ -67,6 +61,7 @@ Status LoadBlockQueue::get_block(vectorized::Block* block, bool* find_block, boo } while (_status.ok() && _block_queue.empty() && (!need_commit || (need_commit && !_load_ids.empty()))) { + CHECK(*_single_block_queue_bytes == 0); auto left_milliseconds = config::group_commit_interval_ms; if (!need_commit) { left_milliseconds = config::group_commit_interval_ms - @@ -79,9 +74,9 @@ Status LoadBlockQueue::get_block(vectorized::Block* block, bool* find_block, boo } } #if !defined(USE_BTHREAD_SCANNER) - _cv->wait_for(l, std::chrono::milliseconds(left_milliseconds)); + _get_cond.wait_for(l, 
std::chrono::milliseconds(left_milliseconds)); #else - _cv->wait_for(l, left_milliseconds * 1000); + _get_cond.wait_for(l, left_milliseconds * 1000); #endif } if (!_block_queue.empty()) { @@ -90,25 +85,29 @@ Status LoadBlockQueue::get_block(vectorized::Block* block, bool* find_block, boo fblock->swap_future_block(future_block); *find_block = true; _block_queue.pop_front(); + *_all_block_queues_bytes -= fblock->bytes(); + *_single_block_queue_bytes -= block->bytes(); } if (_block_queue.empty() && need_commit && _load_ids.empty()) { + CHECK(*_single_block_queue_bytes == 0); *eos = true; } else { *eos = false; } + _put_cond.notify_all(); return Status::OK(); } void LoadBlockQueue::remove_load_id(const UniqueId& load_id) { - std::unique_lock l(*_mutex); + std::unique_lock l(mutex); if (_load_ids.find(load_id) != _load_ids.end()) { _load_ids.erase(load_id); - _cv->notify_one(); + _get_cond.notify_all(); } } Status LoadBlockQueue::add_load_id(const UniqueId& load_id) { - std::unique_lock l(*_mutex); + std::unique_lock l(mutex); if (need_commit) { return Status::InternalError("block queue is set need commit, id=" + load_instance_id.to_string()); @@ -119,13 +118,15 @@ Status LoadBlockQueue::add_load_id(const UniqueId& load_id) { void LoadBlockQueue::cancel(const Status& st) { DCHECK(!st.ok()); - std::unique_lock l(*_mutex); + std::unique_lock l(mutex); _status = st; while (!_block_queue.empty()) { { auto& future_block = _block_queue.front(); std::unique_lock l0(*(future_block->lock)); future_block->set_result(st, future_block->rows(), 0); + *_all_block_queues_bytes -= future_block->bytes(); + *_single_block_queue_bytes -= future_block->bytes(); future_block->cv->notify_all(); } _block_queue.pop_front(); @@ -185,7 +186,7 @@ Status GroupCommitTable::get_first_block_load_queue( Status GroupCommitTable::_create_group_commit_load( std::shared_ptr& load_block_queue) { Status st = Status::OK(); - std::unique_ptr> remove_pipe_func((int*)0x01, [&](int*) { + std::unique_ptr> finish_plan_func((int*)0x01, [&](int*) { if (!st.ok()) { std::unique_lock l(_lock); _need_plan_fragment = false; @@ -248,8 +249,9 @@ Status GroupCommitTable::_create_group_commit_load( << ", txn_id=" << txn_id << ", instance_id=" << print_id(instance_id) << ", is_pipeline=" << is_pipeline; { - load_block_queue = - std::make_shared(instance_id, label, txn_id, schema_version); + load_block_queue = std::make_shared( + instance_id, label, txn_id, schema_version, _all_block_queues_bytes, + result.wait_internal_group_commit_finish); std::unique_lock l(_lock); _load_block_queues.emplace(instance_id, load_block_queue); _need_plan_fragment = false; @@ -268,16 +270,6 @@ Status GroupCommitTable::_finish_group_commit_load(int64_t db_id, int64_t table_ const std::string& label, int64_t txn_id, const TUniqueId& instance_id, Status& status, bool prepare_failed, RuntimeState* state) { - { - std::lock_guard l(_lock); - if (prepare_failed || !status.ok()) { - auto it = _load_block_queues.find(instance_id); - if (it != _load_block_queues.end()) { - it->second->cancel(status); - } - } - _load_block_queues.erase(instance_id); - } Status st; Status result_status; if (status.ok()) { @@ -316,6 +308,21 @@ Status GroupCommitTable::_finish_group_commit_load(int64_t db_id, int64_t table_ 10000L); result_status = Status::create(result.status); } + { + std::lock_guard l(_lock); + auto it = _load_block_queues.find(instance_id); + if (it != _load_block_queues.end()) { + auto& load_block_queue = it->second; + if (prepare_failed || !status.ok()) { + 
load_block_queue->cancel(status); + } + if (load_block_queue->wait_internal_group_commit_finish) { + std::unique_lock l2(load_block_queue->mutex); + load_block_queue->internal_group_commit_finish_cv.notify_all(); + } + } + _load_block_queues.erase(instance_id); + } if (!st.ok()) { LOG(WARNING) << "request finish error, db_id=" << db_id << ", table_id=" << table_id << ", label=" << label << ", txn_id=" << txn_id @@ -390,14 +397,11 @@ Status GroupCommitTable::get_load_block_queue(const TUniqueId& instance_id, } GroupCommitMgr::GroupCommitMgr(ExecEnv* exec_env) : _exec_env(exec_env) { - static_cast(ThreadPoolBuilder("InsertIntoGroupCommitThreadPool") - .set_min_threads(config::group_commit_insert_threads) - .set_max_threads(config::group_commit_insert_threads) - .build(&_insert_into_thread_pool)); static_cast(ThreadPoolBuilder("GroupCommitThreadPool") .set_min_threads(1) .set_max_threads(config::group_commit_insert_threads) .build(&_thread_pool)); + _all_block_queues_bytes = std::make_shared(0); } GroupCommitMgr::~GroupCommitMgr() { @@ -405,129 +409,10 @@ GroupCommitMgr::~GroupCommitMgr() { } void GroupCommitMgr::stop() { - _insert_into_thread_pool->shutdown(); _thread_pool->shutdown(); LOG(INFO) << "GroupCommitMgr is stopped"; } -Status GroupCommitMgr::group_commit_insert(int64_t table_id, const TPlan& plan, - const TDescriptorTable& tdesc_tbl, - const TScanRangeParams& scan_range_params, - const PGroupCommitInsertRequest* request, - PGroupCommitInsertResponse* response) { - auto& nodes = plan.nodes; - DCHECK(nodes.size() > 0); - auto& plan_node = nodes.at(0); - - TUniqueId load_id; - load_id.__set_hi(request->load_id().hi()); - load_id.__set_lo(request->load_id().lo()); - - std::vector> future_blocks; - { - // 1. Prepare a pipe, then append rows to pipe, - // then scan node scans from the pipe, like stream load. - std::shared_ptr load_block_queue; - auto pipe = std::make_shared( - io::kMaxPipeBufferedBytes /* max_buffered_bytes */, 64 * 1024 /* min_chunk_size */, - -1 /* total_length */, true /* use_proto */); - std::shared_ptr ctx = std::make_shared(_exec_env); - ctx->pipe = pipe; - RETURN_IF_ERROR(_exec_env->new_load_stream_mgr()->put(load_id, ctx)); - std::unique_ptr> remove_pipe_func((int*)0x01, [&](int*) { - if (load_block_queue != nullptr) { - load_block_queue->remove_load_id(load_id); - } - _exec_env->new_load_stream_mgr()->remove(load_id); - }); - static_cast(_insert_into_thread_pool->submit_func( - std::bind(&GroupCommitMgr::_append_row, this, pipe, request))); - - // 2. FileScanNode consumes data from the pipe. 
- std::unique_ptr runtime_state = RuntimeState::create_unique(); - TQueryOptions query_options; - query_options.query_type = TQueryType::LOAD; - TQueryGlobals query_globals; - static_cast(runtime_state->init(load_id, query_options, query_globals, _exec_env)); - runtime_state->set_query_mem_tracker(std::make_shared( - MemTrackerLimiter::Type::LOAD, fmt::format("Load#Id={}", print_id(load_id)), -1)); - DescriptorTbl* desc_tbl = nullptr; - RETURN_IF_ERROR(DescriptorTbl::create(runtime_state->obj_pool(), tdesc_tbl, &desc_tbl)); - runtime_state->set_desc_tbl(desc_tbl); - auto file_scan_node = - vectorized::NewFileScanNode(runtime_state->obj_pool(), plan_node, *desc_tbl); - std::unique_ptr> close_scan_node_func((int*)0x01, [&](int*) { - static_cast(file_scan_node.close(runtime_state.get())); - }); - // TFileFormatType::FORMAT_PROTO, TFileType::FILE_STREAM, set _range.load_id - RETURN_IF_ERROR(file_scan_node.init(plan_node, runtime_state.get())); - RETURN_IF_ERROR(file_scan_node.prepare(runtime_state.get())); - std::vector params_vector; - params_vector.emplace_back(scan_range_params); - file_scan_node.set_scan_ranges(runtime_state.get(), params_vector); - RETURN_IF_ERROR(file_scan_node.open(runtime_state.get())); - - // 3. Put the block into block queue. - std::unique_ptr _block = - doris::vectorized::Block::create_unique(); - bool eof = false; - while (!eof) { - // TODO what to do if read one block error - RETURN_IF_ERROR(file_scan_node.get_next(runtime_state.get(), _block.get(), &eof)); - std::shared_ptr future_block = - std::make_shared(); - future_block->swap(*(_block.get())); - future_block->set_info(request->base_schema_version(), load_id); - if (load_block_queue == nullptr) { - RETURN_IF_ERROR(get_first_block_load_queue(request->db_id(), table_id, future_block, - load_block_queue)); - response->set_label(load_block_queue->label); - response->set_txn_id(load_block_queue->txn_id); - } - // TODO what to do if add one block error - if (future_block->rows() > 0) { - future_blocks.emplace_back(future_block); - } - RETURN_IF_ERROR(load_block_queue->add_block(future_block)); - } - if (!runtime_state->get_error_log_file_path().empty()) { - LOG(INFO) << "id=" << print_id(load_id) - << ", url=" << runtime_state->get_error_log_file_path() - << ", load rows=" << runtime_state->num_rows_load_total() - << ", filter rows=" << runtime_state->num_rows_load_filtered() - << ", unselect rows=" << runtime_state->num_rows_load_unselected() - << ", success rows=" << runtime_state->num_rows_load_success(); - } - } - int64_t total_rows = 0; - int64_t loaded_rows = 0; - // 4. 
wait to wal - for (const auto& future_block : future_blocks) { - std::unique_lock l(*(future_block->lock)); - if (!future_block->is_handled()) { - future_block->cv->wait(l); - } - // future_block->get_status() - total_rows += future_block->get_total_rows(); - loaded_rows += future_block->get_loaded_rows(); - } - response->set_loaded_rows(loaded_rows); - response->set_filtered_rows(total_rows - loaded_rows); - return Status::OK(); -} - -Status GroupCommitMgr::_append_row(std::shared_ptr pipe, - const PGroupCommitInsertRequest* request) { - for (int i = 0; i < request->data().size(); ++i) { - std::unique_ptr row(new PDataRow()); - row->CopyFrom(request->data(i)); - // TODO append may error when pipe is cancelled - RETURN_IF_ERROR(pipe->append(std::move(row))); - } - static_cast(pipe->finish()); - return Status::OK(); -} - Status GroupCommitMgr::get_first_block_load_queue( int64_t db_id, int64_t table_id, std::shared_ptr block, std::shared_ptr& load_block_queue) { @@ -536,7 +421,8 @@ Status GroupCommitMgr::get_first_block_load_queue( std::lock_guard wlock(_lock); if (_table_map.find(table_id) == _table_map.end()) { _table_map.emplace(table_id, std::make_shared( - _exec_env, _thread_pool.get(), db_id, table_id)); + _exec_env, _thread_pool.get(), db_id, table_id, + _all_block_queues_bytes)); } group_commit_table = _table_map[table_id]; } diff --git a/be/src/runtime/group_commit_mgr.h b/be/src/runtime/group_commit_mgr.h index aa8d05534ca29c..9c0a8a047f9739 100644 --- a/be/src/runtime/group_commit_mgr.h +++ b/be/src/runtime/group_commit_mgr.h @@ -19,36 +19,34 @@ #include +#include +#include + #include "common/status.h" -#include "io/fs/stream_load_pipe.h" #include "util/lock.h" #include "util/threadpool.h" -#include "util/thrift_util.h" #include "vec/core/block.h" #include "vec/core/future_block.h" namespace doris { class ExecEnv; -class TPlan; -class TDescriptorTable; class TUniqueId; -class TExecPlanFragmentParams; -class ObjectPool; class RuntimeState; -class StreamLoadContext; -class StreamLoadPipe; class LoadBlockQueue { public: LoadBlockQueue(const UniqueId& load_instance_id, std::string& label, int64_t txn_id, - int64_t schema_version) + int64_t schema_version, + std::shared_ptr all_block_queues_bytes, + bool wait_internal_group_commit_finish) : load_instance_id(load_instance_id), label(label), txn_id(txn_id), schema_version(schema_version), - _start_time(std::chrono::steady_clock::now()) { - _mutex = std::make_shared(); - _cv = std::make_shared(); + wait_internal_group_commit_finish(wait_internal_group_commit_finish), + _start_time(std::chrono::steady_clock::now()), + _all_block_queues_bytes(all_block_queues_bytes) { + _single_block_queue_bytes = std::make_shared(0); }; Status add_block(std::shared_ptr block); @@ -57,29 +55,41 @@ class LoadBlockQueue { void remove_load_id(const UniqueId& load_id); void cancel(const Status& st); + static constexpr size_t MAX_BLOCK_QUEUE_ADD_WAIT_TIME = 1000; UniqueId load_instance_id; std::string label; int64_t txn_id; int64_t schema_version; bool need_commit = false; + bool wait_internal_group_commit_finish = false; + doris::Mutex mutex; + doris::ConditionVariable internal_group_commit_finish_cv; private: std::chrono::steady_clock::time_point _start_time; - std::shared_ptr _mutex; - std::shared_ptr _cv; + doris::ConditionVariable _put_cond; + doris::ConditionVariable _get_cond; // the set of load ids of all blocks in this queue std::set _load_ids; std::list> _block_queue; Status _status = Status::OK(); + // memory consumption of all tables' load block 
queues, used for back pressure. + std::shared_ptr _all_block_queues_bytes; + // memory consumption of one load block queue, used for correctness check. + std::shared_ptr _single_block_queue_bytes; }; class GroupCommitTable { public: GroupCommitTable(ExecEnv* exec_env, doris::ThreadPool* thread_pool, int64_t db_id, - int64_t table_id) - : _exec_env(exec_env), _thread_pool(thread_pool), _db_id(db_id), _table_id(table_id) {}; + int64_t table_id, std::shared_ptr all_block_queue_bytes) + : _exec_env(exec_env), + _thread_pool(thread_pool), + _db_id(db_id), + _table_id(table_id), + _all_block_queues_bytes(all_block_queue_bytes) {}; Status get_first_block_load_queue(int64_t table_id, std::shared_ptr block, std::shared_ptr& load_block_queue); @@ -105,6 +115,8 @@ class GroupCommitTable { // fragment_instance_id to load_block_queue std::unordered_map> _load_block_queues; bool _need_plan_fragment = false; + // memory consumption of all tables' load block queues, used for back pressure. + std::shared_ptr _all_block_queues_bytes; }; class GroupCommitMgr { @@ -114,13 +126,6 @@ class GroupCommitMgr { void stop(); - // insert into - Status group_commit_insert(int64_t table_id, const TPlan& plan, - const TDescriptorTable& desc_tbl, - const TScanRangeParams& scan_range_params, - const PGroupCommitInsertRequest* request, - PGroupCommitInsertResponse* response); - // used when init group_commit_scan_node Status get_load_block_queue(int64_t table_id, const TUniqueId& instance_id, std::shared_ptr& load_block_queue); @@ -129,19 +134,14 @@ class GroupCommitMgr { std::shared_ptr& load_block_queue); private: - // used by insert into - Status _append_row(std::shared_ptr pipe, - const PGroupCommitInsertRequest* request); - ExecEnv* _exec_env; doris::Mutex _lock; // TODO remove table when unused std::unordered_map> _table_map; - - // thread pool to handle insert into: append data to pipe - std::unique_ptr _insert_into_thread_pool; std::unique_ptr _thread_pool; + // memory consumption of all tables' load block queues, used for back pressure. 
+ std::shared_ptr _all_block_queues_bytes; }; } // namespace doris \ No newline at end of file diff --git a/be/src/runtime/load_channel_mgr.h b/be/src/runtime/load_channel_mgr.h index a77c0b0cc2daab..d34cb69a9e6d4e 100644 --- a/be/src/runtime/load_channel_mgr.h +++ b/be/src/runtime/load_channel_mgr.h @@ -27,7 +27,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "gutil/ref_counted.h" diff --git a/be/src/runtime/load_path_mgr.cpp b/be/src/runtime/load_path_mgr.cpp index 2ae43934df9051..f961fa9b7ecff3 100644 --- a/be/src/runtime/load_path_mgr.cpp +++ b/be/src/runtime/load_path_mgr.cpp @@ -27,7 +27,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep // IWYU pragma: no_include #include // IWYU pragma: keep diff --git a/be/src/runtime/load_stream.cpp b/be/src/runtime/load_stream.cpp index daca91e978c837..25e79ccab582a4 100644 --- a/be/src/runtime/load_stream.cpp +++ b/be/src/runtime/load_stream.cpp @@ -30,6 +30,7 @@ #include "runtime/load_channel.h" #include "runtime/load_stream_mgr.h" #include "runtime/load_stream_writer.h" +#include "util/runtime_profile.h" #include "util/thrift_util.h" #include "util/uid_util.h" @@ -129,7 +130,7 @@ Status TabletStream::append_data(const PStreamHeader& header, butil::IOBuf* data LOG(INFO) << "write data failed " << *this; } }; - return _flush_tokens[segid % _flush_tokens.size()]->submit_func(flush_func); + return _flush_tokens[new_segid % _flush_tokens.size()]->submit_func(flush_func); } Status TabletStream::add_segment(const PStreamHeader& header, butil::IOBuf* data) { @@ -151,7 +152,14 @@ Status TabletStream::add_segment(const PStreamHeader& header, butil::IOBuf* data } DCHECK(new_segid != std::numeric_limits::max()); - return _load_stream_writer->add_segment(new_segid, stat); + auto add_segment_func = [this, new_segid, stat]() { + auto st = _load_stream_writer->add_segment(new_segid, stat); + if (!st.ok() && _failed_st->ok()) { + _failed_st = std::make_shared(st); + LOG(INFO) << "add segment failed " << *this; + } + }; + return _flush_tokens[new_segid % _flush_tokens.size()]->submit_func(add_segment_func); } Status TabletStream::close() { @@ -242,8 +250,10 @@ LoadStream::~LoadStream() { LOG(INFO) << "load stream is destructed " << *this; } -Status LoadStream::init(const POpenStreamSinkRequest* request) { +Status LoadStream::init(const POpenLoadStreamRequest* request) { _txn_id = request->txn_id(); + _total_streams = request->total_streams(); + DCHECK(_total_streams > 0) << "total streams should be greater than 0"; _schema = std::make_shared(); RETURN_IF_ERROR(_schema->init(request->schema())); @@ -258,41 +268,49 @@ Status LoadStream::init(const POpenStreamSinkRequest* request) { Status LoadStream::close(int64_t src_id, const std::vector& tablets_to_commit, std::vector* success_tablet_ids, std::vector* failed_tablet_ids) { - std::lock_guard lock_guard(_lock); + std::lock_guard lock_guard(_lock); SCOPED_TIMER(_close_wait_timer); // we do nothing until recv CLOSE_LOAD from all streams to ensure all data are handled before ack _open_streams[src_id]--; - LOG(INFO) << "received CLOSE_LOAD from sender " << src_id << ", remaining " - << _open_streams[src_id] << " streams"; if (_open_streams[src_id] == 0) { _open_streams.erase(src_id); } + _close_load_cnt++; + LOG(INFO) << "received CLOSE_LOAD from sender " << src_id << ", remaining " + << _total_streams - _close_load_cnt << " senders"; + +
_tablets_to_commit.insert(_tablets_to_commit.end(), tablets_to_commit.begin(), + tablets_to_commit.end()); + + if (_close_load_cnt < _total_streams) { + // do not return commit info if there is remaining streams. + return Status::OK(); + } Status st = Status::OK(); - if (_open_streams.size() == 0) { + { bthread::Mutex mutex; std::unique_lock lock(mutex); bthread::ConditionVariable cond; - bool ret = _load_stream_mgr->heavy_work_pool()->try_offer([this, &success_tablet_ids, - &failed_tablet_ids, - &tablets_to_commit, &mutex, - &cond, &st]() { - signal::set_signal_task_id(_load_id); - for (auto& it : _index_streams_map) { - st = it.second->close(tablets_to_commit, success_tablet_ids, failed_tablet_ids); - if (!st.ok()) { + bool ret = _load_stream_mgr->heavy_work_pool()->try_offer( + [this, &success_tablet_ids, &failed_tablet_ids, &mutex, &cond, &st]() { + signal::set_signal_task_id(_load_id); + for (auto& it : _index_streams_map) { + st = it.second->close(_tablets_to_commit, success_tablet_ids, + failed_tablet_ids); + if (!st.ok()) { + std::unique_lock lock(mutex); + cond.notify_one(); + return; + } + } + LOG(INFO) << "close load " << *this + << ", failed_tablet_num=" << failed_tablet_ids->size() + << ", success_tablet_num=" << success_tablet_ids->size(); std::unique_lock lock(mutex); cond.notify_one(); - return; - } - } - LOG(INFO) << "close load " << *this - << ", failed_tablet_num=" << failed_tablet_ids->size() - << ", success_tablet_num=" << success_tablet_ids->size(); - std::unique_lock lock(mutex); - cond.notify_one(); - }); + }); if (ret) { cond.wait(lock); } else { @@ -300,24 +318,21 @@ Status LoadStream::close(int64_t src_id, const std::vector& tablets_t "there is not enough thread resource for close load"); } } - - // do not return commit info for non-last one. 
return st; } -void LoadStream::_report_result(StreamId stream, Status& st, - std::vector* success_tablet_ids, - std::vector* failed_tablet_ids) { - LOG(INFO) << "report result, success tablet num " << success_tablet_ids->size() - << ", failed tablet num " << failed_tablet_ids->size(); +void LoadStream::_report_result(StreamId stream, const Status& st, + const std::vector& success_tablet_ids, + const std::vector& failed_tablet_ids) { + LOG(INFO) << "report result, success tablet num " << success_tablet_ids.size() + << ", failed tablet num " << failed_tablet_ids.size(); butil::IOBuf buf; PWriteStreamSinkResponse response; st.to_protobuf(response.mutable_status()); - for (auto& id : *success_tablet_ids) { + for (auto& id : success_tablet_ids) { response.add_success_tablet_ids(id); } - - for (auto& id : *failed_tablet_ids) { + for (auto& id : failed_tablet_ids) { response.add_failed_tablet_ids(id); } @@ -345,6 +360,32 @@ void LoadStream::_report_result(StreamId stream, Status& st, } } +void LoadStream::_report_schema(StreamId stream, const PStreamHeader& hdr) { + butil::IOBuf buf; + PWriteStreamSinkResponse response; + Status st = Status::OK(); + for (const auto& req : hdr.tablets()) { + TabletManager* tablet_mgr = StorageEngine::instance()->tablet_manager(); + TabletSharedPtr tablet = tablet_mgr->get_tablet(req.tablet_id()); + if (tablet == nullptr) { + st = Status::NotFound("Tablet {} not found", req.tablet_id()); + break; + } + auto resp = response.add_tablet_schemas(); + resp->set_index_id(req.index_id()); + resp->set_enable_unique_key_merge_on_write(tablet->enable_unique_key_merge_on_write()); + tablet->tablet_schema()->to_schema_pb(resp->mutable_tablet_schema()); + } + st.to_protobuf(response.mutable_status()); + + buf.append(response.SerializeAsString()); + int ret = brpc::StreamWrite(stream, buf); + // TODO: handle EAGAIN + if (ret != 0) { + LOG(INFO) << "stream write report schema " << ret << ": " << std::strerror(ret); + } +} + void LoadStream::_parse_header(butil::IOBuf* const message, PStreamHeader& hdr) { butil::IOBufAsZeroCopyInputStream wrapper(*message); hdr.ParseFromZeroCopyStream(&wrapper); @@ -414,18 +455,20 @@ int LoadStream::on_received_messages(StreamId id, butil::IOBuf* const messages[] void LoadStream::_dispatch(StreamId id, const PStreamHeader& hdr, butil::IOBuf* data) { VLOG_DEBUG << PStreamHeader_Opcode_Name(hdr.opcode()) << " from " << hdr.src_id() << " with tablet " << hdr.tablet_id(); + if (UniqueId(hdr.load_id()) != UniqueId(_load_id)) { + Status st = Status::Error("invalid load id {}, expected {}", + UniqueId(hdr.load_id()).to_string(), + UniqueId(_load_id).to_string()); + _report_failure(id, st, hdr); + return; + } { std::lock_guard lock_guard(_lock); if (!_open_streams.contains(hdr.src_id())) { - std::vector success_tablet_ids; - std::vector failed_tablet_ids; - if (hdr.has_tablet_id()) { - failed_tablet_ids.push_back(hdr.tablet_id()); - } Status st = Status::Error("no open stream from source {}", hdr.src_id()); - _report_result(id, st, &success_tablet_ids, &failed_tablet_ids); + _report_failure(id, st, hdr); return; } } @@ -435,21 +478,20 @@ void LoadStream::_dispatch(StreamId id, const PStreamHeader& hdr, butil::IOBuf* case PStreamHeader::APPEND_DATA: { auto st = _append_data(hdr, data); if (!st.ok()) { - std::vector success_tablet_ids; - std::vector failed_tablet_ids; - failed_tablet_ids.push_back(hdr.tablet_id()); - _report_result(id, st, &success_tablet_ids, &failed_tablet_ids); + _report_failure(id, st, hdr); } } break; case PStreamHeader::CLOSE_LOAD: { 
std::vector success_tablet_ids; std::vector failed_tablet_ids; - std::vector tablets_to_commit(hdr.tablets_to_commit().begin(), - hdr.tablets_to_commit().end()); + std::vector tablets_to_commit(hdr.tablets().begin(), hdr.tablets().end()); auto st = close(hdr.src_id(), tablets_to_commit, &success_tablet_ids, &failed_tablet_ids); - _report_result(id, st, &success_tablet_ids, &failed_tablet_ids); + _report_result(id, st, success_tablet_ids, failed_tablet_ids); brpc::StreamClose(id); } break; + case PStreamHeader::GET_SCHEMA: { + _report_schema(id, hdr); + } break; default: LOG(WARNING) << "unexpected stream message " << hdr.opcode(); DCHECK(false); @@ -461,9 +503,9 @@ void LoadStream::on_idle_timeout(StreamId id) { } void LoadStream::on_closed(StreamId id) { - auto remaining_rpc_stream = remove_rpc_stream(); - LOG(INFO) << "stream closed " << id << ", remaining_rpc_stream=" << remaining_rpc_stream; - if (remaining_rpc_stream == 0) { + auto remaining_streams = _total_streams - _close_rpc_cnt.fetch_add(1) - 1; + LOG(INFO) << "stream " << id << " on_closed, remaining streams = " << remaining_streams; + if (remaining_streams == 0) { _load_stream_mgr->clear_load(_load_id); } } diff --git a/be/src/runtime/load_stream.h b/be/src/runtime/load_stream.h index 317dc51086178b..2674e77bc3c5ba 100644 --- a/be/src/runtime/load_stream.h +++ b/be/src/runtime/load_stream.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include #include @@ -25,8 +26,6 @@ #include #include #include -// IWYU pragma: no_include -#include #include "brpc/stream.h" #include "butil/iobuf.h" @@ -106,16 +105,13 @@ class LoadStream : public brpc::StreamInputHandler { LoadStream(PUniqueId load_id, LoadStreamMgr* load_stream_mgr, bool enable_profile); ~LoadStream(); - Status init(const POpenStreamSinkRequest* request); + Status init(const POpenLoadStreamRequest* request); void add_source(int64_t src_id) { std::lock_guard lock_guard(_lock); _open_streams[src_id]++; } - uint32_t add_rpc_stream() { return ++_num_rpc_streams; } - uint32_t remove_rpc_stream() { return --_num_rpc_streams; } - Status close(int64_t src_id, const std::vector& tablets_to_commit, std::vector* success_tablet_ids, std::vector* failed_tablet_ids); @@ -130,16 +126,32 @@ class LoadStream : public brpc::StreamInputHandler { void _parse_header(butil::IOBuf* const message, PStreamHeader& hdr); void _dispatch(StreamId id, const PStreamHeader& hdr, butil::IOBuf* data); Status _append_data(const PStreamHeader& header, butil::IOBuf* data); - void _report_result(StreamId stream, Status& st, std::vector* success_tablet_ids, - std::vector* failed_tablet_ids); + + void _report_result(StreamId stream, const Status& st, + const std::vector& success_tablet_ids, + const std::vector& failed_tablet_ids); + void _report_schema(StreamId stream, const PStreamHeader& hdr); + + // report failure for one message + void _report_failure(StreamId stream, const Status& status, const PStreamHeader& header) { + std::vector success; // empty + std::vector failure; + if (header.has_tablet_id()) { + failure.push_back(header.tablet_id()); + } + _report_result(stream, status, success, failure); + } private: PUniqueId _load_id; std::unordered_map _index_streams_map; - std::atomic _num_rpc_streams; + int32_t _total_streams = 0; + int32_t _close_load_cnt = 0; + std::atomic _close_rpc_cnt = 0; + std::vector _tablets_to_commit; bthread::Mutex _lock; std::unordered_map _open_streams; - int64_t _txn_id; + int64_t _txn_id = 0; std::shared_ptr _schema; bool _enable_profile = false; std::unique_ptr _profile; 
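The load_stream.cpp/.h hunks above replace the per-source open-stream bookkeeping with a sender count taken from the open request: each CLOSE_LOAD appends its tablets to _tablets_to_commit and bumps _close_load_cnt, and only the last sender (count == _total_streams) runs the commit path and reports tablet ids; on_closed() applies the same idea to RPC streams via _close_rpc_cnt. Below is a minimal, self-contained sketch of that counting pattern; LoadStreamSketch, the int64_t tablet ids, and main() are illustrative stand-ins, not the actual Doris types.

// Sketch of the "commit only on the last CLOSE_LOAD" pattern (illustrative types).
#include <cstdint>
#include <iostream>
#include <mutex>
#include <vector>

class LoadStreamSketch {
public:
    // total_streams plays the role of POpenLoadStreamRequest::total_streams().
    explicit LoadStreamSketch(int32_t total_streams) : _total_streams(total_streams) {}

    // Accumulates tablets; returns true and fills 'to_commit' only for the last sender.
    bool close(const std::vector<int64_t>& tablets, std::vector<int64_t>* to_commit) {
        std::lock_guard<std::mutex> lock(_lock);
        _tablets_to_commit.insert(_tablets_to_commit.end(), tablets.begin(), tablets.end());
        if (++_close_load_cnt < _total_streams) {
            return false; // do not return commit info while senders remain open
        }
        *to_commit = _tablets_to_commit;
        return true;
    }

private:
    const int32_t _total_streams;
    int32_t _close_load_cnt = 0;
    std::mutex _lock;
    std::vector<int64_t> _tablets_to_commit;
};

int main() {
    LoadStreamSketch stream(/*total_streams=*/2);
    std::vector<int64_t> to_commit;
    stream.close({101, 102}, &to_commit);   // first sender: no commit info yet
    if (stream.close({103}, &to_commit)) {  // last sender: commit all three tablets
        std::cout << "committing " << to_commit.size() << " tablets\n";
    }
    return 0;
}

One consequence of this design, visible in the hunks above, is that tablets_to_commit from early senders must be buffered in the LoadStream itself, which is why _tablets_to_commit became a member.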
diff --git a/be/src/runtime/load_stream_mgr.cpp b/be/src/runtime/load_stream_mgr.cpp index d1af7b60d3fd3d..b3553046aec9f8 100644 --- a/be/src/runtime/load_stream_mgr.cpp +++ b/be/src/runtime/load_stream_mgr.cpp @@ -45,7 +45,7 @@ LoadStreamMgr::~LoadStreamMgr() { _file_writer_thread_pool->shutdown(); } -Status LoadStreamMgr::open_load_stream(const POpenStreamSinkRequest* request, +Status LoadStreamMgr::open_load_stream(const POpenLoadStreamRequest* request, LoadStreamSharedPtr& load_stream) { UniqueId load_id(request->load_id()); diff --git a/be/src/runtime/load_stream_mgr.h b/be/src/runtime/load_stream_mgr.h index da3ae98a42e56f..debdd2f2fb130d 100644 --- a/be/src/runtime/load_stream_mgr.h +++ b/be/src/runtime/load_stream_mgr.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include #include @@ -25,8 +26,6 @@ #include #include #include -// IWYU pragma: no_include -#include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" @@ -39,7 +38,7 @@ class LoadStreamMgr { FifoThreadPool* light_work_pool); ~LoadStreamMgr(); - Status open_load_stream(const POpenStreamSinkRequest* request, + Status open_load_stream(const POpenLoadStreamRequest* request, LoadStreamSharedPtr& load_stream); void clear_load(UniqueId loadid); std::unique_ptr new_token() { diff --git a/be/src/runtime/load_stream_writer.cpp b/be/src/runtime/load_stream_writer.cpp index e44a682b25babd..e7fe9abdcf3e6d 100644 --- a/be/src/runtime/load_stream_writer.cpp +++ b/be/src/runtime/load_stream_writer.cpp @@ -28,7 +28,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/logging.h" @@ -121,7 +120,15 @@ Status LoadStreamWriter::close_segment(uint32_t segid) { return Status::OK(); } -Status LoadStreamWriter::add_segment(uint32_t segid, SegmentStatistics& stat) { +Status LoadStreamWriter::add_segment(uint32_t segid, const SegmentStatistics& stat) { + if (_segment_file_writers[segid]->bytes_appended() != stat.data_size) { + LOG(WARNING) << _segment_file_writers[segid]->path() << " is incomplete, actual size: " + << _segment_file_writers[segid]->bytes_appended() + << ", expected size: " << stat.data_size; + return Status::Corruption("segment {} is incomplete, actual size: {}, expected size: {}", + _segment_file_writers[segid]->path().native(), + _segment_file_writers[segid]->bytes_appended(), stat.data_size); + } return _rowset_writer->add_segment(segid, stat); } @@ -143,9 +150,9 @@ Status LoadStreamWriter::close() { return Status::Error("flush segment failed"); } - for (size_t i = 0; i < _segment_file_writers.size(); i++) { - if (!_segment_file_writers[i]->is_closed()) { - return Status::Corruption("segment {} is not eos", i); + for (const auto& writer : _segment_file_writers) { + if (!writer->is_closed()) { + return Status::Corruption("segment {} is not closed", writer->path().native()); } } diff --git a/be/src/runtime/load_stream_writer.h b/be/src/runtime/load_stream_writer.h index e4a3341cea38d7..dc952d2c18da48 100644 --- a/be/src/runtime/load_stream_writer.h +++ b/be/src/runtime/load_stream_writer.h @@ -71,7 +71,7 @@ class LoadStreamWriter { Status close_segment(uint32_t segid); - Status add_segment(uint32_t segid, SegmentStatistics& stat); + Status add_segment(uint32_t segid, const SegmentStatistics& stat); // wait for all memtables to be flushed. 
Status close(); diff --git a/be/src/runtime/memory/jemalloc_hook.cpp b/be/src/runtime/memory/jemalloc_hook.cpp index ec40879e4bf8db..445d60d382c270 100644 --- a/be/src/runtime/memory/jemalloc_hook.cpp +++ b/be/src/runtime/memory/jemalloc_hook.cpp @@ -19,7 +19,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "jemalloc/jemalloc.h" #include "runtime/thread_context.h" @@ -41,16 +40,18 @@ extern "C" { // mem hook should avoid nesting new/malloc. void* doris_malloc(size_t size) __THROW { - CONSUME_MEM_TRACKER(jenallocx(size, 0)); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN([](size_t size) { return jenallocx(size, 0); }, + size); void* ptr = jemalloc(size); if (UNLIKELY(ptr == nullptr)) { - RELEASE_MEM_TRACKER(jenallocx(size, 0)); + RELEASE_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN([](size_t size) { return jenallocx(size, 0); }, + size); } return ptr; } void doris_free(void* p) __THROW { - RELEASE_MEM_TRACKER(jemalloc_usable_size(p)); + RELEASE_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN([](void* p) { return jemalloc_usable_size(p); }, p); jefree(p); } @@ -61,14 +62,20 @@ void* doris_realloc(void* p, size_t size) __THROW { #if USE_MEM_TRACKER int64_t old_size = jemalloc_usable_size(p); -#endif - - CONSUME_MEM_TRACKER(jenallocx(size, 0) - old_size); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN( + [](size_t size, int64_t old_size) { return jenallocx(size, 0) - old_size; }, size, + old_size); void* ptr = jerealloc(p, size); if (UNLIKELY(ptr == nullptr)) { - RELEASE_MEM_TRACKER(jenallocx(size, 0) - old_size); + RELEASE_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN( + [](size_t size, int64_t old_size) { return jenallocx(size, 0) - old_size; }, size, + old_size); } return ptr; +#else + void* ptr = jerealloc(p, size); + return ptr; +#endif } void* doris_calloc(size_t n, size_t size) __THROW { @@ -76,72 +83,80 @@ void* doris_calloc(size_t n, size_t size) __THROW { return nullptr; } - CONSUME_MEM_TRACKER(n * size); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK(n * size); void* ptr = jecalloc(n, size); if (UNLIKELY(ptr == nullptr)) { - RELEASE_MEM_TRACKER(n * size); + RELEASE_THREAD_MEM_TRACKER_BY_HOOK(n * size); } else { - CONSUME_MEM_TRACKER(jemalloc_usable_size(ptr) - n * size); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN( + [](void* ptr, size_t size) { return jemalloc_usable_size(ptr) - size; }, ptr, + n * size); } return ptr; } void doris_cfree(void* ptr) __THROW { - RELEASE_MEM_TRACKER(jemalloc_usable_size(ptr)); + RELEASE_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN([](void* ptr) { return jemalloc_usable_size(ptr); }, + ptr); jefree(ptr); } void* doris_memalign(size_t align, size_t size) __THROW { - CONSUME_MEM_TRACKER(size); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK(size); void* ptr = jealigned_alloc(align, size); if (UNLIKELY(ptr == nullptr)) { - RELEASE_MEM_TRACKER(size); + RELEASE_THREAD_MEM_TRACKER_BY_HOOK(size); } else { - CONSUME_MEM_TRACKER(jemalloc_usable_size(ptr) - size); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN( + [](void* ptr, size_t size) { return jemalloc_usable_size(ptr) - size; }, ptr, size); } return ptr; } void* doris_aligned_alloc(size_t align, size_t size) __THROW { - CONSUME_MEM_TRACKER(size); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK(size); void* ptr = jealigned_alloc(align, size); if (UNLIKELY(ptr == nullptr)) { - RELEASE_MEM_TRACKER(size); + RELEASE_THREAD_MEM_TRACKER_BY_HOOK(size); } else { - CONSUME_MEM_TRACKER(jemalloc_usable_size(ptr) - size); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN( + [](void* ptr, size_t size) { return jemalloc_usable_size(ptr) - 
size; }, ptr, size); } return ptr; } void* doris_valloc(size_t size) __THROW { - CONSUME_MEM_TRACKER(size); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK(size); void* ptr = jevalloc(size); if (UNLIKELY(ptr == nullptr)) { - RELEASE_MEM_TRACKER(size); + RELEASE_THREAD_MEM_TRACKER_BY_HOOK(size); } else { - CONSUME_MEM_TRACKER(jemalloc_usable_size(ptr) - size); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN( + [](void* ptr, size_t size) { return jemalloc_usable_size(ptr) - size; }, ptr, size); } return ptr; } void* doris_pvalloc(size_t size) __THROW { - CONSUME_MEM_TRACKER(size); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK(size); void* ptr = jevalloc(size); if (UNLIKELY(ptr == nullptr)) { - RELEASE_MEM_TRACKER(size); + RELEASE_THREAD_MEM_TRACKER_BY_HOOK(size); } else { - CONSUME_MEM_TRACKER(jemalloc_usable_size(ptr) - size); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN( + [](void* ptr, size_t size) { return jemalloc_usable_size(ptr) - size; }, ptr, size); } return ptr; } int doris_posix_memalign(void** r, size_t align, size_t size) __THROW { - CONSUME_MEM_TRACKER(size); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK(size); int ret = jeposix_memalign(r, align, size); if (UNLIKELY(ret != 0)) { - RELEASE_MEM_TRACKER(size); + RELEASE_THREAD_MEM_TRACKER_BY_HOOK(size); } else { - CONSUME_MEM_TRACKER(jemalloc_usable_size(*r) - size); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN( + [](void* ptr, size_t size) { return jemalloc_usable_size(ptr) - size; }, *r, size); } return ret; } diff --git a/be/src/runtime/memory/mem_tracker.cpp b/be/src/runtime/memory/mem_tracker.cpp index df8e9d80a4ad34..3d23bd447c19a5 100644 --- a/be/src/runtime/memory/mem_tracker.cpp +++ b/be/src/runtime/memory/mem_tracker.cpp @@ -47,7 +47,7 @@ void MemTracker::bind_parent(MemTrackerLimiter* parent) { if (parent) { _parent_label = parent->label(); _parent_group_num = parent->group_num(); - } else if (thread_context_ptr.init) { + } else if (is_thread_context_init()) { _parent_label = thread_context()->thread_mem_tracker()->label(); _parent_group_num = thread_context()->thread_mem_tracker()->group_num(); } diff --git a/be/src/runtime/memory/mem_tracker.h b/be/src/runtime/memory/mem_tracker.h index 94d836015705ff..8af83d299f9eb7 100644 --- a/be/src/runtime/memory/mem_tracker.h +++ b/be/src/runtime/memory/mem_tracker.h @@ -31,7 +31,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "util/pretty_printer.h" diff --git a/be/src/runtime/memory/mem_tracker_limiter.cpp b/be/src/runtime/memory/mem_tracker_limiter.cpp index 268b0ec38eaba2..ad90df81145e44 100644 --- a/be/src/runtime/memory/mem_tracker_limiter.cpp +++ b/be/src/runtime/memory/mem_tracker_limiter.cpp @@ -136,7 +136,7 @@ void MemTrackerLimiter::refresh_global_counter() { void MemTrackerLimiter::make_process_snapshots(std::vector* snapshots) { MemTrackerLimiter::refresh_global_counter(); - int64_t process_mem_sum = 0; + int64_t all_tracker_mem_sum = 0; Snapshot snapshot; for (auto it : MemTrackerLimiter::TypeMemSum) { snapshot.type = type_string(it.first); @@ -145,7 +145,7 @@ void MemTrackerLimiter::make_process_snapshots(std::vector snapshot.cur_consumption = it.second->current_value(); snapshot.peak_consumption = it.second->peak_value(); (*snapshots).emplace_back(snapshot); - process_mem_sum += it.second->current_value(); + all_tracker_mem_sum += it.second->current_value(); } snapshot.type = "tc/jemalloc_free_memory"; @@ -154,14 +154,28 @@ void MemTrackerLimiter::make_process_snapshots(std::vector snapshot.cur_consumption = 
MemInfo::allocator_cache_mem(); snapshot.peak_consumption = -1; (*snapshots).emplace_back(snapshot); - process_mem_sum += MemInfo::allocator_cache_mem(); + all_tracker_mem_sum += MemInfo::allocator_cache_mem(); - snapshot.type = "process"; + snapshot.type = "all_tracker_sum (is virtual memory)"; snapshot.label = ""; snapshot.limit = -1; - snapshot.cur_consumption = process_mem_sum; + snapshot.cur_consumption = all_tracker_mem_sum; snapshot.peak_consumption = -1; (*snapshots).emplace_back(snapshot); + + snapshot.type = "process resident memory (from /proc VmRSS VmHWM)"; + snapshot.label = ""; + snapshot.limit = -1; + snapshot.cur_consumption = PerfCounters::get_vm_rss(); + snapshot.peak_consumption = PerfCounters::get_vm_hwm(); + (*snapshots).emplace_back(snapshot); + + snapshot.type = "process virtual memory (from /proc VmSize VmPeak)"; + snapshot.label = ""; + snapshot.limit = -1; + snapshot.cur_consumption = PerfCounters::get_vm_size(); + snapshot.peak_consumption = PerfCounters::get_vm_peak(); + (*snapshots).emplace_back(snapshot); } void MemTrackerLimiter::make_type_snapshots(std::vector* snapshots, @@ -343,7 +357,9 @@ std::string MemTrackerLimiter::tracker_limit_exceeded_str() { err_msg += fmt::format( " exec node:<{}>, can `set exec_mem_limit=8G` to change limit, details see " "be.INFO.", - doris::thread_context()->thread_mem_tracker_mgr->last_consumer_tracker()); + doris::is_thread_context_init() + ? doris::thread_context()->thread_mem_tracker_mgr->last_consumer_tracker() + : ""); } else if (_type == Type::SCHEMA_CHANGE) { err_msg += fmt::format( " can modify `memory_limitation_per_thread_for_schema_change_bytes` in be.conf to " diff --git a/be/src/runtime/memory/mem_tracker_limiter.h b/be/src/runtime/memory/mem_tracker_limiter.h index 936faa1592130b..df20448c01034f 100644 --- a/be/src/runtime/memory/mem_tracker_limiter.h +++ b/be/src/runtime/memory/mem_tracker_limiter.h @@ -155,6 +155,9 @@ class MemTrackerLimiter final : public MemTracker { // Transfer 'bytes' of consumption from this tracker to 'dst'. void transfer_to(int64_t size, MemTrackerLimiter* dst) { + if (label() == dst->label()) { + return; + } cache_consume(-size); dst->cache_consume(size); } diff --git a/be/src/runtime/memory/tcmalloc_hook.h b/be/src/runtime/memory/tcmalloc_hook.h index afd15d0b3277c6..9cc88be1b081bc 100644 --- a/be/src/runtime/memory/tcmalloc_hook.h +++ b/be/src/runtime/memory/tcmalloc_hook.h @@ -36,11 +36,11 @@ // destructor to control the behavior of consume can lead to unexpected behavior, // like this: if (LIKELY(doris::start_thread_mem_tracker)) { void new_hook(const void* ptr, size_t size) { - CONSUME_MEM_TRACKER(tc_nallocx(size, 0)); + CONSUME_THREAD_MEM_TRACKER_BY_HOOK(tc_nallocx(size, 0)); } void delete_hook(const void* ptr) { - RELEASE_MEM_TRACKER(tc_malloc_size(const_cast(ptr))); + RELEASE_THREAD_MEM_TRACKER_BY_HOOK(tc_malloc_size(const_cast(ptr))); } void init_hook() { diff --git a/be/src/runtime/memory/thread_mem_tracker_mgr.h b/be/src/runtime/memory/thread_mem_tracker_mgr.h index 487f82ac5494a2..4cb22b9e1ae75c 100644 --- a/be/src/runtime/memory/thread_mem_tracker_mgr.h +++ b/be/src/runtime/memory/thread_mem_tracker_mgr.h @@ -40,11 +40,13 @@ namespace doris { // Memory Hook is counted in the memory tracker of the current thread. class ThreadMemTrackerMgr { public: - ThreadMemTrackerMgr() {} + ThreadMemTrackerMgr() = default; ~ThreadMemTrackerMgr() { // if _init == false, exec env is not initialized when init(). and never consumed mem tracker once. 
- if (_init) flush_untracked_mem(); + if (_init) { + flush_untracked_mem(); + } } bool init(); @@ -77,7 +79,7 @@ class ThreadMemTrackerMgr { // such as calling LOG/iostream/sstream/stringstream/etc. related methods, // must increase the control to avoid entering infinite recursion, otherwise it may cause a crash or hang, // Returns whether the memory exceeds limit, and will consume mem tracker no matter whether the limit is exceeded. - void consume(int64_t size, bool large_memory_check = false); + void consume(int64_t size, int skip_large_memory_check = 0); void flush_untracked_mem(); bool is_attach_query() { return _fragment_instance_id != TUniqueId(); } @@ -92,7 +94,7 @@ } void disable_wait_gc() { _wait_gc = false; } - bool wait_gc() { return _wait_gc; } + [[nodiscard]] bool wait_gc() const { return _wait_gc; } void cancel_instance(const std::string& exceed_msg); std::string print_debug_string() { @@ -161,9 +163,9 @@ inline void ThreadMemTrackerMgr::pop_consumer_tracker() { _consumer_tracker_stack.pop_back(); } -inline void ThreadMemTrackerMgr::consume(int64_t size, bool large_memory_check) { +inline void ThreadMemTrackerMgr::consume(int64_t size, int skip_large_memory_check) { _untracked_mem += size; - if (!ExecEnv::ready()) { + if (!_init && !ExecEnv::ready()) { return; } // When some threads `0 < _untracked_mem < config::mem_tracker_consume_min_size_bytes` @@ -176,7 +178,7 @@ inline void ThreadMemTrackerMgr::consume(int64_t size, bool large_memory_check) flush_untracked_mem(); } - if (large_memory_check && doris::config::large_memory_check_bytes > 0 && + if (skip_large_memory_check == 0 && doris::config::large_memory_check_bytes > 0 && size > doris::config::large_memory_check_bytes) { _stop_consume = true; LOG(WARNING) << fmt::format( diff --git a/be/src/runtime/plan_fragment_executor.cpp b/be/src/runtime/plan_fragment_executor.cpp index a58744a5692b02..3af870c7f07149 100644 --- a/be/src/runtime/plan_fragment_executor.cpp +++ b/be/src/runtime/plan_fragment_executor.cpp @@ -24,10 +24,6 @@ #include #include #include -#include -#include -#include -#include #include #include #include @@ -59,7 +55,6 @@ #include "util/debug_util.h" #include "util/defer_op.h" #include "util/pretty_printer.h" -#include "util/telemetry/telemetry.h" #include "util/threadpool.h" #include "util/time.h" #include "util/uid_util.h" @@ -116,12 +111,6 @@ PlanFragmentExecutor::~PlanFragmentExecutor() { } Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request) { - OpentelemetryTracer tracer = telemetry::get_noop_tracer(); - if (opentelemetry::trace::Tracer::GetCurrentSpan()->GetContext().IsValid()) { - tracer = telemetry::get_tracer(print_id(_query_ctx->query_id())); - } - _span = tracer->StartSpan("Plan_fragment_executor"); - OpentelemetryScope scope {_span}; if (request.__isset.query_options) { _timeout_second = request.query_options.execution_timeout; } @@ -140,7 +129,6 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request) { RuntimeState::create_unique(params, request.query_options, query_globals, _exec_env); _runtime_state->set_query_ctx(_query_ctx.get()); _runtime_state->set_query_mem_tracker(_query_ctx->query_mem_tracker); - _runtime_state->set_tracer(std::move(tracer)); SCOPED_ATTACH_TASK(_runtime_state.get()); static_cast(_runtime_state->runtime_filter_mgr()->init()); @@ -224,6 +212,9 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request) { _runtime_state->set_per_fragment_instance_idx(params.sender_id);
_runtime_state->set_num_per_fragment_instances(params.num_senders); + _runtime_state->set_load_stream_per_node(request.load_stream_per_node); + _runtime_state->set_total_load_streams(request.total_load_streams); + _runtime_state->set_num_local_sink(request.num_local_sink); // set up sink, if required if (request.fragment.__isset.output_sink) { @@ -332,9 +323,25 @@ Status PlanFragmentExecutor::open_vectorized_internal() { : doris::vectorized::Block::create_unique(); bool eos = false; + auto st = Status::OK(); + auto handle_group_commit = [&]() { + if (UNLIKELY(_group_commit && !st.ok() && block != nullptr)) { + auto* future_block = dynamic_cast(block.get()); + std::unique_lock l(*(future_block->lock)); + if (!future_block->is_handled()) { + future_block->set_result(st, 0, 0); + future_block->cv->notify_all(); + } + } + }; + while (!eos) { RETURN_IF_CANCELLED(_runtime_state); - RETURN_IF_ERROR(get_vectorized_internal(block.get(), &eos)); + st = get_vectorized_internal(block.get(), &eos); + if (UNLIKELY(!st.ok())) { + handle_group_commit(); + return st; + } // Collect this plan and sub plan statistics, and send to parent plan. if (_collect_query_statistics_with_every_batch) { @@ -342,23 +349,13 @@ Status PlanFragmentExecutor::open_vectorized_internal() { } if (!eos || block->rows() > 0) { - auto st = _sink->send(runtime_state(), block.get()); + st = _sink->send(runtime_state(), block.get()); //TODO: Asynchronisation need refactor this if (st.is()) { // created partition, do it again. st = _sink->send(runtime_state(), block.get()); DCHECK(!st.is()); } - if (UNLIKELY(!st.ok() || block->rows() == 0)) { - // Used for group commit insert - if (_group_commit) { - auto* future_block = dynamic_cast(block.get()); - std::unique_lock l(*(future_block->lock)); - if (!future_block->is_handled()) { - future_block->set_result(st, 0, 0); - future_block->cv->notify_all(); - } - } - } + handle_group_commit(); if (st.is()) { break; } @@ -421,7 +418,6 @@ Status PlanFragmentExecutor::execute() { int64_t duration_ns = 0; { SCOPED_RAW_TIMER(&duration_ns); - opentelemetry::trace::Tracer::GetCurrentSpan()->AddEvent("start executing Fragment"); Status st = open(); WARN_IF_ERROR(st, strings::Substitute("Got error while opening fragment $0, query id: $1", print_id(_fragment_instance_id), @@ -680,7 +676,6 @@ void PlanFragmentExecutor::close() { } } - profile()->add_to_span(_span); _closed = true; } diff --git a/be/src/runtime/plan_fragment_executor.h b/be/src/runtime/plan_fragment_executor.h index 5426626da32416..6a73646f52f3be 100644 --- a/be/src/runtime/plan_fragment_executor.h +++ b/be/src/runtime/plan_fragment_executor.h @@ -240,8 +240,6 @@ class PlanFragmentExecutor { PPlanFragmentCancelReason _cancel_reason; std::string _cancel_msg; - OpentelemetrySpan _span; - bool _group_commit = false; DescriptorTbl* _desc_tbl; diff --git a/be/src/runtime/query_context.h b/be/src/runtime/query_context.h index c51e92d14c83bd..bb7ad20b90bd43 100644 --- a/be/src/runtime/query_context.h +++ b/be/src/runtime/query_context.h @@ -210,6 +210,11 @@ class QueryContext { return _query_options.runtime_filter_wait_time_ms; } + bool runtime_filter_wait_infinitely() const { + return _query_options.__isset.runtime_filter_wait_infinitely && + _query_options.runtime_filter_wait_infinitely; + } + bool enable_pipeline_exec() const { return _query_options.__isset.enable_pipeline_engine && _query_options.enable_pipeline_engine; diff --git a/be/src/runtime/query_statistics.cpp b/be/src/runtime/query_statistics.cpp index 22c18faa1e6f72..dfe63f576260db 
100644 --- a/be/src/runtime/query_statistics.cpp +++ b/be/src/runtime/query_statistics.cpp @@ -88,9 +88,10 @@ } void QueryStatistics::merge(QueryStatisticsRecvr* recvr, int sender_id) { - auto it = recvr->_query_statistics.find(sender_id); - if (it != recvr->_query_statistics.end()) { - merge(*it->second); + DCHECK(recvr != nullptr); + auto query_statistics_ptr = recvr->find(sender_id); + if (query_statistics_ptr) { + merge(*query_statistics_ptr); } } @@ -120,4 +121,13 @@ void QueryStatisticsRecvr::insert(QueryStatisticsPtr statistics, int sender_id) _query_statistics[sender_id] = statistics; } +QueryStatisticsPtr QueryStatisticsRecvr::find(int sender_id) { + std::lock_guard l(_lock); + auto it = _query_statistics.find(sender_id); + if (it != _query_statistics.end()) { + return it->second; + } + return nullptr; +} + } // namespace doris diff --git a/be/src/runtime/query_statistics.h b/be/src/runtime/query_statistics.h index 42c1457472f6ba..d32b7a60e2a7e0 100644 --- a/be/src/runtime/query_statistics.h +++ b/be/src/runtime/query_statistics.h @@ -140,6 +140,8 @@ class QueryStatisticsRecvr { // using local_exchange for transmission, only need to hold a shared pointer. void insert(QueryStatisticsPtr statistics, int sender_id); + QueryStatisticsPtr find(int sender_id); + private: friend class QueryStatistics; diff --git a/be/src/runtime/result_buffer_mgr.cpp b/be/src/runtime/result_buffer_mgr.cpp index 9dbe228bcd9890..a2009c5ec3c970 100644 --- a/be/src/runtime/result_buffer_mgr.cpp +++ b/be/src/runtime/result_buffer_mgr.cpp @@ -109,21 +109,21 @@ std::shared_ptr ResultBufferMgr::find_control_block(const TU return std::shared_ptr(); } -void ResultBufferMgr::register_row_descriptor(const TUniqueId& query_id, - const RowDescriptor& row_desc) { - std::unique_lock wlock(_row_descriptor_map_lock); - _row_descriptor_map.insert(std::make_pair(query_id, row_desc)); +void ResultBufferMgr::register_arrow_schema(const TUniqueId& query_id, + const std::shared_ptr& arrow_schema) { + std::unique_lock wlock(_arrow_schema_map_lock); + _arrow_schema_map.insert(std::make_pair(query_id, arrow_schema)); } -RowDescriptor ResultBufferMgr::find_row_descriptor(const TUniqueId& query_id) { - std::shared_lock rlock(_row_descriptor_map_lock); - RowDescriptorMap::iterator iter = _row_descriptor_map.find(query_id); +std::shared_ptr ResultBufferMgr::find_arrow_schema(const TUniqueId& query_id) { + std::shared_lock rlock(_arrow_schema_map_lock); + auto iter = _arrow_schema_map.find(query_id); - if (_row_descriptor_map.end() != iter) { + if (_arrow_schema_map.end() != iter) { return iter->second; } - return RowDescriptor(); + return nullptr; } void ResultBufferMgr::fetch_data(const PUniqueId& finst_id, GetResultBatchCtx* ctx) { @@ -162,11 +162,11 @@ Status ResultBufferMgr::cancel(const TUniqueId& query_id) { } { - std::unique_lock wlock(_row_descriptor_map_lock); - RowDescriptorMap::iterator row_desc_iter = _row_descriptor_map.find(query_id); + std::unique_lock wlock(_arrow_schema_map_lock); + auto arrow_schema_iter = _arrow_schema_map.find(query_id); - if (_row_descriptor_map.end() != row_desc_iter) { - _row_descriptor_map.erase(row_desc_iter); + if (_arrow_schema_map.end() != arrow_schema_iter) { + _arrow_schema_map.erase(arrow_schema_iter); } } diff --git a/be/src/runtime/result_buffer_mgr.h b/be/src/runtime/result_buffer_mgr.h index 4e5cd38a7264b7..7995496cbf9c6d 100644 --- a/be/src/runtime/result_buffer_mgr.h +++ b/be/src/runtime/result_buffer_mgr.h @@ -29,12 +29,12 @@
#include "common/status.h" #include "gutil/ref_counted.h" -#include "runtime/descriptors.h" #include "util/countdown_latch.h" #include "util/hash_util.hpp" namespace arrow { class RecordBatch; +class Schema; } // namespace arrow namespace doris { @@ -66,8 +66,9 @@ class ResultBufferMgr { // fetch data result to Arrow Flight Server Status fetch_arrow_data(const TUniqueId& finst_id, std::shared_ptr* result); - void register_row_descriptor(const TUniqueId& query_id, const RowDescriptor& row_desc); - RowDescriptor find_row_descriptor(const TUniqueId& query_id); + void register_arrow_schema(const TUniqueId& query_id, + const std::shared_ptr& arrow_schema); + std::shared_ptr find_arrow_schema(const TUniqueId& query_id); // cancel Status cancel(const TUniqueId& fragment_id); @@ -78,7 +79,7 @@ class ResultBufferMgr { private: using BufferMap = std::unordered_map>; using TimeoutMap = std::map>; - using RowDescriptorMap = std::unordered_map; + using ArrowSchemaMap = std::unordered_map>; std::shared_ptr find_control_block(const TUniqueId& query_id); @@ -90,10 +91,10 @@ class ResultBufferMgr { std::shared_mutex _buffer_map_lock; // buffer block map BufferMap _buffer_map; - // lock for descriptor map - std::shared_mutex _row_descriptor_map_lock; + // lock for arrow schema map + std::shared_mutex _arrow_schema_map_lock; // for arrow flight - RowDescriptorMap _row_descriptor_map; + ArrowSchemaMap _arrow_schema_map; // lock for timeout map std::mutex _timeout_lock; diff --git a/be/src/runtime/routine_load/routine_load_task_executor.cpp b/be/src/runtime/routine_load/routine_load_task_executor.cpp index c9769b43b44030..ca43ee055cb2e9 100644 --- a/be/src/runtime/routine_load/routine_load_task_executor.cpp +++ b/be/src/runtime/routine_load/routine_load_task_executor.cpp @@ -33,7 +33,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/logging.h" diff --git a/be/src/runtime/runtime_predicate.cpp b/be/src/runtime/runtime_predicate.cpp index 2b949fb10e6c53..a16984ae7075f4 100644 --- a/be/src/runtime/runtime_predicate.cpp +++ b/be/src/runtime/runtime_predicate.cpp @@ -21,7 +21,6 @@ #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "olap/accept_null_predicate.h" #include "olap/column_predicate.h" diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index 7a24b86621bbfb..daf3e1bc75e0f3 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -101,11 +101,10 @@ RuntimeState::RuntimeState(const TPlanFragmentExecParams& fragment_exec_params, DCHECK(status.ok()); } -RuntimeState::RuntimeState(const TPipelineInstanceParams& pipeline_params, - const TUniqueId& query_id, int32_t fragment_id, - const TQueryOptions& query_options, const TQueryGlobals& query_globals, - ExecEnv* exec_env) - : _profile("Fragment " + print_id(pipeline_params.fragment_instance_id)), +RuntimeState::RuntimeState(const TUniqueId& instance_id, const TUniqueId& query_id, + int32_t fragment_id, const TQueryOptions& query_options, + const TQueryGlobals& query_globals, ExecEnv* exec_env) + : _profile("Fragment " + print_id(instance_id)), _load_channel_profile(""), _obj_pool(new ObjectPool()), _runtime_filter_mgr(new RuntimeFilterMgr(query_id, this)), @@ -125,11 +124,7 @@ RuntimeState::RuntimeState(const TPipelineInstanceParams& pipeline_params, _normal_row_number(0), _error_row_number(0), _error_log_file(nullptr) { - if 
(pipeline_params.__isset.runtime_filter_params) { - _runtime_filter_mgr->set_runtime_filter_params(pipeline_params.runtime_filter_params); - } - Status status = - init(pipeline_params.fragment_instance_id, query_options, query_globals, exec_env); + [[maybe_unused]] auto status = init(instance_id, query_options, query_globals, exec_env); DCHECK(status.ok()); } @@ -273,6 +268,11 @@ Status RuntimeState::init(const TUniqueId& fragment_instance_id, const TQueryOpt return Status::OK(); } +void RuntimeState::set_runtime_filter_params( + const TRuntimeFilterParams& runtime_filter_params) const { + _runtime_filter_mgr->set_runtime_filter_params(runtime_filter_params); +} + void RuntimeState::init_mem_trackers(const TUniqueId& id, const std::string& name) { _query_mem_tracker = std::make_shared( MemTrackerLimiter::Type::EXPERIMENTAL, fmt::format("{}#Id={}", name, print_id(id))); @@ -334,7 +334,7 @@ Status RuntimeState::check_query_state(const std::string& msg) { // // If the thread MemTrackerLimiter exceeds the limit, an error status is returned. // Usually used after SCOPED_ATTACH_TASK, during query execution. - if (thread_context()->thread_mem_tracker()->limit_exceeded() && + if (is_thread_context_init() && thread_context()->thread_mem_tracker()->limit_exceeded() && !config::enable_query_memory_overcommit) { auto failed_msg = fmt::format("{}, {}", msg, diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 29ef581947cf99..835bd582894e1e 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -35,14 +35,12 @@ #include #include "cctz/time_zone.h" -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/factory_creator.h" #include "common/status.h" #include "gutil/integral_types.h" #include "util/debug_util.h" #include "util/runtime_profile.h" -#include "util/telemetry/telemetry.h" namespace doris { @@ -72,9 +70,9 @@ class RuntimeState { const TQueryOptions& query_options, const TQueryGlobals& query_globals, ExecEnv* exec_env); - RuntimeState(const TPipelineInstanceParams& pipeline_params, const TUniqueId& query_id, - int32 fragment_id, const TQueryOptions& query_options, - const TQueryGlobals& query_globals, ExecEnv* exec_env); + RuntimeState(const TUniqueId& instance_id, const TUniqueId& query_id, int32 fragment_id, + const TQueryOptions& query_options, const TQueryGlobals& query_globals, + ExecEnv* exec_env); // Used by pipelineX. This runtime state is only used for setup. RuntimeState(const TUniqueId& query_id, int32 fragment_id, const TQueryOptions& query_options, @@ -93,6 +91,8 @@ class RuntimeState { Status init(const TUniqueId& fragment_instance_id, const TQueryOptions& query_options, const TQueryGlobals& query_globals, ExecEnv* exec_env); + void set_runtime_filter_params(const TRuntimeFilterParams& runtime_filter_params) const; + // for ut and non-query. 
void set_exec_env(ExecEnv* exec_env) { _exec_env = exec_env; } void init_mem_trackers(const TUniqueId& id = TUniqueId(), const std::string& name = "unknown"); @@ -307,6 +307,22 @@ class RuntimeState { int num_per_fragment_instances() const { return _num_per_fragment_instances; } + void set_load_stream_per_node(int load_stream_per_node) { + _load_stream_per_node = load_stream_per_node; + } + + int load_stream_per_node() const { return _load_stream_per_node; } + + void set_total_load_streams(int total_load_streams) { + _total_load_streams = total_load_streams; + } + + int total_load_streams() const { return _total_load_streams; } + + void set_num_local_sink(int num_local_sink) { _num_local_sink = num_local_sink; } + + int num_local_sink() const { return _num_local_sink; } + bool disable_stream_preaggregations() const { return _query_options.disable_stream_preaggregations; } @@ -317,6 +333,11 @@ class RuntimeState { return _query_options.runtime_filter_wait_time_ms; } + bool runtime_filter_wait_infinitely() const { + return _query_options.__isset.runtime_filter_wait_infinitely && + _query_options.runtime_filter_wait_infinitely; + } + int32_t runtime_filter_max_in_num() const { return _query_options.runtime_filter_max_in_num; } int be_exec_version() const { @@ -329,6 +350,10 @@ class RuntimeState { return _query_options.__isset.enable_pipeline_engine && _query_options.enable_pipeline_engine; } + bool enable_pipeline_x_exec() const { + return _query_options.__isset.enable_pipeline_x_engine && + _query_options.enable_pipeline_x_engine; + } bool enable_local_shuffle() const { return _query_options.__isset.enable_local_shuffle && _query_options.enable_local_shuffle; } @@ -367,6 +392,10 @@ class RuntimeState { return _query_options.__isset.skip_delete_bitmap && _query_options.skip_delete_bitmap; } + bool skip_missing_version() const { + return _query_options.__isset.skip_missing_version && _query_options.skip_missing_version; + } + bool enable_page_cache() const; int partitioned_hash_join_rows_threshold() const { @@ -407,10 +436,6 @@ class RuntimeState { _query_mem_tracker = tracker; } - OpentelemetryTracer get_tracer() { return _tracer; } - - void set_tracer(OpentelemetryTracer&& tracer) { _tracer = std::move(tracer); } - bool enable_profile() const { return _query_options.__isset.enable_profile && _query_options.enable_profile; } @@ -535,6 +560,9 @@ class RuntimeState { int _per_fragment_instance_idx; int _num_per_fragment_instances = 0; + int _load_stream_per_node = 0; + int _total_load_streams = 0; + int _num_local_sink = 0; // The backend id on which this fragment instance runs int64_t _backend_id = -1; @@ -583,8 +611,6 @@ class RuntimeState { // true if max_filter_ratio is 0 bool _load_zero_tolerance = false; - OpentelemetryTracer _tracer = telemetry::get_noop_tracer(); - // prohibit copies RuntimeState(const RuntimeState&); }; diff --git a/be/src/runtime/snapshot_loader.cpp b/be/src/runtime/snapshot_loader.cpp index b5752710ae8ea5..afd41907289c19 100644 --- a/be/src/runtime/snapshot_loader.cpp +++ b/be/src/runtime/snapshot_loader.cpp @@ -467,7 +467,7 @@ Status SnapshotLoader::remote_http_download( for (const auto& filename : filename_list) { std::string remote_file_url = fmt::format( - "http://{}:{}/api/_tablet/_download?token={}&file={}/{}", + "http://{}:{}/api/_tablet/_download?token={}&file={}/{}&channel=ingest_binlog", remote_tablet_snapshot.remote_be_addr.hostname, remote_tablet_snapshot.remote_be_addr.port, remote_tablet_snapshot.remote_token, 
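// Option getters such as runtime_filter_wait_infinitely() above guard every
// optional Thrift field with its __isset flag, so a frontend that never set
// the field behaves as "off" instead of reading an arbitrary default. A
// sketch with a hand-rolled stand-in for the generated struct (field names
// are illustrative, not the real TQueryOptions):
struct FakeIsset {
    bool runtime_filter_wait_infinitely = false;
};
struct FakeQueryOptions {
    bool runtime_filter_wait_infinitely = false; // value, meaningful only if set
    FakeIsset __isset;                           // which optional fields arrived
};

bool wait_infinitely(const FakeQueryOptions& opts) {
    // Checking the flag first makes "field absent" explicit and keeps the
    // behavior backward compatible with older senders.
    return opts.__isset.runtime_filter_wait_infinitely &&
           opts.runtime_filter_wait_infinitely;
}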
remote_tablet_snapshot.remote_snapshot_path, filename); @@ -696,14 +696,14 @@ Status SnapshotLoader::move(const std::string& snapshot_path, TabletSharedPtr ta } // rename the rowset ids and tabletid info in rowset meta - Status convert_status = SnapshotManager::instance()->convert_rowset_ids( + auto res = SnapshotManager::instance()->convert_rowset_ids( snapshot_path, tablet_id, tablet->replica_id(), tablet->partition_id(), schema_hash); - if (!convert_status.ok()) { - std::stringstream ss; - ss << "failed to convert rowsetids in snapshot: " << snapshot_path - << ", tablet path: " << tablet_path; - LOG(WARNING) << ss.str(); - return Status::InternalError(ss.str()); + if (!res.has_value()) [[unlikely]] { + auto err_msg = + fmt::format("failed to convert rowsetids in snapshot: {}, tablet path: {}, err: {}", + snapshot_path, tablet_path, res.error()); + LOG(WARNING) << err_msg; + return Status::InternalError(err_msg); } if (overwrite) { diff --git a/be/src/runtime/stream_load/new_load_stream_mgr.h b/be/src/runtime/stream_load/new_load_stream_mgr.h index 9c3385443659d9..61e4010a2c9182 100644 --- a/be/src/runtime/stream_load/new_load_stream_mgr.h +++ b/be/src/runtime/stream_load/new_load_stream_mgr.h @@ -45,7 +45,9 @@ class NewLoadStreamMgr { { std::lock_guard l(_lock); if (auto iter = _stream_map.find(id); iter != _stream_map.end()) { - return Status::InternalError("id already exist"); + std::stringstream ss; + ss << "id: " << id << " already exist"; + return Status::InternalError(ss.str()); } _stream_map.emplace(id, stream); } @@ -61,6 +63,7 @@ class NewLoadStreamMgr { return iter->second; } } + VLOG_NOTICE << "stream load pipe does not exist: " << id; return nullptr; } diff --git a/be/src/runtime/stream_load/stream_load_context.h b/be/src/runtime/stream_load/stream_load_context.h index ffbed37fe3abf8..2b8d271157d107 100644 --- a/be/src/runtime/stream_load/stream_load_context.h +++ b/be/src/runtime/stream_load/stream_load_context.h @@ -161,6 +161,7 @@ class StreamLoadContext { // only used to check if we receive whole body size_t body_bytes = 0; size_t receive_bytes = 0; + bool is_chunked_transfer = false; int64_t txn_id = default_txn_id; diff --git a/be/src/runtime/stream_load/stream_load_executor.cpp b/be/src/runtime/stream_load/stream_load_executor.cpp index 3dd39e56f651f0..50bb240756481d 100644 --- a/be/src/runtime/stream_load/stream_load_executor.cpp +++ b/be/src/runtime/stream_load/stream_load_executor.cpp @@ -89,7 +89,7 @@ Status StreamLoadExecutor::execute_plan_fragment(std::shared_ptrnumber_filtered_rows / num_selected_rows > ctx->max_filter_ratio) { // NOTE: Do not modify the error message here, for historical reasons, // some users may rely on this error message. 
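// The convert_rowset_ids hunk above consumes an expected-style value
// (has_value()/error()) instead of a bare Status, which carries the error
// text alongside the failure. A sketch of consuming such an API, assuming a
// C++23 toolchain with std::expected; parse_port and the fallback value are
// illustrative only:
#include <charconv>
#include <expected>
#include <string>

std::expected<int, std::string> parse_port(const std::string& s) {
    int port = 0;
    auto [ptr, ec] = std::from_chars(s.data(), s.data() + s.size(), port);
    if (ec != std::errc() || port <= 0 || port > 65535) {
        return std::unexpected("invalid port: " + s);
    }
    return port;
}

int port_or_default(const std::string& s) {
    auto res = parse_port(s);
    if (!res.has_value()) [[unlikely]] {
        // Mirrors the pattern above: format res.error() into a message,
        // then take the failure path.
        return 8030;
    }
    return *res;
}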
- *status = Status::InternalError("too many filtered rows"); + *status = Status::DataQualityError("too many filtered rows"); } if (ctx->number_filtered_rows > 0 && !state->get_error_log_file_path().empty()) { ctx->error_url = to_load_error_http_path(state->get_error_log_file_path()); @@ -190,7 +190,7 @@ Status StreamLoadExecutor::begin_txn(StreamLoadContext* ctx) { #else result = k_stream_load_begin_result; #endif - status = Status::create(result.status); + status = Status::create(result.status); } g_stream_load_begin_txn_latency << duration_ns / 1000; if (!status.ok()) { @@ -254,7 +254,7 @@ Status StreamLoadExecutor::operate_txn_2pc(StreamLoadContext* ctx) { request.__set_operation(ctx->txn_operation); request.__set_thrift_rpc_timeout_ms(config::txn_commit_rpc_timeout_ms); request.__set_label(ctx->label); - if (ctx->txn_id != ctx->default_txn_id) { + if (ctx->txn_id != doris::StreamLoadContext::default_txn_id) { request.__set_txnId(ctx->txn_id); } diff --git a/be/src/runtime/tablets_channel.cpp b/be/src/runtime/tablets_channel.cpp index 91294135a06d8f..1dc7cf5afa3990 100644 --- a/be/src/runtime/tablets_channel.cpp +++ b/be/src/runtime/tablets_channel.cpp @@ -22,7 +22,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" // IWYU pragma: no_include @@ -84,6 +83,7 @@ void TabletsChannel::_init_profile(RuntimeProfile* profile) { _slave_replica_timer = ADD_TIMER(_profile, "SlaveReplicaTime"); _add_batch_timer = ADD_TIMER(_profile, "AddBatchTime"); _write_block_timer = ADD_TIMER(_profile, "WriteBlockTime"); + _incremental_open_timer = ADD_TIMER(_profile, "IncrementalOpenTabletTime"); _memory_usage_counter = memory_usage->AddHighWaterMarkCounter("Total", TUnit::BYTES); _write_memory_usage_counter = memory_usage->AddHighWaterMarkCounter("Write", TUnit::BYTES); _flush_memory_usage_counter = memory_usage->AddHighWaterMarkCounter("Flush", TUnit::BYTES); @@ -120,13 +120,14 @@ Status TabletsChannel::open(const PTabletWriterOpenRequest& request) { } Status TabletsChannel::incremental_open(const PTabletWriterOpenRequest& params) { + SCOPED_TIMER(_incremental_open_timer); if (_state == kInitialized) { // haven't opened return open(params); } std::lock_guard l(_lock); std::vector* index_slots = nullptr; int32_t schema_hash = 0; - for (auto& index : _schema->indexes()) { + for (const auto& index : _schema->indexes()) { if (index->index_id == _index_id) { index_slots = &index->slots; schema_hash = index->schema_hash; @@ -137,14 +138,12 @@ Status TabletsChannel::incremental_open(const PTabletWriterOpenRequest& params) return Status::InternalError("unknown index id, key={}", _key.to_string()); } // update tablets - std::vector tablet_ids; - tablet_ids.reserve(params.tablets_size()); size_t incremental_tablet_num = 0; std::stringstream ss; ss << "LocalTabletsChannel txn_id: " << _txn_id << " load_id: " << print_id(params.id()) << " incremental open delta writer: "; - for (auto& tablet : params.tablets()) { + for (const auto& tablet : params.tablets()) { if (_tablet_writers.find(tablet.tablet_id()) != _tablet_writers.end()) { continue; } diff --git a/be/src/runtime/tablets_channel.h b/be/src/runtime/tablets_channel.h index fe9c226829d794..4dca9050331cf2 100644 --- a/be/src/runtime/tablets_channel.h +++ b/be/src/runtime/tablets_channel.h @@ -196,6 +196,7 @@ class TabletsChannel { RuntimeProfile::Counter* _slave_replica_timer = nullptr; RuntimeProfile::Counter* _add_batch_timer = nullptr; RuntimeProfile::Counter* _write_block_timer = 
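// SCOPED_TIMER(_incremental_open_timer) above is an RAII timer that adds the
// scope's wall time to a profile counter on destruction. A freestanding
// sketch of the same idea; ScopedTimer here is illustrative, not the Doris
// RuntimeProfile implementation:
#include <atomic>
#include <chrono>

class ScopedTimer {
public:
    explicit ScopedTimer(std::atomic<long long>* counter_ns)
            : _counter_ns(counter_ns), _start(std::chrono::steady_clock::now()) {}

    ~ScopedTimer() {
        auto elapsed = std::chrono::steady_clock::now() - _start;
        _counter_ns->fetch_add(
                std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count());
    }

private:
    std::atomic<long long>* _counter_ns;
    std::chrono::steady_clock::time_point _start;
};

// Usage: one counter per named phase, timed by entering a scope.
std::atomic<long long> g_incremental_open_ns{0};
void incremental_open_like_work() {
    ScopedTimer t(&g_incremental_open_ns);
    // ... open tablet writers ...
}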
nullptr; + RuntimeProfile::Counter* _incremental_open_timer = nullptr; }; template diff --git a/be/src/runtime/task_group/task_group_manager.cpp b/be/src/runtime/task_group/task_group_manager.cpp index b3c24fa96e7c58..fb940069787666 100644 --- a/be/src/runtime/task_group/task_group_manager.cpp +++ b/be/src/runtime/task_group/task_group_manager.cpp @@ -60,7 +60,8 @@ void TaskGroupManager::get_resource_groups(const std::function lock(_task_scheduler_lock); // step 1: init cgroup cpu controller @@ -117,7 +118,14 @@ Status TaskGroupManager::create_and_get_task_scheduler(uint64_t tg_id, std::stri query_ctx_ptr->set_scan_task_scheduler(scan_task_sche); // step 5 update cgroup cpu if needed - _cgroup_ctl_map.at(tg_id)->update_cpu_hard_limit(cpu_hard_limit); + if (cpu_hard_limit > 0) { + _cgroup_ctl_map.at(tg_id)->update_cpu_hard_limit(cpu_hard_limit); + _cgroup_ctl_map.at(tg_id)->update_cpu_soft_limit(CPU_SOFT_LIMIT_DEFAULT_VALUE); + } else { + _cgroup_ctl_map.at(tg_id)->update_cpu_soft_limit(cpu_shares); + _cgroup_ctl_map.at(tg_id)->update_cpu_hard_limit( + CPU_HARD_LIMIT_DEFAULT_VALUE); // disable cpu hard limit + } return Status::OK(); } diff --git a/be/src/runtime/task_group/task_group_manager.h b/be/src/runtime/task_group/task_group_manager.h index ae501e93f3e781..cf44f5354406c7 100644 --- a/be/src/runtime/task_group/task_group_manager.h +++ b/be/src/runtime/task_group/task_group_manager.h @@ -52,7 +52,8 @@ class TaskGroupManager { std::vector* task_groups); Status create_and_get_task_scheduler(uint64_t wg_id, std::string wg_name, int cpu_hard_limit, - ExecEnv* exec_env, QueryContext* query_ctx_ptr); + int cpu_shares, ExecEnv* exec_env, + QueryContext* query_ctx_ptr); void delete_task_group_by_ids(std::set id_set); diff --git a/be/src/runtime/thread_context.cpp b/be/src/runtime/thread_context.cpp index a092385f410bc5..cd03c67b9935de 100644 --- a/be/src/runtime/thread_context.cpp +++ b/be/src/runtime/thread_context.cpp @@ -23,22 +23,15 @@ namespace doris { class MemTracker; -DEFINE_STATIC_THREAD_LOCAL(ThreadContext, ThreadContextPtr, _ptr); - -ThreadContextPtr::ThreadContextPtr() { - INIT_STATIC_THREAD_LOCAL(ThreadContext, _ptr); - init = true; -} - AttachTask::AttachTask(const std::shared_ptr& mem_tracker, const TUniqueId& task_id, const TUniqueId& fragment_instance_id) { - SwitchBthreadLocal::switch_to_bthread_local(); + ThreadLocalHandle::create_thread_local_if_not_exits(); signal::set_signal_task_id(task_id); thread_context()->attach_task(task_id, fragment_instance_id, mem_tracker); } AttachTask::AttachTask(RuntimeState* runtime_state) { - SwitchBthreadLocal::switch_to_bthread_local(); + ThreadLocalHandle::create_thread_local_if_not_exits(); signal::set_signal_task_id(runtime_state->query_id()); thread_context()->attach_task(runtime_state->query_id(), runtime_state->fragment_instance_id(), runtime_state->query_mem_tracker()); @@ -46,11 +39,11 @@ AttachTask::AttachTask(RuntimeState* runtime_state) { AttachTask::~AttachTask() { thread_context()->detach_task(); - SwitchBthreadLocal::switch_back_pthread_local(); + ThreadLocalHandle::del_thread_local_if_count_is_zero(); } AddThreadMemTrackerConsumer::AddThreadMemTrackerConsumer(MemTracker* mem_tracker) { - SwitchBthreadLocal::switch_to_bthread_local(); + ThreadLocalHandle::create_thread_local_if_not_exits(); if (mem_tracker) { _need_pop = thread_context()->thread_mem_tracker_mgr->push_consumer_tracker(mem_tracker); } @@ -59,7 +52,7 @@ AddThreadMemTrackerConsumer::AddThreadMemTrackerConsumer(MemTracker* mem_tracker 
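// The task_group_manager hunk above treats the two cgroup CPU knobs as
// mutually exclusive: a positive hard limit wins and shares reset to their
// default, otherwise shares apply and the hard limit is disabled. A sketch
// of that decision; the constants are assumptions mirroring
// CPU_HARD_LIMIT_DEFAULT_VALUE / CPU_SOFT_LIMIT_DEFAULT_VALUE:
constexpr int kCpuHardLimitDisabled = -1;  // assumption: "no hard limit"
constexpr int kCpuSoftLimitDefault = 1024; // assumption: default cgroup shares

struct CpuLimits {
    int hard_limit; // percent of CPU, <= 0 means "not set"
    int soft_limit; // cgroup cpu.shares-style weight
};

CpuLimits resolve_cpu_limits(int cpu_hard_limit, int cpu_shares) {
    if (cpu_hard_limit > 0) {
        // Hard limit takes precedence; shares fall back to the default so the
        // group is not additionally throttled by a stale soft value.
        return {cpu_hard_limit, kCpuSoftLimitDefault};
    }
    // No hard limit requested: apply shares and disable the hard limit.
    return {kCpuHardLimitDisabled, cpu_shares};
}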
AddThreadMemTrackerConsumer::AddThreadMemTrackerConsumer( const std::shared_ptr& mem_tracker) : _mem_tracker(mem_tracker) { - SwitchBthreadLocal::switch_to_bthread_local(); + ThreadLocalHandle::create_thread_local_if_not_exits(); if (_mem_tracker) { _need_pop = thread_context()->thread_mem_tracker_mgr->push_consumer_tracker(_mem_tracker.get()); @@ -70,7 +63,22 @@ AddThreadMemTrackerConsumer::~AddThreadMemTrackerConsumer() { if (_need_pop) { thread_context()->thread_mem_tracker_mgr->pop_consumer_tracker(); } - SwitchBthreadLocal::switch_back_pthread_local(); + ThreadLocalHandle::del_thread_local_if_count_is_zero(); +} + +AddThreadMemTrackerConsumerByHook::AddThreadMemTrackerConsumerByHook( + const std::shared_ptr& mem_tracker) + : _mem_tracker(mem_tracker) { + ThreadLocalHandle::create_thread_local_if_not_exits(); + DCHECK(mem_tracker != nullptr); + use_mem_hook = true; + thread_context()->thread_mem_tracker_mgr->push_consumer_tracker(_mem_tracker.get()); +} + +AddThreadMemTrackerConsumerByHook::~AddThreadMemTrackerConsumerByHook() { + thread_context()->thread_mem_tracker_mgr->pop_consumer_tracker(); + use_mem_hook = false; + ThreadLocalHandle::del_thread_local_if_count_is_zero(); } } // namespace doris diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index a5c9473d665a0c..1999549049936b 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -33,64 +33,76 @@ #include "runtime/exec_env.h" #include "runtime/memory/mem_tracker_limiter.h" #include "runtime/memory/thread_mem_tracker_mgr.h" -#include "runtime/threadlocal.h" #include "util/defer_op.h" // IWYU pragma: keep -// Used to observe the memory usage of the specified code segment -#if defined(USE_MEM_TRACKER) && !defined(UNDEFINED_BEHAVIOR_SANITIZER) -// Count a code segment memory (memory malloc - memory free) to int64_t -// Usage example: int64_t scope_mem = 0; { SCOPED_MEM_COUNT(&scope_mem); xxx; xxx; } -#define SCOPED_MEM_COUNT(scope_mem) \ - auto VARNAME_LINENUM(scope_mem_count) = doris::ScopeMemCount(scope_mem) - -// Count a code segment memory (memory malloc - memory free) to MemTracker. -// Compared to count `scope_mem`, MemTracker is easier to observe from the outside and is thread-safe. -// Usage example: std::unique_ptr tracker = std::make_unique("first_tracker"); -// { SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); xxx; xxx; } -// Usually used to record query more detailed memory, including ExecNode operators. -#define SCOPED_CONSUME_MEM_TRACKER(mem_tracker) \ - auto VARNAME_LINENUM(add_mem_consumer) = doris::AddThreadMemTrackerConsumer(mem_tracker) -#else -#define SCOPED_MEM_COUNT(scope_mem) (void)0 -#define SCOPED_CONSUME_MEM_TRACKER(mem_tracker) (void)0 -#endif - -// Used to observe query/load/compaction/e.g. execution thread memory usage and respond when memory exceeds the limit. -#if defined(USE_MEM_TRACKER) && !defined(UNDEFINED_BEHAVIOR_SANITIZER) +// Used to tracking query/load/compaction/e.g. execution thread memory usage. +// This series of methods saves some information to the thread local context of the current worker thread, +// including MemTracker, QueryID, etc. Use CONSUME_THREAD_MEM_TRACKER/RELEASE_THREAD_MEM_TRACKER in the code segment where +// the macro is located to record the memory into MemTracker. +// Not use it in rpc done.run(), because bthread_setspecific may have errors when UBSAN compiles. +#if defined(USE_MEM_TRACKER) && !defined(UNDEFINED_BEHAVIOR_SANITIZER) && !defined(BE_TEST) // Attach to query/load/compaction/e.g. 
when thread starts. // This will save some info about a working thread in the thread context. -// And count the memory during thread execution (is actually also the code segment that executes the function) -// to specify MemTrackerLimiter, and expect to handle when the memory exceeds the limit, for example cancel query. -// Usage is similar to SCOPED_CONSUME_MEM_TRACKER. +// Looking forward to tracking memory during thread execution into MemTrackerLimiter. #define SCOPED_ATTACH_TASK(arg1) auto VARNAME_LINENUM(attach_task) = AttachTask(arg1) - #define SCOPED_ATTACH_TASK_WITH_ID(arg1, arg2, arg3) \ auto VARNAME_LINENUM(attach_task) = AttachTask(arg1, arg2, arg3) // Switch MemTrackerLimiter for count memory during thread execution. -// Usually used after SCOPED_ATTACH_TASK, in order to count the memory of the specified code segment into another +// Used after SCOPED_ATTACH_TASK, in order to count the memory into another // MemTrackerLimiter instead of the MemTrackerLimiter added by the attach task. -// Note that, not use it in rpc done.run(), because bthread_setspecific may have errors when UBSAN compiles. #define SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(mem_tracker_limiter) \ auto VARNAME_LINENUM(switch_mem_tracker) = SwitchThreadMemTrackerLimiter(mem_tracker_limiter) -// Usually used to exclude a part of memory in query or load mem tracker and track it to Orphan Mem Tracker. -// Note that, not check whether it is currently a Bthread and switch Bthread Local, because this is usually meaningless, -// if used in Bthread, and pthread switching is expected, use SwitchThreadMemTrackerLimiter. -#define SCOPED_TRACK_MEMORY_TO_UNKNOWN() \ - auto VARNAME_LINENUM(track_memory_to_unknown) = TrackMemoryToUnknown() +// Looking forward to tracking memory during thread execution into MemTracker. +// Usually used to record query more detailed memory, including ExecNode operators. +#define SCOPED_CONSUME_MEM_TRACKER(mem_tracker) \ + auto VARNAME_LINENUM(add_mem_consumer) = doris::AddThreadMemTrackerConsumer(mem_tracker) #else #define SCOPED_ATTACH_TASK(arg1, ...) (void)0 #define SCOPED_ATTACH_TASK_WITH_ID(arg1, arg2, arg3) (void)0 #define SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(mem_tracker_limiter) (void)0 -#define SCOPED_TRACK_MEMORY_TO_UNKNOWN() (void)0 +#define SCOPED_CONSUME_MEM_TRACKER(mem_tracker) (void)0 #endif -#define SKIP_MEMORY_CHECK(...) \ - do { \ - doris::thread_context()->skip_memory_check++; \ - DEFER({ doris::thread_context()->skip_memory_check--; }); \ - __VA_ARGS__; \ +// Used to tracking the memory usage of the specified code segment use by mem hook. +#if defined(USE_MEM_TRACKER) +// Count a code segment memory (memory malloc - memory free) to int64_t +// Usage example: int64_t scope_mem = 0; { SCOPED_MEM_COUNT(&scope_mem); xxx; xxx; } +#define SCOPED_MEM_COUNT_BY_HOOK(scope_mem) \ + auto VARNAME_LINENUM(scope_mem_count) = doris::ScopeMemCountByHook(scope_mem) + +// Count a code segment memory (memory malloc - memory free) to MemTracker. +// Compared to count `scope_mem`, MemTracker is easier to observe from the outside and is thread-safe. 
+// Usage example: std::unique_ptr tracker = std::make_unique("first_tracker"); +// { SCOPED_CONSUME_MEM_TRACKER_BY_HOOK(_mem_tracker.get()); xxx; xxx; } +#define SCOPED_CONSUME_MEM_TRACKER_BY_HOOK(mem_tracker) \ + auto VARNAME_LINENUM(add_mem_consumer) = doris::AddThreadMemTrackerConsumerByHook(mem_tracker) +#else +#define SCOPED_MEM_COUNT_BY_HOOK(scope_mem) (void)0 +#define SCOPED_CONSUME_MEM_TRACKER_BY_HOOK(mem_tracker) (void)0 +#endif + +#define SKIP_MEMORY_CHECK(...) \ + do { \ + doris::ThreadLocalHandle::create_thread_local_if_not_exits(); \ + doris::thread_context()->skip_memory_check++; \ + DEFER({ \ + doris::thread_context()->skip_memory_check--; \ + doris::ThreadLocalHandle::del_thread_local_if_count_is_zero(); \ + }); \ + __VA_ARGS__; \ + } while (0) + +#define SKIP_LARGE_MEMORY_CHECK(...) \ + do { \ + doris::ThreadLocalHandle::create_thread_local_if_not_exits(); \ + doris::thread_context()->skip_large_memory_check++; \ + DEFER({ \ + doris::thread_context()->skip_large_memory_check--; \ + doris::ThreadLocalHandle::del_thread_local_if_count_is_zero(); \ + }); \ + __VA_ARGS__; \ } while (0) namespace doris { @@ -101,63 +113,26 @@ class RuntimeState; extern bthread_key_t btls_key; -// Using gcc11 compiles thread_local variable on lower versions of GLIBC will report an error, -// see https://github.com/apache/doris/pull/7911 -// -// If we want to avoid this error, -// 1. For non-trivial variables in thread_local, such as std::string, you need to store them as pointers to -// ensure that thread_local is trivial, these non-trivial pointers will uniformly call destructors elsewhere. -// 2. The default destructor of the thread_local variable cannot be overridden. -// -// This is difficult to implement. Because the destructor is not overwritten, it means that the outside cannot -// be notified when the thread terminates, and the non-trivial pointers in thread_local cannot be released in time. -// The func provided by pthread and std::thread doesn't help either. -// -// So, kudu Class-scoped static thread local implementation was introduced. Solve the above problem by -// Thread-scoped thread local + Class-scoped thread local. -// -// This may look very trick, but it's the best way I can find. -// -// refer to: -// https://gcc.gnu.org/onlinedocs/gcc-3.3.1/gcc/Thread-Local.html -// https://stackoverflow.com/questions/12049684/ -// https://sourceware.org/glibc/wiki/Destructor%20support%20for%20thread_local%20variables -// https://www.jianshu.com/p/756240e837dd -// https://man7.org/linux/man-pages/man3/pthread_tryjoin_np.3.html -class ThreadContextPtr { -public: - ThreadContextPtr(); - // Cannot add destructor `~ThreadContextPtr`, otherwise it will no longer be of type POD, the reason is as above. - - // TCMalloc hook is triggered during ThreadContext construction, which may lead to deadlock. - bool init = false; - - DECLARE_STATIC_THREAD_LOCAL(ThreadContext, _ptr); -}; - -inline thread_local ThreadContextPtr thread_context_ptr; - -// To avoid performance problems caused by frequently calling `bthread_getspecific` to obtain bthread TLS -// in tcmalloc hook, cache the key and value of bthread TLS in pthread TLS. -inline thread_local ThreadContext* bthread_context; -inline thread_local bthread_t bthread_id; +// Is true after ThreadContext construction. +inline thread_local bool pthread_context_ptr_init = false; +inline thread_local constinit ThreadContext* thread_context_ptr; +// use mem hook to consume thread mem tracker. 
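// SKIP_MEMORY_CHECK above brackets a statement list with a counter increment
// and a deferred decrement, so checks can be suppressed re-entrantly and are
// restored even if the wrapped code exits the scope early. A sketch of the
// Defer-based shape; ScopeGuard stands in for Doris's DEFER helper:
#include <functional>
#include <utility>

class ScopeGuard {
public:
    explicit ScopeGuard(std::function<void()> fn) : _fn(std::move(fn)) {}
    ~ScopeGuard() { _fn(); } // runs on every exit path of the scope
private:
    std::function<void()> _fn;
};

thread_local int g_skip_memory_check = 0;

#define SKIP_CHECK_SCOPE(...)                            \
    do {                                                 \
        g_skip_memory_check++;                           \
        ScopeGuard guard([] { g_skip_memory_check--; }); \
        __VA_ARGS__;                                     \
    } while (0)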
+inline thread_local bool use_mem_hook = false; // The thread context saves some info about a working thread. // 2 required info: // 1. thread_id: Current thread id, Auto generated. -// 2. type: The type is a enum value indicating which type of task current thread is running. +// 2. type(abolished): The type is a enum value indicating which type of task current thread is running. // For example: QUERY, LOAD, COMPACTION, ... // 3. task id: A unique id to identify this task. maybe query id, load job id, etc. +// 4. ThreadMemTrackerMgr // // There may be other optional info to be added later. class ThreadContext { public: - ThreadContext() { - thread_mem_tracker_mgr.reset(new ThreadMemTrackerMgr()); - if (ExecEnv::ready()) thread_mem_tracker_mgr->init(); - } + ThreadContext() { thread_mem_tracker_mgr = std::make_unique(); } - ~ThreadContext() { thread_context_ptr.init = false; } + ~ThreadContext() = default; void attach_task(const TUniqueId& task_id, const TUniqueId& fragment_instance_id, const std::shared_ptr& mem_tracker) { @@ -179,118 +154,140 @@ class ThreadContext { thread_mem_tracker_mgr->detach_limiter_tracker(); } - const TUniqueId& task_id() const { return _task_id; } - const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; } + [[nodiscard]] const TUniqueId& task_id() const { return _task_id; } + [[nodiscard]] const TUniqueId& fragment_instance_id() const { return _fragment_instance_id; } - std::string get_thread_id() { + static std::string get_thread_id() { std::stringstream ss; ss << std::this_thread::get_id(); return ss.str(); } - // After thread_mem_tracker_mgr is initialized, the current thread TCMalloc Hook starts to + // After thread_mem_tracker_mgr is initialized, the current thread Hook starts to // consume/release mem_tracker. // Note that the use of shared_ptr will cause a crash. The guess is that there is an // intermediate state during the copy construction of shared_ptr. Shared_ptr is not equal // to nullptr, but the object it points to is not initialized. At this time, when the memory - // is released somewhere, the TCMalloc hook is triggered to cause the crash. + // is released somewhere, the hook is triggered to cause the crash. std::unique_ptr thread_mem_tracker_mgr; - MemTrackerLimiter* thread_mem_tracker() { + [[nodiscard]] MemTrackerLimiter* thread_mem_tracker() const { return thread_mem_tracker_mgr->limiter_mem_tracker_raw(); } void consume_memory(const int64_t size) const { - thread_mem_tracker_mgr->consume(size, large_memory_check); + thread_mem_tracker_mgr->consume(size, skip_large_memory_check); } - int switch_bthread_local_count = 0; + int thread_local_handle_count = 0; int skip_memory_check = 0; - bool large_memory_check = true; + int skip_large_memory_check = 0; private: TUniqueId _task_id; TUniqueId _fragment_instance_id; }; -// Switch thread context from pthread local to bthread local context. -// Cache the pointer of bthread local in pthead local, -// Avoid calling bthread_getspecific frequently to get bthread local, which has performance problems. -class SwitchBthreadLocal { +class ThreadLocalHandle { public: - static void switch_to_bthread_local() { - if (bthread_self() != 0) { - // Very frequent bthread_getspecific will slow, but switch_to_bthread_local is not expected to be much. 
- bthread_context = static_cast(bthread_getspecific(btls_key)); + static void create_thread_local_if_not_exits() { + if (bthread_self() == 0) { + if (!pthread_context_ptr_init) { + thread_context_ptr = new ThreadContext(); + pthread_context_ptr_init = true; + } + DCHECK(thread_context_ptr != nullptr); + thread_context_ptr->thread_local_handle_count++; + } else { + // Avoid calling bthread_getspecific frequently to get bthread local. + // Very frequent bthread_getspecific will slow, but create_thread_local_if_not_exits is not expected to be much. + // Cache the pointer of bthread local in pthead local. + auto* bthread_context = static_cast(bthread_getspecific(btls_key)); if (bthread_context == nullptr) { - // A new bthread starts, two scenarios: + // If bthread_context == nullptr: // 1. First call to bthread_getspecific (and before any bthread_setspecific) returns NULL // 2. There are not enough reusable btls in btls pool. - // else, two scenarios: + // else if bthread_context != nullptr: // 1. A new bthread starts, but get a reuses btls. - // 2. A pthread switch occurs. Because the pthread switch cannot be accurately identified at the moment. - // So tracker call reset 0 like reuses btls. - // during this period, stop the use of thread_context. - thread_context_ptr.init = false; bthread_context = new ThreadContext; // The brpc server should respond as quickly as possible. bthread_context->thread_mem_tracker_mgr->disable_wait_gc(); // set the data so that next time bthread_getspecific in the thread returns the data. CHECK(0 == bthread_setspecific(btls_key, bthread_context) || k_doris_exit); - thread_context_ptr.init = true; } - bthread_id = bthread_self(); - bthread_context->switch_bthread_local_count++; + DCHECK(bthread_context != nullptr); + bthread_context->thread_local_handle_count++; } } - // `switch_to_bthread_local` and `switch_back_pthread_local` should be used in pairs, - // `switch_to_bthread_local` should only be called if `switch_to_bthread_local` returns true - static void switch_back_pthread_local() { - if (bthread_self() != 0) { - if (!bthread_equal(bthread_self(), bthread_id)) { - bthread_id = bthread_self(); - bthread_context = static_cast(bthread_getspecific(btls_key)); - DCHECK(bthread_context != nullptr); - } - bthread_context->switch_bthread_local_count--; - if (bthread_context->switch_bthread_local_count == 0) { - bthread_context = thread_context_ptr._ptr; + // `create_thread_local_if_not_exits` and `del_thread_local_if_count_is_zero` should be used in pairs, + // `del_thread_local_if_count_is_zero` should only be called if `create_thread_local_if_not_exits` returns true + static void del_thread_local_if_count_is_zero() { + if (pthread_context_ptr_init) { + // in pthread + thread_context_ptr->thread_local_handle_count--; + if (thread_context_ptr->thread_local_handle_count == 0) { + pthread_context_ptr_init = false; + delete doris::thread_context_ptr; + thread_context_ptr = nullptr; } + } else if (bthread_self() != 0) { + // in bthread + auto* bthread_context = static_cast(bthread_getspecific(btls_key)); + DCHECK(bthread_context != nullptr); + bthread_context->thread_local_handle_count--; + } else { + LOG(FATAL) << "__builtin_unreachable"; + __builtin_unreachable(); } } }; -// Note: All use of thread_context() in bthread requires the use of SwitchBthreadLocal. 
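// ThreadLocalHandle above refcounts the thread-local context: construction
// creates it on first use and bumps a handle count, teardown decrements and
// deletes at zero, so nested guards share one context. The pthread-side
// mechanism, distilled (Ctx/CtxHandle are stand-ins, and the bthread branch
// is omitted):
struct Ctx {
    int handle_count = 0;
};
thread_local Ctx* tls_ctx = nullptr;

class CtxHandle {
public:
    CtxHandle() {
        if (tls_ctx == nullptr) {
            tls_ctx = new Ctx(); // lazily created on first pin
        }
        tls_ctx->handle_count++;
    }
    ~CtxHandle() {
        if (--tls_ctx->handle_count == 0) {
            delete tls_ctx; // last guard out frees the context
            tls_ctx = nullptr;
        }
    }
};

void worker() {
    CtxHandle guard; // nested guards in callees reuse the same Ctx
    // ... use tls_ctx ...
}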
+[[maybe_unused]] static bool is_thread_context_init() { + if (pthread_context_ptr_init) { + // in pthread + DCHECK(bthread_self() == 0); + DCHECK(thread_context_ptr != nullptr); + return true; + } + if (bthread_self() != 0) { + // in bthread + return static_cast(bthread_getspecific(btls_key)) != nullptr; + } + return false; +} + +// must call create_thread_local_if_not_exits() before use thread_context(). static ThreadContext* thread_context() { + if (pthread_context_ptr_init) { + // in pthread + DCHECK(bthread_self() == 0); + DCHECK(thread_context_ptr != nullptr); + return thread_context_ptr; + } if (bthread_self() != 0) { // in bthread - if (!bthread_equal(bthread_self(), bthread_id)) { - // bthread switching pthread may be very frequent, remember not to use lock or other time-consuming operations. - bthread_id = bthread_self(); - bthread_context = static_cast(bthread_getspecific(btls_key)); - // if nullptr, a new bthread task start and no reusable bthread local, - // or bthread switch pthread but not call switch_to_bthread_local, use pthread local context - // else, bthread switch pthread and called switch_to_bthread_local, use bthread local context. - if (bthread_context == nullptr) { - bthread_context = thread_context_ptr._ptr; - } - } + // bthread switching pthread may be very frequent, remember not to use lock or other time-consuming operations. + auto* bthread_context = static_cast(bthread_getspecific(btls_key)); + DCHECK(bthread_context != nullptr); return bthread_context; - } else { - // in pthread - return thread_context_ptr._ptr; } + LOG(FATAL) << "__builtin_unreachable"; + __builtin_unreachable(); } -class ScopeMemCount { +class ScopeMemCountByHook { public: - explicit ScopeMemCount(int64_t* scope_mem) { + explicit ScopeMemCountByHook(int64_t* scope_mem) { + ThreadLocalHandle::create_thread_local_if_not_exits(); + use_mem_hook = true; _scope_mem = scope_mem; thread_context()->thread_mem_tracker_mgr->start_count_scope_mem(); } - ~ScopeMemCount() { + ~ScopeMemCountByHook() { *_scope_mem += thread_context()->thread_mem_tracker_mgr->stop_count_scope_mem(); + use_mem_hook = false; + ThreadLocalHandle::del_thread_local_if_count_is_zero(); } private: @@ -311,44 +308,20 @@ class AttachTask { class SwitchThreadMemTrackerLimiter { public: explicit SwitchThreadMemTrackerLimiter(const std::shared_ptr& mem_tracker) { - SwitchBthreadLocal::switch_to_bthread_local(); + ThreadLocalHandle::create_thread_local_if_not_exits(); _old_mem_tracker = thread_context()->thread_mem_tracker_mgr->limiter_mem_tracker(); thread_context()->thread_mem_tracker_mgr->attach_limiter_tracker(mem_tracker, TUniqueId()); } ~SwitchThreadMemTrackerLimiter() { thread_context()->thread_mem_tracker_mgr->detach_limiter_tracker(_old_mem_tracker); - SwitchBthreadLocal::switch_back_pthread_local(); + ThreadLocalHandle::del_thread_local_if_count_is_zero(); } private: std::shared_ptr _old_mem_tracker; }; -class TrackMemoryToUnknown { -public: - explicit TrackMemoryToUnknown() { - if (bthread_self() != 0) { - _tid = std::this_thread::get_id(); // save pthread id - } - _old_mem_tracker = thread_context()->thread_mem_tracker_mgr->limiter_mem_tracker(); - thread_context()->thread_mem_tracker_mgr->attach_limiter_tracker( - ExecEnv::GetInstance()->orphan_mem_tracker(), TUniqueId()); - } - - ~TrackMemoryToUnknown() { - if (bthread_self() != 0) { - // make sure pthread is not switch, if switch, mem tracker will be wrong, but not crash in release - DCHECK(_tid == std::this_thread::get_id()); - } - 
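// ScopeMemCountByHook above measures a code segment's net allocations by
// flipping a thread-local flag that the malloc/free hook consults, then
// reading the delta on exit. A sketch of the flag-plus-delta pattern, with
// the hook simulated by count():
thread_local bool g_hook_enabled = false;
thread_local long long g_thread_mem = 0;

// Called from the (simulated) malloc/free hook with +size or -size.
void count(long long bytes) {
    if (g_hook_enabled) g_thread_mem += bytes;
}

class ScopeMemCount {
public:
    explicit ScopeMemCount(long long* out) : _out(out) {
        g_hook_enabled = true;
        _start = g_thread_mem;
    }
    ~ScopeMemCount() {
        *_out += g_thread_mem - _start; // net bytes = mallocs - frees
        g_hook_enabled = false;
    }

private:
    long long* _out;
    long long _start = 0;
};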
thread_context()->thread_mem_tracker_mgr->detach_limiter_tracker(_old_mem_tracker); - } - -private: - std::shared_ptr _old_mem_tracker; - std::thread::id _tid; -}; - class AddThreadMemTrackerConsumer { public: // The owner and user of MemTracker are in the same thread, and the raw pointer is faster. @@ -365,48 +338,94 @@ class AddThreadMemTrackerConsumer { bool _need_pop = false; }; -// Basic macros for mem tracker, usually do not need to be modified and used. -#ifdef USE_MEM_TRACKER -// For the memory that cannot be counted by mem hook, manually count it into the mem tracker, such as mmap. -#define CONSUME_THREAD_MEM_TRACKER(size) doris::thread_context()->consume_memory(size) -#define RELEASE_THREAD_MEM_TRACKER(size) doris::thread_context()->consume_memory(-size) +class AddThreadMemTrackerConsumerByHook { +public: + explicit AddThreadMemTrackerConsumerByHook(const std::shared_ptr& mem_tracker); + ~AddThreadMemTrackerConsumerByHook(); + +private: + std::shared_ptr _mem_tracker = nullptr; +}; +// Basic macros for mem tracker, usually do not need to be modified and used. +#if defined(USE_MEM_TRACKER) && !defined(BE_TEST) // used to fix the tracking accuracy of caches. -#define THREAD_MEM_TRACKER_TRANSFER_TO(size, tracker) \ - doris::thread_context()->thread_mem_tracker_mgr->limiter_mem_tracker_raw()->transfer_to( \ - size, tracker) -#define THREAD_MEM_TRACKER_TRANSFER_FROM(size, tracker) \ - tracker->transfer_to( \ - size, doris::thread_context()->thread_mem_tracker_mgr->limiter_mem_tracker_raw()) - -// Mem Hook to consume thread mem tracker -// TODO: In the original design, the MemTracker consume method is called before the memory is allocated. -// If the consume succeeds, the memory is actually allocated, otherwise an exception is thrown. -// But the statistics of memory through TCMalloc new/delete Hook are after the memory is actually allocated, -// which is different from the previous behavior. -#define CONSUME_MEM_TRACKER(size) \ +#define THREAD_MEM_TRACKER_TRANSFER_TO(size, tracker) \ + do { \ + if (is_thread_context_init()) { \ + doris::thread_context() \ + ->thread_mem_tracker_mgr->limiter_mem_tracker_raw() \ + ->transfer_to(size, tracker); \ + } else { \ + doris::ExecEnv::GetInstance()->orphan_mem_tracker_raw()->transfer_to(size, tracker); \ + } \ + } while (0) + +#define THREAD_MEM_TRACKER_TRANSFER_FROM(size, tracker) \ + do { \ + if (is_thread_context_init()) { \ + tracker->transfer_to( \ + size, \ + doris::thread_context()->thread_mem_tracker_mgr->limiter_mem_tracker_raw()); \ + } else { \ + tracker->transfer_to(size, doris::ExecEnv::GetInstance()->orphan_mem_tracker_raw()); \ + } \ + } while (0) + +// Mem Hook to consume thread mem tracker, not use in brpc thread. +#define CONSUME_THREAD_MEM_TRACKER_BY_HOOK(size) \ + do { \ + if (doris::use_mem_hook) { \ + DCHECK(doris::pthread_context_ptr_init); \ + DCHECK(bthread_self() == 0); \ + doris::thread_context_ptr->consume_memory(size); \ + } \ + } while (0) +#define RELEASE_THREAD_MEM_TRACKER_BY_HOOK(size) CONSUME_THREAD_MEM_TRACKER_BY_HOOK(-size) +#define CONSUME_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN(size_fn, ...) \ + do { \ + if (doris::use_mem_hook) { \ + DCHECK(doris::pthread_context_ptr_init); \ + DCHECK(bthread_self() == 0); \ + doris::thread_context_ptr->consume_memory(size_fn(__VA_ARGS__)); \ + } \ + } while (0) +#define RELEASE_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN(size_fn, ...) 
\ + do { \ + if (doris::use_mem_hook) { \ + DCHECK(doris::pthread_context_ptr_init); \ + DCHECK(bthread_self() == 0); \ + doris::thread_context_ptr->consume_memory(-size_fn(__VA_ARGS__)); \ + } \ + } while (0) + +// if use mem hook, avoid repeated consume. +// must call create_thread_local_if_not_exits() before use thread_context(). +#define CONSUME_THREAD_MEM_TRACKER(size) \ do { \ - if (doris::thread_context_ptr.init) { \ - doris::thread_context()->consume_memory(size); \ + if (doris::use_mem_hook) { \ + break; \ + } \ + if (doris::pthread_context_ptr_init) { \ + DCHECK(bthread_self() == 0); \ + doris::thread_context_ptr->consume_memory(size); \ + } else if (bthread_self() != 0) { \ + static_cast(bthread_getspecific(doris::btls_key)) \ + ->consume_memory(size); \ } else if (doris::ExecEnv::ready()) { \ doris::ExecEnv::GetInstance()->orphan_mem_tracker_raw()->consume_no_update_peak(size); \ } \ } while (0) -#define RELEASE_MEM_TRACKER(size) \ - do { \ - if (doris::thread_context_ptr.init) { \ - doris::thread_context()->consume_memory(-size); \ - } else if (doris::ExecEnv::ready()) { \ - doris::ExecEnv::GetInstance()->orphan_mem_tracker_raw()->consume_no_update_peak( \ - -size); \ - } \ - } while (0) +#define RELEASE_THREAD_MEM_TRACKER(size) CONSUME_THREAD_MEM_TRACKER(-size) + #else -#define CONSUME_THREAD_MEM_TRACKER(size) (void)0 -#define RELEASE_THREAD_MEM_TRACKER(size) (void)0 #define THREAD_MEM_TRACKER_TRANSFER_TO(size, tracker) (void)0 #define THREAD_MEM_TRACKER_TRANSFER_FROM(size, tracker) (void)0 -#define CONSUME_MEM_TRACKER(size) (void)0 -#define RELEASE_MEM_TRACKER(size) (void)0 +#define CONSUME_THREAD_MEM_TRACKER_BY_HOOK(size) (void)0 +#define RELEASE_THREAD_MEM_TRACKER_BY_HOOK(size) (void)0 +#define CONSUME_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN(size) (void)0 +#define RELEASE_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN(size) (void)0 +#define CONSUME_THREAD_MEM_TRACKER(size) (void)0 +#define RELEASE_THREAD_MEM_TRACKER(size) (void)0 #endif } // namespace doris diff --git a/be/src/runtime/threadlocal.cc b/be/src/runtime/threadlocal.cc deleted file mode 100644 index 69b57607730c75..00000000000000 --- a/be/src/runtime/threadlocal.cc +++ /dev/null @@ -1,84 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -#include "runtime/threadlocal.h" - -#include - -#include -#include -#include -#include - -#include "common/logging.h" -#include "util/errno.h" - -namespace doris { - -// One key used by the entire process to attach destructors on thread exit. -static pthread_key_t destructors_key; - -// The above key must only be initialized once per process. -static std::once_flag once; - -namespace { - -// List of destructors for all thread locals instantiated on a given thread. 
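// CONSUME_THREAD_MEM_TRACKER above routes a size delta to whichever tracker
// is reachable: the pthread-local context, the bthread-local context, or a
// process-wide orphan tracker as a last resort. A sketch of that routing,
// with the three sinks reduced to counters and the bthread calls stubbed:
#include <atomic>

thread_local bool tls_ctx_ready = false;
thread_local long long tls_tracker = 0;
std::atomic<long long> g_orphan_tracker{0};

bool in_bthread() { return false; }              // stand-in for bthread_self() != 0
long long* bthread_tracker() { return nullptr; } // stand-in for bthread_getspecific

void consume(long long size) {
    if (tls_ctx_ready) {
        tls_tracker += size;              // fast path: pthread-local context
    } else if (in_bthread()) {
        *bthread_tracker() += size;       // bthread-local context
    } else {
        g_orphan_tracker.fetch_add(size); // fallback: untracked threads
    }
}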
-struct PerThreadDestructorList { - void (*destructor)(void*); - void* arg; - PerThreadDestructorList* next; -}; - -} // anonymous namespace - -// Call all the destructors associated with all THREAD_LOCAL instances in this -// thread. -static void invoke_destructors(void* t) { - PerThreadDestructorList* d = reinterpret_cast(t); - while (d != nullptr) { - d->destructor(d->arg); - PerThreadDestructorList* next = d->next; - delete d; - d = next; - } -} - -// This key must be initialized only once. -static void create_key() { - int ret = pthread_key_create(&destructors_key, &invoke_destructors); - // Linux supports up to 1024 keys, we will use only one for all thread locals. - CHECK_EQ(0, ret) << "pthread_key_create() failed, cannot add destructor to thread: " - << "error " << ret << ": " << errno_to_string(ret); -} - -// Adds a destructor to the list. -void add_destructor(void (*destructor)(void*), void* arg) { - std::call_once(once, create_key); - - // Returns NULL if nothing is set yet. - std::unique_ptr p(new PerThreadDestructorList()); - p->destructor = destructor; - p->arg = arg; - p->next = reinterpret_cast(pthread_getspecific(destructors_key)); - int ret = pthread_setspecific(destructors_key, p.release()); - // The only time this check should fail is if we are out of memory, or if - // somehow key creation failed, which should be caught by the above CHECK. - CHECK_EQ(0, ret) << "pthread_setspecific() failed, cannot update destructor list: " - << "error " << ret << ": " << errno_to_string(ret); -} - -} // namespace doris diff --git a/be/src/runtime/threadlocal.h b/be/src/runtime/threadlocal.h deleted file mode 100644 index 3842b2fcad6f75..00000000000000 --- a/be/src/runtime/threadlocal.h +++ /dev/null @@ -1,129 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Reference from kudu, Solve the problem of gcc11 compiling -// non-trivial thread_local variables on lower versions of GLIBC. -// see https://github.com/apache/doris/pull/7911 -// -// Block-scoped static thread local implementation. -// -// Usage is similar to a C++11 thread_local. The BLOCK_STATIC_THREAD_LOCAL macro -// defines a thread-local pointer to the specified type, which is lazily -// instantiated by any thread entering the block for the first time. The -// constructor for the type T is invoked at macro execution time, as expected, -// and its destructor is invoked when the corresponding thread's Runnable -// returns, or when the thread exits. -// -// Inspired by Poco , -// Andrew Tomazos , and -// the C++11 thread_local API. 
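// The deleted threadlocal.cc used a single process-wide pthread key whose
// destructor runs at thread exit and tears down every registered value. The
// core of that mechanism, reduced to one value per thread:
#include <pthread.h>

static pthread_key_t g_key;
static pthread_once_t g_once = PTHREAD_ONCE_INIT;

static void destroy_value(void* v) {
    delete static_cast<int*>(v); // invoked automatically at thread exit
}

static void make_key() {
    pthread_key_create(&g_key, &destroy_value); // one key for the whole process
}

int* thread_value() {
    pthread_once(&g_once, make_key);
    auto* v = static_cast<int*>(pthread_getspecific(g_key));
    if (v == nullptr) {
        v = new int(0);
        pthread_setspecific(g_key, v); // destructor will reclaim it on exit
    }
    return v;
}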
-// -// Example usage: -// -// // Invokes a 3-arg constructor on SomeClass: -// BLOCK_STATIC_THREAD_LOCAL(SomeClass, instance, arg1, arg2, arg3); -// instance->DoSomething(); -// - -#pragma once - -#include - -#include "gutil/port.h" - -#define BLOCK_STATIC_THREAD_LOCAL(T, t, ...) \ - static __thread T* t; \ - do { \ - if (PREDICT_FALSE(t == NULL)) { \ - t = new T(__VA_ARGS__); \ - add_destructor(destroy, t); \ - } \ - } while (false) - -// Class-scoped static thread local implementation. -// -// Very similar in implementation to the above block-scoped version, but -// requires a bit more syntax and vigilance to use properly. -// -// DECLARE_STATIC_THREAD_LOCAL(Type, instance_var_) must be placed in the -// class header, as usual for variable declarations. -// -// Because these variables are static, they must also be defined in the impl -// file with DEFINE_STATIC_THREAD_LOCAL(Type, Classname, instance_var_), -// which is very much like defining any static member, i.e. int Foo::member_. -// -// Finally, each thread must initialize the instance before using it by calling -// INIT_STATIC_THREAD_LOCAL(Type, instance_var_, ...). This is a cheap -// call, and may be invoked at the top of any method which may reference a -// thread-local variable. -// -// Due to all of these requirements, you should probably declare TLS members -// as private. -// -// Example usage: -// -// // foo.h -// #include "kudu/utils/file.h" -// class Foo { -// public: -// void DoSomething(std::string s); -// private: -// DECLARE_STATIC_THREAD_LOCAL(utils::File, file_); -// }; -// -// // foo.cc -// #include "kudu/foo.h" -// DEFINE_STATIC_THREAD_LOCAL(utils::File, Foo, file_); -// void Foo::WriteToFile(std::string s) { -// // Call constructor if necessary. -// INIT_STATIC_THREAD_LOCAL(utils::File, file_, "/tmp/file_location.txt"); -// file_->Write(s); -// } - -// Goes in the class declaration (usually in a header file). -// dtor must be destructed _after_ t, so it gets defined first. -// Uses a mangled variable name for dtor since it must also be a member of the -// class. -#define DECLARE_STATIC_THREAD_LOCAL(T, t) static __thread T* t - -// You must also define the instance in the .cc file. -#define DEFINE_STATIC_THREAD_LOCAL(T, Class, t) __thread T* Class::t - -// Must be invoked at least once by each thread that will access t. -#define INIT_STATIC_THREAD_LOCAL(T, t, ...) \ - do { \ - if (PREDICT_FALSE(t == NULL)) { \ - t = new T(__VA_ARGS__); \ - add_destructor(destroy, t); \ - } \ - } while (false) - -// Internal implementation below. - -namespace doris { - -// Add a destructor to the list. -void add_destructor(void (*destructor)(void*), void* arg); - -// Destroy the passed object of type T. -template -void destroy(void* t) { - // With tcmalloc, this should be pretty cheap (same thread as new). 
- delete reinterpret_cast(t); -} - -} // namespace doris diff --git a/be/src/runtime/type_limit.h b/be/src/runtime/type_limit.h index d406689644b8b2..374f4d9099f135 100644 --- a/be/src/runtime/type_limit.h +++ b/be/src/runtime/type_limit.h @@ -70,8 +70,8 @@ struct type_limit { } static vectorized::Decimal128 min() { return -max(); } }; -static Int256 MAX_DECIMAL256_INT({18446744073709551615ul, 8607968719199866879ul, - 532749306367912313ul, 1593091911132452277ul}); +static wide::Int256 MAX_DECIMAL256_INT({18446744073709551615ul, 8607968719199866879ul, + 532749306367912313ul, 1593091911132452277ul}); template <> struct type_limit { static vectorized::Decimal256 max() { return vectorized::Decimal256(MAX_DECIMAL256_INT); } diff --git a/be/src/runtime/user_function_cache.cpp b/be/src/runtime/user_function_cache.cpp index 9f07ba590299bf..c2b25971587dd1 100644 --- a/be/src/runtime/user_function_cache.cpp +++ b/be/src/runtime/user_function_cache.cpp @@ -200,14 +200,6 @@ Status UserFunctionCache::_load_cached_lib() { return Status::OK(); } -std::string get_real_symbol(const std::string& symbol) { - static std::regex rx1("8palo_udf"); - std::string str1 = std::regex_replace(symbol, rx1, "9doris_udf"); - static std::regex rx2("4palo"); - std::string str2 = std::regex_replace(str1, rx2, "5doris"); - return str2; -} - Status UserFunctionCache::_get_cache_entry(int64_t fid, const std::string& url, const std::string& checksum, std::shared_ptr& output_entry, diff --git a/be/src/service/CMakeLists.txt b/be/src/service/CMakeLists.txt index f0831f7317db50..c5832d8433e260 100644 --- a/be/src/service/CMakeLists.txt +++ b/be/src/service/CMakeLists.txt @@ -45,14 +45,31 @@ if (${MAKE_TEST} STREQUAL "OFF") install(DIRECTORY DESTINATION ${OUTPUT_DIR}/lib/) install(TARGETS doris_be DESTINATION ${OUTPUT_DIR}/lib/) - if ("${STRIP_DEBUG_INFO}" STREQUAL "ON") - add_custom_command(TARGET doris_be POST_BUILD - COMMAND ${CMAKE_OBJCOPY} --only-keep-debug $ $.dbg - COMMAND ${CMAKE_STRIP} --strip-debug --strip-unneeded $ - COMMAND ${CMAKE_OBJCOPY} --add-gnu-debuglink=$.dbg $ + if (OS_MACOSX AND ARCH_ARM AND "${CMAKE_BUILD_TYPE}" STREQUAL "ASAN") + set(SHOULD_STRIP_DEBUG_INFO "ON") + endif() + + if ("${STRIP_DEBUG_INFO}" STREQUAL "ON" OR "${SHOULD_STRIP_DEBUG_INFO}" STREQUAL "ON") + if (OS_MACOSX) + find_program(DSYMUTIL NAMES dsymutil) + message(STATUS "dsymutil found: ${DSYMUTIL}") + find_program(LLVM_STRIP NAMES llvm-strip) + message(STATUS "llvm-strip found: ${LLVM_STRIP}") + add_custom_command(TARGET doris_be POST_BUILD + COMMAND ${DSYMUTIL} $ + COMMAND ${LLVM_STRIP} --strip-all $ + COMMAND mkdir -p ${OUTPUT_DIR}/lib + COMMAND cp -rf doris_be.dSYM ${OUTPUT_DIR}/lib/ ) + else() + add_custom_command(TARGET doris_be POST_BUILD + COMMAND ${CMAKE_OBJCOPY} --only-keep-debug $ $.dbg + COMMAND ${CMAKE_STRIP} --strip-debug --strip-unneeded $ + COMMAND ${CMAKE_OBJCOPY} --add-gnu-debuglink=$.dbg $ + ) - install(DIRECTORY DESTINATION ${OUTPUT_DIR}/lib/debug_info/) - install(FILES $.dbg DESTINATION ${OUTPUT_DIR}/lib/debug_info/) + install(DIRECTORY DESTINATION ${OUTPUT_DIR}/lib/debug_info/) + install(FILES $.dbg DESTINATION ${OUTPUT_DIR}/lib/debug_info/) + endif() endif() endif() diff --git a/be/src/service/arrow_flight/arrow_flight_batch_reader.cpp b/be/src/service/arrow_flight/arrow_flight_batch_reader.cpp index 8a0f1e67859494..d2b45a0b77c6ef 100644 --- a/be/src/service/arrow_flight/arrow_flight_batch_reader.cpp +++ b/be/src/service/arrow_flight/arrow_flight_batch_reader.cpp @@ -39,18 +39,12 @@ 
ArrowFlightBatchReader::ArrowFlightBatchReader(std::shared_ptr s arrow::Result> ArrowFlightBatchReader::Create( const std::shared_ptr& statement_) { // Make sure that FE send the fragment to BE and creates the BufferControlBlock before returning ticket - // to the ADBC client, so that the row_descriptor and control block can be found. - RowDescriptor row_desc = - ExecEnv::GetInstance()->result_mgr()->find_row_descriptor(statement_->query_id); - if (row_desc.equals(RowDescriptor())) { + // to the ADBC client, so that the schema and control block can be found. + auto schema = ExecEnv::GetInstance()->result_mgr()->find_arrow_schema(statement_->query_id); + if (schema == nullptr) { ARROW_RETURN_NOT_OK(arrow::Status::Invalid(fmt::format( - "Schema RowDescriptor Not Found, queryid: {}", print_id(statement_->query_id)))); - } - std::shared_ptr schema; - auto st = convert_to_arrow_schema(row_desc, &schema); - if (UNLIKELY(!st.ok())) { - LOG(WARNING) << st.to_string(); - ARROW_RETURN_NOT_OK(to_arrow_status(st)); + "not found arrow flight schema, maybe query has been canceled, queryid: {}", + print_id(statement_->query_id)))); } std::shared_ptr result(new ArrowFlightBatchReader(statement_, schema)); return result; diff --git a/be/src/service/backend_service.cpp b/be/src/service/backend_service.cpp index a312996162ce2c..af4959929e9f01 100644 --- a/be/src/service/backend_service.cpp +++ b/be/src/service/backend_service.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -48,6 +49,7 @@ #include "olap/olap_common.h" #include "olap/olap_define.h" #include "olap/rowset/beta_rowset.h" +#include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset_factory.h" #include "olap/rowset/rowset_meta.h" #include "olap/storage_engine.h" @@ -63,6 +65,7 @@ #include "runtime/stream_load/stream_load_recorder.h" #include "util/arrow/row_batch.h" #include "util/defer_op.h" +#include "util/threadpool.h" #include "util/thrift_server.h" #include "util/uid_util.h" @@ -79,6 +82,284 @@ class TTransportException; namespace doris { +namespace { +constexpr uint64_t kMaxTimeoutMs = 3000; // 3s +struct IngestBinlogArg { + int64_t txn_id; + int64_t partition_id; + int64_t local_tablet_id; + TabletSharedPtr local_tablet; + TIngestBinlogRequest request; + TStatus* tstatus; +}; + +void _ingest_binlog(IngestBinlogArg* arg) { + auto txn_id = arg->txn_id; + auto partition_id = arg->partition_id; + auto local_tablet_id = arg->local_tablet_id; + const auto& local_tablet = arg->local_tablet; + const auto& local_tablet_uid = local_tablet->tablet_uid(); + + auto& request = arg->request; + + TStatus tstatus; + Defer defer {[=, &tstatus, ingest_binlog_tstatus = arg->tstatus]() { + LOG(INFO) << "ingest binlog. 
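// _ingest_binlog above registers a Defer at entry whose body inspects the
// final tstatus and aborts the txn on any non-OK outcome, so every early
// return shares one rollback path. The shape, distilled; Defer stands in for
// the helper from util/defer_op.h:
#include <functional>
#include <iostream>
#include <utility>

class Defer {
public:
    explicit Defer(std::function<void()> fn) : _fn(std::move(fn)) {}
    ~Defer() { _fn(); }
private:
    std::function<void()> _fn;
};

bool ingest_like(bool step1_ok, bool step2_ok) {
    bool ok = false;
    Defer cleanup([&ok] {
        if (!ok) {
            std::cout << "abort txn\n"; // single rollback path for all early returns
        }
    });
    if (!step1_ok) return false; // cleanup aborts
    if (!step2_ok) return false; // cleanup aborts
    ok = true;
    return true; // cleanup sees ok == true and does nothing
}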
result: " << apache::thrift::ThriftDebugString(tstatus); + if (tstatus.status_code != TStatusCode::OK) { + // abort txn + StorageEngine::instance()->txn_manager()->abort_txn(partition_id, txn_id, + local_tablet_id, local_tablet_uid); + } + + if (ingest_binlog_tstatus) { + *ingest_binlog_tstatus = std::move(tstatus); + } + }}; + + auto set_tstatus = [&tstatus](TStatusCode::type code, std::string error_msg) { + tstatus.__set_status_code(code); + tstatus.__isset.error_msgs = true; + tstatus.error_msgs.push_back(std::move(error_msg)); + }; + + // Step 3: get binlog info + auto binlog_api_url = fmt::format("http://{}:{}/api/_binlog/_download", request.remote_host, + request.remote_port); + constexpr int max_retry = 3; + + auto get_binlog_info_url = + fmt::format("{}?method={}&tablet_id={}&binlog_version={}", binlog_api_url, + "get_binlog_info", request.remote_tablet_id, request.binlog_version); + std::string binlog_info; + auto get_binlog_info_cb = [&get_binlog_info_url, &binlog_info](HttpClient* client) { + RETURN_IF_ERROR(client->init(get_binlog_info_url)); + client->set_timeout_ms(kMaxTimeoutMs); + return client->execute(&binlog_info); + }; + auto status = HttpClient::execute_with_retry(max_retry, 1, get_binlog_info_cb); + if (!status.ok()) { + LOG(WARNING) << "failed to get binlog info from " << get_binlog_info_url + << ", status=" << status.to_string(); + status.to_thrift(&tstatus); + return; + } + + std::vector binlog_info_parts = strings::Split(binlog_info, ":"); + // TODO(Drogon): check binlog info content is right + DCHECK(binlog_info_parts.size() == 2); + const std::string& remote_rowset_id = binlog_info_parts[0]; + int64_t num_segments = std::stoll(binlog_info_parts[1]); + + // Step 4: get rowset meta + auto get_rowset_meta_url = fmt::format( + "{}?method={}&tablet_id={}&rowset_id={}&binlog_version={}", binlog_api_url, + "get_rowset_meta", request.remote_tablet_id, remote_rowset_id, request.binlog_version); + std::string rowset_meta_str; + auto get_rowset_meta_cb = [&get_rowset_meta_url, &rowset_meta_str](HttpClient* client) { + RETURN_IF_ERROR(client->init(get_rowset_meta_url)); + client->set_timeout_ms(kMaxTimeoutMs); + return client->execute(&rowset_meta_str); + }; + status = HttpClient::execute_with_retry(max_retry, 1, get_rowset_meta_cb); + if (!status.ok()) { + LOG(WARNING) << "failed to get rowset meta from " << get_rowset_meta_url + << ", status=" << status.to_string(); + status.to_thrift(&tstatus); + return; + } + RowsetMetaPB rowset_meta_pb; + if (!rowset_meta_pb.ParseFromString(rowset_meta_str)) { + LOG(WARNING) << "failed to parse rowset meta from " << get_rowset_meta_url; + status = Status::InternalError("failed to parse rowset meta"); + status.to_thrift(&tstatus); + return; + } + // rewrite rowset meta + rowset_meta_pb.set_tablet_id(local_tablet_id); + rowset_meta_pb.set_partition_id(partition_id); + rowset_meta_pb.set_tablet_schema_hash(local_tablet->tablet_meta()->schema_hash()); + rowset_meta_pb.set_txn_id(txn_id); + rowset_meta_pb.set_rowset_state(RowsetStatePB::COMMITTED); + auto rowset_meta = std::make_shared(); + if (!rowset_meta->init_from_pb(rowset_meta_pb)) { + LOG(WARNING) << "failed to init rowset meta from " << get_rowset_meta_url; + status = Status::InternalError("failed to init rowset meta"); + status.to_thrift(&tstatus); + return; + } + RowsetId new_rowset_id = StorageEngine::instance()->next_rowset_id(); + auto pending_rs_guard = StorageEngine::instance()->pending_local_rowsets().add(new_rowset_id); + rowset_meta->set_rowset_id(new_rowset_id); + 
rowset_meta->set_tablet_uid(local_tablet->tablet_uid()); + + // Step 5: get all segment files + // Step 5.1: get all segment files size + std::vector<std::string> segment_file_urls; + segment_file_urls.reserve(num_segments); + std::vector<uint64_t> segment_file_sizes; + segment_file_sizes.reserve(num_segments); + for (int64_t segment_index = 0; segment_index < num_segments; ++segment_index) { + auto get_segment_file_size_url = fmt::format( + "{}?method={}&tablet_id={}&rowset_id={}&segment_index={}", binlog_api_url, + "get_segment_file", request.remote_tablet_id, remote_rowset_id, segment_index); + uint64_t segment_file_size; + auto get_segment_file_size_cb = [&get_segment_file_size_url, + &segment_file_size](HttpClient* client) { + RETURN_IF_ERROR(client->init(get_segment_file_size_url)); + client->set_timeout_ms(kMaxTimeoutMs); + RETURN_IF_ERROR(client->head()); + return client->get_content_length(&segment_file_size); + }; + + status = HttpClient::execute_with_retry(max_retry, 1, get_segment_file_size_cb); + if (!status.ok()) { + LOG(WARNING) << "failed to get segment file size from " << get_segment_file_size_url + << ", status=" << status.to_string(); + status.to_thrift(&tstatus); + return; + } + + segment_file_sizes.push_back(segment_file_size); + segment_file_urls.push_back(std::move(get_segment_file_size_url)); + } + + // Step 5.2: check data capacity + uint64_t total_size = std::accumulate(segment_file_sizes.begin(), segment_file_sizes.end(), 0); + if (!local_tablet->can_add_binlog(total_size)) { + LOG(WARNING) << "failed to add binlog, no enough space, total_size=" << total_size + << ", tablet=" << local_tablet->tablet_id(); + status = Status::InternalError("no enough space"); + status.to_thrift(&tstatus); + return; + } + + // Step 5.3: get all segment files + for (int64_t segment_index = 0; segment_index < num_segments; ++segment_index) { + auto segment_file_size = segment_file_sizes[segment_index]; + auto get_segment_file_url = segment_file_urls[segment_index]; + + uint64_t estimate_timeout = + segment_file_size / config::download_low_speed_limit_kbps / 1024; + if (estimate_timeout < config::download_low_speed_time) { + estimate_timeout = config::download_low_speed_time; + } + + auto local_segment_path = BetaRowset::segment_file_path( + local_tablet->tablet_path(), rowset_meta->rowset_id(), segment_index); + LOG(INFO) << fmt::format("download segment file from {} to {}", get_segment_file_url, + local_segment_path); + auto get_segment_file_cb = [&get_segment_file_url, &local_segment_path, segment_file_size, + estimate_timeout](HttpClient* client) { + RETURN_IF_ERROR(client->init(get_segment_file_url)); + client->set_timeout_ms(estimate_timeout * 1000); + RETURN_IF_ERROR(client->download(local_segment_path)); + + std::error_code ec; + // Check file length + uint64_t local_file_size = std::filesystem::file_size(local_segment_path, ec); + if (ec) { + LOG(WARNING) << "download file error" << ec.message(); + return Status::IOError("can't retrive file_size of {}, due to {}", + local_segment_path, ec.message()); + } + if (local_file_size != segment_file_size) { + LOG(WARNING) << "download file length error" + << ", get_segment_file_url=" << get_segment_file_url + << ", file_size=" << segment_file_size + << ", local_file_size=" << local_file_size; + return Status::InternalError("downloaded file size is not equal"); + } + chmod(local_segment_path.c_str(), S_IRUSR | S_IWUSR); + return Status::OK(); + }; + + auto status = HttpClient::execute_with_retry(max_retry, 1, get_segment_file_cb); + if (!status.ok()) { + 
LOG(WARNING) << "failed to get segment file from " << get_segment_file_url + << ", status=" << status.to_string(); + status.to_thrift(&tstatus); + return; + } + } + + // Step 6: create rowset && calculate delete bitmap && commit + // Step 6.1: create rowset + RowsetSharedPtr rowset; + status = RowsetFactory::create_rowset(local_tablet->tablet_schema(), + local_tablet->tablet_path(), rowset_meta, &rowset); + + if (!status) { + LOG(WARNING) << "failed to create rowset from rowset meta for remote tablet" + << ". rowset_id: " << rowset_meta_pb.rowset_id() + << ", rowset_type: " << rowset_meta_pb.rowset_type() + << ", remote_tablet_id=" << rowset_meta_pb.tablet_id() << ", txn_id=" << txn_id + << ", status=" << status.to_string(); + status.to_thrift(&tstatus); + return; + } + + // Step 6.2 calculate delete bitmap before commit + auto calc_delete_bitmap_token = + StorageEngine::instance()->calc_delete_bitmap_executor()->create_token(); + DeleteBitmapPtr delete_bitmap = std::make_shared(local_tablet_id); + RowsetIdUnorderedSet pre_rowset_ids; + if (local_tablet->enable_unique_key_merge_on_write()) { + auto beta_rowset = reinterpret_cast(rowset.get()); + std::vector segments; + status = beta_rowset->load_segments(&segments); + if (!status) { + LOG(WARNING) << "failed to load segments from rowset" + << ". rowset_id: " << beta_rowset->rowset_id() << ", txn_id=" << txn_id + << ", status=" << status.to_string(); + status.to_thrift(&tstatus); + return; + } + if (segments.size() > 1) { + // calculate delete bitmap between segments + status = local_tablet->calc_delete_bitmap_between_segments(rowset, segments, + delete_bitmap); + if (!status) { + LOG(WARNING) << "failed to calculate delete bitmap" + << ". tablet_id: " << local_tablet->tablet_id() + << ". rowset_id: " << rowset->rowset_id() << ", txn_id=" << txn_id + << ", status=" << status.to_string(); + status.to_thrift(&tstatus); + return; + } + } + + static_cast(local_tablet->commit_phase_update_delete_bitmap( + rowset, pre_rowset_ids, delete_bitmap, segments, txn_id, + calc_delete_bitmap_token.get(), nullptr)); + static_cast(calc_delete_bitmap_token->wait()); + } + + // Step 6.3: commit txn + Status commit_txn_status = StorageEngine::instance()->txn_manager()->commit_txn( + local_tablet->data_dir()->get_meta(), rowset_meta->partition_id(), + rowset_meta->txn_id(), rowset_meta->tablet_id(), local_tablet->tablet_uid(), + rowset_meta->load_id(), rowset, std::move(pending_rs_guard), false); + if (!commit_txn_status && !commit_txn_status.is()) { + auto err_msg = fmt::format( + "failed to commit txn for remote tablet. 
rowset_id: {}, remote_tablet_id={}, " + "txn_id={}, status={}", + rowset_meta->rowset_id().to_string(), rowset_meta->tablet_id(), + rowset_meta->txn_id(), commit_txn_status.to_string()); + LOG(WARNING) << err_msg; + set_tstatus(TStatusCode::RUNTIME_ERROR, std::move(err_msg)); + return; + } + + if (local_tablet->enable_unique_key_merge_on_write()) { + StorageEngine::instance()->txn_manager()->set_txn_related_delete_bitmap( + partition_id, txn_id, local_tablet_id, local_tablet->tablet_uid(), true, + delete_bitmap, pre_rowset_ids, nullptr); + } + + tstatus.__set_status_code(TStatusCode::OK); +} +} // namespace + using apache::thrift::TException; using apache::thrift::TProcessor; using apache::thrift::TMultiplexedProcessor; @@ -90,19 +371,33 @@ BackendService::BackendService(ExecEnv* exec_env) Status BackendService::create_service(ExecEnv* exec_env, int port, std::unique_ptr<ThriftServer>* server) { - std::shared_ptr<BackendService> handler(new BackendService(exec_env)); + auto service = std::make_shared<BackendService>(exec_env); // TODO: do we want a BoostThreadFactory? // TODO: we want separate thread factories here, so that fe requests can't starve // be requests - std::shared_ptr<ThreadFactory> thread_factory(new ThreadFactory()); - - std::shared_ptr<BackendServiceProcessor> be_processor(new BackendServiceProcessor(handler)); + // std::shared_ptr<BackendServiceProcessor> be_processor = std::make_shared<BackendServiceProcessor>(service); + auto be_processor = std::make_shared<BackendServiceProcessor>(service); *server = std::make_unique<ThriftServer>("backend", be_processor, port, config::be_service_threads); LOG(INFO) << "Doris BackendService listening on " << port; + auto thread_num = config::ingest_binlog_work_pool_size; + if (thread_num < 0) { + LOG(INFO) << fmt::format("ingest binlog thread pool size is {}, so we will in sync mode", + thread_num); + return Status::OK(); + } + + if (thread_num == 0) { + thread_num = std::thread::hardware_concurrency(); + } + static_cast<void>(doris::ThreadPoolBuilder("IngestBinlog") + .set_min_threads(thread_num) + .set_max_threads(thread_num * 2) + .build(&(service->_ingest_binlog_workers))); + LOG(INFO) << fmt::format("ingest binlog thread pool size is {}, in async mode", thread_num); return Status::OK(); } @@ -396,8 +691,6 @@ void BackendService::ingest_binlog(TIngestBinlogResult& result, const TIngestBinlogRequest& request) { LOG(INFO) << "ingest binlog. 
request: " << apache::thrift::ThriftDebugString(request); - constexpr uint64_t kMaxTimeoutMs = 1000; - TStatus tstatus; Defer defer {[&result, &tstatus]() { result.__set_status(tstatus); @@ -452,6 +745,12 @@ void BackendService::ingest_binlog(TIngestBinlogResult& result, set_tstatus(TStatusCode::ANALYSIS_ERROR, error_msg); return; } + if (!request.__isset.local_tablet_id) { + auto error_msg = "local_tablet_id is empty"; + LOG(WARNING) << error_msg; + set_tstatus(TStatusCode::ANALYSIS_ERROR, error_msg); + return; + } if (!request.__isset.load_id) { auto error_msg = "load_id is empty"; LOG(WARNING) << error_msg; @@ -486,239 +785,105 @@ void BackendService::ingest_binlog(TIngestBinlogResult& result, return; } - // Step 3: get binlog info - auto binlog_api_url = fmt::format("http://{}:{}/api/_binlog/_download", request.remote_host, - request.remote_port); - constexpr int max_retry = 3; + bool is_async = (_ingest_binlog_workers != nullptr); + result.__set_is_async(is_async); - auto get_binlog_info_url = - fmt::format("{}?method={}&tablet_id={}&binlog_version={}", binlog_api_url, - "get_binlog_info", request.remote_tablet_id, request.binlog_version); - std::string binlog_info; - auto get_binlog_info_cb = [&get_binlog_info_url, &binlog_info](HttpClient* client) { - RETURN_IF_ERROR(client->init(get_binlog_info_url)); - client->set_timeout_ms(kMaxTimeoutMs); - return client->execute(&binlog_info); - }; - status = HttpClient::execute_with_retry(max_retry, 1, get_binlog_info_cb); - if (!status.ok()) { - LOG(WARNING) << "failed to get binlog info from " << get_binlog_info_url - << ", status=" << status.to_string(); - status.to_thrift(&tstatus); - return; - } + auto ingest_binlog_func = [=, tstatus = &tstatus]() { + IngestBinlogArg ingest_binlog_arg = { + .txn_id = txn_id, + .partition_id = partition_id, + .local_tablet_id = local_tablet_id, + .local_tablet = local_tablet, - std::vector binlog_info_parts = strings::Split(binlog_info, ":"); - // TODO(Drogon): check binlog info content is right - DCHECK(binlog_info_parts.size() == 2); - const std::string& remote_rowset_id = binlog_info_parts[0]; - int64_t num_segments = std::stoll(binlog_info_parts[1]); + .request = std::move(request), + .tstatus = is_async ? 
nullptr : tstatus, + }; - // Step 4: get rowset meta - auto get_rowset_meta_url = fmt::format( - "{}?method={}&tablet_id={}&rowset_id={}&binlog_version={}", binlog_api_url, - "get_rowset_meta", request.remote_tablet_id, remote_rowset_id, request.binlog_version); - std::string rowset_meta_str; - auto get_rowset_meta_cb = [&get_rowset_meta_url, &rowset_meta_str](HttpClient* client) { - RETURN_IF_ERROR(client->init(get_rowset_meta_url)); - client->set_timeout_ms(kMaxTimeoutMs); - return client->execute(&rowset_meta_str); + _ingest_binlog(&ingest_binlog_arg); }; - status = HttpClient::execute_with_retry(max_retry, 1, get_rowset_meta_cb); - if (!status.ok()) { - LOG(WARNING) << "failed to get rowset meta from " << get_rowset_meta_url - << ", status=" << status.to_string(); - status.to_thrift(&tstatus); - return; - } - RowsetMetaPB rowset_meta_pb; - if (!rowset_meta_pb.ParseFromString(rowset_meta_str)) { - LOG(WARNING) << "failed to parse rowset meta from " << get_rowset_meta_url; - status = Status::InternalError("failed to parse rowset meta"); - status.to_thrift(&tstatus); - return; - } - // rewrite rowset meta - rowset_meta_pb.set_tablet_id(local_tablet_id); - rowset_meta_pb.set_partition_id(partition_id); - rowset_meta_pb.set_tablet_schema_hash(local_tablet->tablet_meta()->schema_hash()); - rowset_meta_pb.set_txn_id(txn_id); - rowset_meta_pb.set_rowset_state(RowsetStatePB::COMMITTED); - auto rowset_meta = std::make_shared(); - if (!rowset_meta->init_from_pb(rowset_meta_pb)) { - LOG(WARNING) << "failed to init rowset meta from " << get_rowset_meta_url; - status = Status::InternalError("failed to init rowset meta"); - status.to_thrift(&tstatus); - return; - } - RowsetId new_rowset_id = StorageEngine::instance()->next_rowset_id(); - rowset_meta->set_rowset_id(new_rowset_id); - rowset_meta->set_tablet_uid(local_tablet->tablet_uid()); - // Step 5: get all segment files - // Step 5.1: get all segment files size - std::vector segment_file_urls; - segment_file_urls.reserve(num_segments); - std::vector segment_file_sizes; - segment_file_sizes.reserve(num_segments); - for (int64_t segment_index = 0; segment_index < num_segments; ++segment_index) { - auto get_segment_file_size_url = fmt::format( - "{}?method={}&tablet_id={}&rowset_id={}&segment_index={}", binlog_api_url, - "get_segment_file", request.remote_tablet_id, remote_rowset_id, segment_index); - uint64_t segment_file_size; - auto get_segment_file_size_cb = [&get_segment_file_size_url, - &segment_file_size](HttpClient* client) { - RETURN_IF_ERROR(client->init(get_segment_file_size_url)); - client->set_timeout_ms(kMaxTimeoutMs); - RETURN_IF_ERROR(client->head()); - return client->get_content_length(&segment_file_size); - }; - - status = HttpClient::execute_with_retry(max_retry, 1, get_segment_file_size_cb); + if (is_async) { + status = _ingest_binlog_workers->submit_func(std::move(ingest_binlog_func)); if (!status.ok()) { - LOG(WARNING) << "failed to get segment file size from " << get_segment_file_size_url - << ", status=" << status.to_string(); status.to_thrift(&tstatus); return; } - - segment_file_sizes.push_back(segment_file_size); - segment_file_urls.push_back(std::move(get_segment_file_size_url)); - } - - // Step 5.2: check data capacity - uint64_t total_size = std::accumulate(segment_file_sizes.begin(), segment_file_sizes.end(), 0); - if (!local_tablet->can_add_binlog(total_size)) { - LOG(WARNING) << "failed to add binlog, no enough space, total_size=" << total_size - << ", tablet=" << local_tablet->tablet_id(); - status = 
Status::InternalError("no enough space"); - status.to_thrift(&tstatus); - return; + } else { + ingest_binlog_func(); } +} - // Step 5.3: get all segment files - for (int64_t segment_index = 0; segment_index < num_segments; ++segment_index) { - auto segment_file_size = segment_file_sizes[segment_index]; - auto get_segment_file_url = segment_file_urls[segment_index]; - - uint64_t estimate_timeout = - segment_file_size / config::download_low_speed_limit_kbps / 1024; - if (estimate_timeout < config::download_low_speed_time) { - estimate_timeout = config::download_low_speed_time; - } +void BackendService::query_ingest_binlog(TQueryIngestBinlogResult& result, + const TQueryIngestBinlogRequest& request) { + LOG(INFO) << "query ingest binlog. request: " << apache::thrift::ThriftDebugString(request); - auto local_segment_path = BetaRowset::segment_file_path( - local_tablet->tablet_path(), rowset_meta->rowset_id(), segment_index); - LOG(INFO) << fmt::format("download segment file from {} to {}", get_segment_file_url, - local_segment_path); - auto get_segment_file_cb = [&get_segment_file_url, &local_segment_path, segment_file_size, - estimate_timeout](HttpClient* client) { - RETURN_IF_ERROR(client->init(get_segment_file_url)); - client->set_timeout_ms(estimate_timeout * 1000); - RETURN_IF_ERROR(client->download(local_segment_path)); - - std::error_code ec; - // Check file length - uint64_t local_file_size = std::filesystem::file_size(local_segment_path, ec); - if (ec) { - LOG(WARNING) << "download file error" << ec.message(); - return Status::IOError("can't retrive file_size of {}, due to {}", - local_segment_path, ec.message()); - } - if (local_file_size != segment_file_size) { - LOG(WARNING) << "download file length error" - << ", get_segment_file_url=" << get_segment_file_url - << ", file_size=" << segment_file_size - << ", local_file_size=" << local_file_size; - return Status::InternalError("downloaded file size is not equal"); - } - chmod(local_segment_path.c_str(), S_IRUSR | S_IWUSR); - return Status::OK(); - }; + auto set_result = [&](TIngestBinlogStatus::type status, std::string error_msg) { + result.__set_status(status); + result.__set_err_msg(std::move(error_msg)); + }; - auto status = HttpClient::execute_with_retry(max_retry, 1, get_segment_file_cb); - if (!status.ok()) { - LOG(WARNING) << "failed to get segment file from " << get_segment_file_url - << ", status=" << status.to_string(); - status.to_thrift(&tstatus); - return; - } + /// Check args: txn_id, partition_id, tablet_id, load_id + if (!request.__isset.txn_id) { + auto error_msg = "txn_id is empty"; + LOG(WARNING) << error_msg; + set_result(TIngestBinlogStatus::ANALYSIS_ERROR, error_msg); + return; } - - // Step 6: create rowset && calculate delete bitmap && commit - // Step 6.1: create rowset - RowsetSharedPtr rowset; - status = RowsetFactory::create_rowset(local_tablet->tablet_schema(), - local_tablet->tablet_path(), rowset_meta, &rowset); - - if (!status) { - LOG(WARNING) << "failed to create rowset from rowset meta for remote tablet" - << ". 
rowset_id: " << rowset_meta_pb.rowset_id() - << ", rowset_type: " << rowset_meta_pb.rowset_type() - << ", remote_tablet_id=" << rowset_meta_pb.tablet_id() << ", txn_id=" << txn_id - << ", status=" << status.to_string(); - status.to_thrift(&tstatus); + if (!request.__isset.partition_id) { + auto error_msg = "partition_id is empty"; + LOG(WARNING) << error_msg; + set_result(TIngestBinlogStatus::ANALYSIS_ERROR, error_msg); return; } - - // Step 6.2 calculate delete bitmap before commit - auto calc_delete_bitmap_token = - StorageEngine::instance()->calc_delete_bitmap_executor()->create_token(); - DeleteBitmapPtr delete_bitmap = std::make_shared(local_tablet_id); - RowsetIdUnorderedSet pre_rowset_ids; - if (local_tablet->enable_unique_key_merge_on_write()) { - auto beta_rowset = reinterpret_cast(rowset.get()); - std::vector segments; - status = beta_rowset->load_segments(&segments); - if (!status) { - LOG(WARNING) << "failed to load segments from rowset" - << ". rowset_id: " << beta_rowset->rowset_id() << ", txn_id=" << txn_id - << ", status=" << status.to_string(); - status.to_thrift(&tstatus); - return; - } - if (segments.size() > 1) { - // calculate delete bitmap between segments - status = local_tablet->calc_delete_bitmap_between_segments(rowset, segments, - delete_bitmap); - if (!status) { - LOG(WARNING) << "failed to calculate delete bitmap" - << ". tablet_id: " << local_tablet->tablet_id() - << ". rowset_id: " << rowset->rowset_id() << ", txn_id=" << txn_id - << ", status=" << status.to_string(); - status.to_thrift(&tstatus); - return; - } - } - - static_cast(local_tablet->commit_phase_update_delete_bitmap( - rowset, pre_rowset_ids, delete_bitmap, segments, txn_id, - calc_delete_bitmap_token.get(), nullptr)); - static_cast(calc_delete_bitmap_token->wait()); + if (!request.__isset.tablet_id) { + auto error_msg = "tablet_id is empty"; + LOG(WARNING) << error_msg; + set_result(TIngestBinlogStatus::ANALYSIS_ERROR, error_msg); + return; } - - // Step 6.3: commit txn - Status commit_txn_status = StorageEngine::instance()->txn_manager()->commit_txn( - local_tablet->data_dir()->get_meta(), rowset_meta->partition_id(), - rowset_meta->txn_id(), rowset_meta->tablet_id(), local_tablet->tablet_uid(), - rowset_meta->load_id(), rowset, false); - if (!commit_txn_status && !commit_txn_status.is()) { - auto err_msg = fmt::format( - "failed to commit txn for remote tablet. 
rowset_id: {}, remote_tablet_id={}, " - "txn_id={}, status={}", - rowset_meta->rowset_id().to_string(), rowset_meta->tablet_id(), - rowset_meta->txn_id(), commit_txn_status.to_string()); - LOG(WARNING) << err_msg; - set_tstatus(TStatusCode::RUNTIME_ERROR, std::move(err_msg)); + if (!request.__isset.load_id) { + auto error_msg = "load_id is empty"; + LOG(WARNING) << error_msg; + set_result(TIngestBinlogStatus::ANALYSIS_ERROR, error_msg); return; } - if (local_tablet->enable_unique_key_merge_on_write()) { - StorageEngine::instance()->txn_manager()->set_txn_related_delete_bitmap( - partition_id, txn_id, local_tablet_id, local_tablet->tablet_uid(), true, - delete_bitmap, pre_rowset_ids, nullptr); + auto partition_id = request.partition_id; + auto txn_id = request.txn_id; + auto tablet_id = request.tablet_id; + + // Step 1: get local tablet + auto local_tablet = StorageEngine::instance()->tablet_manager()->get_tablet(tablet_id); + if (local_tablet == nullptr) { + auto error_msg = fmt::format("tablet {} not found", tablet_id); + LOG(WARNING) << error_msg; + set_result(TIngestBinlogStatus::NOT_FOUND, std::move(error_msg)); + return; } - tstatus.__set_status_code(TStatusCode::OK); + // Step 2: get txn state + auto tablet_uid = local_tablet->tablet_uid(); + auto txn_state = StorageEngine::instance()->txn_manager()->get_txn_state(partition_id, txn_id, + tablet_id, tablet_uid); + switch (txn_state) { + case TxnState::NOT_FOUND: + result.__set_status(TIngestBinlogStatus::NOT_FOUND); + break; + case TxnState::PREPARED: + result.__set_status(TIngestBinlogStatus::DOING); + break; + case TxnState::COMMITTED: + result.__set_status(TIngestBinlogStatus::OK); + break; + case TxnState::ROLLEDBACK: + result.__set_status(TIngestBinlogStatus::FAILED); + break; + case TxnState::ABORTED: + result.__set_status(TIngestBinlogStatus::FAILED); + break; + case TxnState::DELETED: + result.__set_status(TIngestBinlogStatus::FAILED); + break; + } } } // namespace doris diff --git a/be/src/service/backend_service.h b/be/src/service/backend_service.h index 8ad55e43e6e74b..09a79a68bf073a 100644 --- a/be/src/service/backend_service.h +++ b/be/src/service/backend_service.h @@ -58,6 +58,7 @@ class TTransmitDataParams; class TUniqueId; class TIngestBinlogRequest; class TIngestBinlogResult; +class ThreadPool; // This class just forward rpc for actual handler // make this class because we can bind multiple service on single point @@ -137,10 +138,14 @@ class BackendService : public BackendServiceIf { void ingest_binlog(TIngestBinlogResult& result, const TIngestBinlogRequest& request) override; + void query_ingest_binlog(TQueryIngestBinlogResult& result, + const TQueryIngestBinlogRequest& request) override; + private: Status start_plan_fragment_execution(const TExecPlanFragmentParams& exec_params); ExecEnv* _exec_env; std::unique_ptr _agent_server; + std::unique_ptr _ingest_binlog_workers; }; } // namespace doris diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index 33fe22b12e7746..7da5c74683b41f 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -72,7 +72,6 @@ #include "util/debug_util.h" #include "util/disk_info.h" #include "util/mem_info.h" -#include "util/telemetry/telemetry.h" #include "util/thrift_rpc_helper.h" #include "util/thrift_server.h" #include "util/uid_util.h" @@ -480,6 +479,8 @@ int main(int argc, char** argv) { exit(-1); } + doris::ThreadLocalHandle::create_thread_local_if_not_exits(); + // init exec env auto exec_env(doris::ExecEnv::GetInstance()); status = 
doris::ExecEnv::init(doris::ExecEnv::GetInstance(), paths, broken_paths); @@ -488,8 +489,6 @@ int main(int argc, char** argv) { exit(-1); } - doris::telemetry::init_tracer(); - // begin to start services doris::ThriftRpcHelper::setup(exec_env); // 1. thrift server with be_port @@ -583,6 +582,7 @@ int main(int argc, char** argv) { brpc_service.reset(nullptr); LOG(INFO) << "Brpc service stopped"; exec_env->destroy(); + doris::ThreadLocalHandle::del_thread_local_if_count_is_zero(); LOG(INFO) << "Doris main exited."; return 0; } diff --git a/be/src/service/http_service.cpp b/be/src/service/http_service.cpp index 41578d976dde2a..c4f24cfdf8fc0f 100644 --- a/be/src/service/http_service.cpp +++ b/be/src/service/http_service.cpp @@ -17,6 +17,9 @@ #include "service/http_service.h" +#include +#include + #include #include #include @@ -59,6 +62,30 @@ #include "util/doris_metrics.h" namespace doris { +namespace { +std::shared_ptr<bufferevent_rate_limit_group> get_rate_limit_group(event_base* event_base) { + auto rate_limit = config::download_binlog_rate_limit_kbs; + if (rate_limit <= 0) { + return nullptr; + } + + auto max_value = std::numeric_limits<int32_t>::max() / 1024 * 10; + if (rate_limit > max_value) { + LOG(WARNING) << "rate limit is too large, set to max value."; + rate_limit = max_value; + } + struct timeval cfg_tick = {0, 100 * 1000}; // 100ms + rate_limit = rate_limit / 10 * 1024; // convert to KB/S + + auto token_bucket = std::unique_ptr<ev_token_bucket_cfg, decltype(&ev_token_bucket_cfg_free)>( + ev_token_bucket_cfg_new(rate_limit, rate_limit * 2, rate_limit, rate_limit * 2, + &cfg_tick), + ev_token_bucket_cfg_free); + return std::shared_ptr<bufferevent_rate_limit_group>( + bufferevent_rate_limit_group_new(event_base, token_bucket.get()), + bufferevent_rate_limit_group_free); +} +} // namespace HttpService::HttpService(ExecEnv* env, int port, int num_threads) : _env(env), @@ -72,6 +99,9 @@ HttpService::~HttpService() { Status HttpService::start() { add_default_path_handlers(_web_page_handler.get()); + auto event_base = _ev_http_server->get_event_bases()[0]; + _rate_limit_group = get_rate_limit_group(event_base.get()); + // register load StreamLoadAction* streamload_action = _pool.add(new StreamLoadAction(_env)); _ev_http_server->register_handler(HttpMethod::PUT, "/api/{db}/{table}/_load", @@ -93,18 +123,19 @@ Status HttpService::start() { for (auto& path : _env->store_paths()) { allow_paths.emplace_back(path.path); } - DownloadAction* download_action = _pool.add(new DownloadAction(_env, allow_paths)); + DownloadAction* download_action = _pool.add(new DownloadAction(_env, nullptr, allow_paths)); _ev_http_server->register_handler(HttpMethod::HEAD, "/api/_download_load", download_action); _ev_http_server->register_handler(HttpMethod::GET, "/api/_download_load", download_action); - DownloadAction* tablet_download_action = _pool.add(new DownloadAction(_env, allow_paths)); + DownloadAction* tablet_download_action = + _pool.add(new DownloadAction(_env, _rate_limit_group, allow_paths)); _ev_http_server->register_handler(HttpMethod::HEAD, "/api/_tablet/_download", tablet_download_action); _ev_http_server->register_handler(HttpMethod::GET, "/api/_tablet/_download", tablet_download_action); if (config::enable_single_replica_load) { DownloadAction* single_replica_download_action = _pool.add(new DownloadAction( - _env, allow_paths, config::single_replica_load_download_num_workers)); + _env, nullptr, allow_paths, config::single_replica_load_download_num_workers)); _ev_http_server->register_handler(HttpMethod::HEAD, "/api/_single_replica/_download", single_replica_download_action); 
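Aside on the arithmetic in get_rate_limit_group above (a hedged sketch; per_tick_bytes is a hypothetical helper, not part of this change): the configured KB/s budget is spread across ten 100 ms ticks, so each tick refills rate_limit / 10 * 1024 bytes; despite the inline "convert to KB/S" comment, the resulting unit is bytes per tick, with a 2x burst ceiling.

#include <cstdint>

// Illustrative only: the per-tick refill the token bucket above is built with,
// assuming the 100 ms cfg_tick.
constexpr int64_t per_tick_bytes(int64_t rate_limit_kbs) {
    // e.g. 10240 KB/s -> 10240 / 10 * 1024 = 1048576 bytes per 100 ms tick
    return rate_limit_kbs / 10 * 1024;
}
static_assert(per_tick_bytes(10240) == 1048576, "10 MB/s refills 1 MiB per tick");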
_ev_http_server->register_handler(HttpMethod::GET, "/api/_single_replica/_download", @@ -118,7 +149,8 @@ Status HttpService::start() { _ev_http_server->register_handler(HttpMethod::HEAD, "/api/_load_error_log", error_log_download_action); - DownloadBinlogAction* download_binlog_action = _pool.add(new DownloadBinlogAction(_env)); + DownloadBinlogAction* download_binlog_action = + _pool.add(new DownloadBinlogAction(_env, _rate_limit_group)); _ev_http_server->register_handler(HttpMethod::GET, "/api/_binlog/_download", download_binlog_action); _ev_http_server->register_handler(HttpMethod::HEAD, "/api/_binlog/_download", diff --git a/be/src/service/http_service.h b/be/src/service/http_service.h index 46b5b3c5b3c40e..eeaf95165949ee 100644 --- a/be/src/service/http_service.h +++ b/be/src/service/http_service.h @@ -22,6 +22,8 @@ #include "common/object_pool.h" #include "common/status.h" +struct bufferevent_rate_limit_group; + namespace doris { class ExecEnv; @@ -47,6 +49,8 @@ class HttpService { std::unique_ptr _ev_http_server; std::unique_ptr _web_page_handler; + std::shared_ptr _rate_limit_group {nullptr}; + bool stopped = false; }; diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 8f9ed8c0d096d0..0c12e910c3391f 100644 --- a/be/src/service/internal_service.cpp +++ b/be/src/service/internal_service.cpp @@ -86,7 +86,6 @@ #include "runtime/exec_env.h" #include "runtime/fold_constant_executor.h" #include "runtime/fragment_mgr.h" -#include "runtime/group_commit_mgr.h" #include "runtime/load_channel_mgr.h" #include "runtime/load_stream_mgr.h" #include "runtime/result_buffer_mgr.h" @@ -108,8 +107,6 @@ #include "util/runtime_profile.h" #include "util/stopwatch.hpp" #include "util/string_util.h" -#include "util/telemetry/brpc_carrier.h" -#include "util/telemetry/telemetry.h" #include "util/thrift_util.h" #include "util/time.h" #include "util/uid_util.h" @@ -304,8 +301,6 @@ void PInternalServiceImpl::exec_plan_fragment(google::protobuf::RpcController* c void PInternalServiceImpl::_exec_plan_fragment_in_pthread( google::protobuf::RpcController* controller, const PExecPlanFragmentRequest* request, PExecPlanFragmentResult* response, google::protobuf::Closure* done) { - auto span = telemetry::start_rpc_server_span("exec_plan_fragment", controller); - auto scope = OpentelemetryScope {span}; brpc::ClosureGuard closure_guard(done); auto st = Status::OK(); bool compact = request->has_compact() ? 
request->compact() : false; @@ -338,13 +333,11 @@ void PInternalServiceImpl::exec_plan_fragment_prepare(google::protobuf::RpcContr } } -void PInternalServiceImpl::exec_plan_fragment_start(google::protobuf::RpcController* controller, +void PInternalServiceImpl::exec_plan_fragment_start(google::protobuf::RpcController* /*controller*/, const PExecPlanFragmentStartRequest* request, PExecPlanFragmentResult* result, google::protobuf::Closure* done) { - bool ret = _light_work_pool.try_offer([this, controller, request, result, done]() { - auto span = telemetry::start_rpc_server_span("exec_plan_fragment_start", controller); - auto scope = OpentelemetryScope {span}; + bool ret = _light_work_pool.try_offer([this, request, result, done]() { brpc::ClosureGuard closure_guard(done); auto st = _exec_env->fragment_mgr()->start_query_execution(request); st.to_protobuf(result->mutable_status()); @@ -356,8 +349,8 @@ void PInternalServiceImpl::exec_plan_fragment_start(google::protobuf::RpcControl } void PInternalServiceImpl::open_load_stream(google::protobuf::RpcController* controller, - const POpenStreamSinkRequest* request, - POpenStreamSinkResponse* response, + const POpenLoadStreamRequest* request, + POpenLoadStreamResponse* response, google::protobuf::Closure* done) { bool ret = _light_work_pool.try_offer([this, controller, request, response, done]() { signal::set_signal_task_id(request->load_id()); @@ -402,7 +395,6 @@ void PInternalServiceImpl::open_load_stream(google::protobuf::RpcController* con return; } - load_stream->add_rpc_stream(); VLOG_DEBUG << "get streamid =" << streamid; st.to_protobuf(response->mutable_status()); }); @@ -498,9 +490,9 @@ void PInternalServiceImpl::tablet_writer_cancel(google::protobuf::RpcController* } } -Status PInternalServiceImpl::_exec_plan_fragment_impl(const std::string& ser_request, - PFragmentRequestVersion version, - bool compact) { +Status PInternalServiceImpl::_exec_plan_fragment_impl( + const std::string& ser_request, PFragmentRequestVersion version, bool compact, + const std::function& cb) { // Sometimes the BE do not receive the first heartbeat message and it receives request from FE // If BE execute this fragment, it will core when it wants to get some property from master info. 
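Aside: a hedged sketch of the dispatch shape this refactor gives _exec_plan_fragment_impl (FragmentCb and exec_one are hypothetical names; the two exec_plan_fragment overloads are the ones visible in this diff). Each Thrift version branch deserializes its own request type and forwards the callback only when the caller actually supplied one, since a default-constructed std::function is falsy.

#include <functional>

// Illustrative only, not part of the diff.
using FragmentCb = std::function<void(RuntimeState*, Status*)>;

Status exec_one(FragmentMgr* mgr, const TExecPlanFragmentParams& params, const FragmentCb& cb) {
    // Empty callback -> keep the old single-argument call path intact.
    return cb ? mgr->exec_plan_fragment(params, cb) : mgr->exec_plan_fragment(params);
}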
if (ExecEnv::GetInstance()->master_info() == nullptr) { @@ -516,7 +508,11 @@ Status PInternalServiceImpl::_exec_plan_fragment_impl(const std::string& ser_req uint32_t len = ser_request.size(); RETURN_IF_ERROR(deserialize_thrift_msg(buf, &len, compact, &t_request)); } - return _exec_env->fragment_mgr()->exec_plan_fragment(t_request); + if (cb) { + return _exec_env->fragment_mgr()->exec_plan_fragment(t_request, cb); + } else { + return _exec_env->fragment_mgr()->exec_plan_fragment(t_request); + } } else if (version == PFragmentRequestVersion::VERSION_2) { TExecPlanFragmentParamsList t_request; { @@ -526,7 +522,11 @@ Status PInternalServiceImpl::_exec_plan_fragment_impl(const std::string& ser_req } for (const TExecPlanFragmentParams& params : t_request.paramsList) { - RETURN_IF_ERROR(_exec_env->fragment_mgr()->exec_plan_fragment(params)); + if (cb) { + RETURN_IF_ERROR(_exec_env->fragment_mgr()->exec_plan_fragment(params, cb)); + } else { + RETURN_IF_ERROR(_exec_env->fragment_mgr()->exec_plan_fragment(params)); + } } return Status::OK(); } else if (version == PFragmentRequestVersion::VERSION_3) { @@ -538,7 +538,11 @@ Status PInternalServiceImpl::_exec_plan_fragment_impl(const std::string& ser_req } for (const TPipelineFragmentParams& params : t_request.params_list) { - RETURN_IF_ERROR(_exec_env->fragment_mgr()->exec_plan_fragment(params)); + if (cb) { + RETURN_IF_ERROR(_exec_env->fragment_mgr()->exec_plan_fragment(params, cb)); + } else { + RETURN_IF_ERROR(_exec_env->fragment_mgr()->exec_plan_fragment(params)); + } } return Status::OK(); } else { @@ -546,13 +550,11 @@ Status PInternalServiceImpl::_exec_plan_fragment_impl(const std::string& ser_req } } -void PInternalServiceImpl::cancel_plan_fragment(google::protobuf::RpcController* controller, +void PInternalServiceImpl::cancel_plan_fragment(google::protobuf::RpcController* /*controller*/, const PCancelPlanFragmentRequest* request, PCancelPlanFragmentResult* result, google::protobuf::Closure* done) { - bool ret = _light_work_pool.try_offer([this, controller, request, result, done]() { - auto span = telemetry::start_rpc_server_span("exec_plan_fragment_start", controller); - auto scope = OpentelemetryScope {span}; + bool ret = _light_work_pool.try_offer([this, request, result, done]() { brpc::ClosureGuard closure_guard(done); TUniqueId tid; tid.__set_hi(request->finst_id().hi()); @@ -707,23 +709,19 @@ void PInternalServiceImpl::fetch_arrow_flight_schema(google::protobuf::RpcContro google::protobuf::Closure* done) { bool ret = _light_work_pool.try_offer([request, result, done]() { brpc::ClosureGuard closure_guard(done); - RowDescriptor row_desc = ExecEnv::GetInstance()->result_mgr()->find_row_descriptor( - UniqueId(request->finst_id()).to_thrift()); - if (row_desc.equals(RowDescriptor())) { - auto st = Status::NotFound("not found row descriptor"); - st.to_protobuf(result->mutable_status()); - return; - } - - std::shared_ptr schema; - auto st = convert_to_arrow_schema(row_desc, &schema); - if (UNLIKELY(!st.ok())) { + std::shared_ptr schema = + ExecEnv::GetInstance()->result_mgr()->find_arrow_schema( + UniqueId(request->finst_id()).to_thrift()); + if (schema == nullptr) { + LOG(INFO) << "not found arrow flight schema, maybe query has been canceled"; + auto st = Status::NotFound( + "not found arrow flight schema, maybe query has been canceled"); st.to_protobuf(result->mutable_status()); return; } std::string schema_str; - st = serialize_arrow_schema(row_desc, &schema, &schema_str); + auto st = serialize_arrow_schema(&schema, &schema_str); if 
(st.ok()) { result->set_schema(std::move(schema_str)); } @@ -1177,7 +1175,7 @@ void PInternalServiceImpl::_transmit_block(google::protobuf::RpcController* cont st.to_protobuf(response->mutable_status()); if (extract_st.ok()) { st = _exec_env->vstream_mgr()->transmit_block(request, &done); - if (!st.ok()) { + if (!st.ok() && !st.is<ErrorCode::END_OF_FILE>()) { LOG(WARNING) << "transmit_block failed, message=" << st << ", fragment_instance_id=" << print_id(request->finst_id()) << ", node=" << request->node_id() @@ -1379,6 +1377,8 @@ void PInternalServiceImpl::request_slave_tablet_pull_rowset( RowsetId remote_rowset_id = rowset_meta->rowset_id(); // change rowset id because it maybe same as other local rowset RowsetId new_rowset_id = StorageEngine::instance()->next_rowset_id(); + auto pending_rs_guard = + StorageEngine::instance()->pending_local_rowsets().add(new_rowset_id); rowset_meta->set_rowset_id(new_rowset_id); rowset_meta->set_tablet_uid(tablet->tablet_uid()); VLOG_CRITICAL << "succeed to init rowset meta for slave replica. rowset_id=" @@ -1478,7 +1478,7 @@ void PInternalServiceImpl::request_slave_tablet_pull_rowset( Status commit_txn_status = StorageEngine::instance()->txn_manager()->commit_txn( tablet->data_dir()->get_meta(), rowset_meta->partition_id(), rowset_meta->txn_id(), rowset_meta->tablet_id(), tablet->tablet_uid(), rowset_meta->load_id(), rowset, - false); + std::move(pending_rs_guard), false); if (!commit_txn_status && !commit_txn_status.is<ErrorCode::PUSH_TRANSACTION_ALREADY_EXIST>()) { LOG(WARNING) << "failed to add committed rowset for slave replica. rowset_id=" << rowset_meta->rowset_id() << ", tablet_id=" << rowset_meta->tablet_id() @@ -1518,37 +1518,35 @@ void PInternalServiceImpl::_response_pull_slave_rowset(const std::string& remote return; } - PTabletWriteSlaveDoneRequest request; - request.set_txn_id(txn_id); - request.set_tablet_id(tablet_id); - request.set_node_id(node_id); - request.set_is_succeed(is_succeed); - RefCountClosure<PTabletWriteSlaveDoneResult>* closure = - new RefCountClosure<PTabletWriteSlaveDoneResult>(); - closure->ref(); - closure->ref(); - closure->cntl.set_timeout_ms(config::slave_replica_writer_rpc_timeout_sec * 1000); - closure->cntl.ignore_eovercrowded(); - stub->response_slave_tablet_pull_rowset(&closure->cntl, &request, &closure->result, closure); - - closure->join(); - if (closure->cntl.Failed()) { + auto request = std::make_shared<PTabletWriteSlaveDoneRequest>(); + request->set_txn_id(txn_id); + request->set_tablet_id(tablet_id); + request->set_node_id(node_id); + request->set_is_succeed(is_succeed); + auto pull_rowset_callback = DummyBrpcCallback<PTabletWriteSlaveDoneResult>::create_shared(); + auto closure = AutoReleaseClosure< + PTabletWriteSlaveDoneRequest, + DummyBrpcCallback<PTabletWriteSlaveDoneResult>>::create_unique(request, + pull_rowset_callback); + closure->cntl_->set_timeout_ms(config::slave_replica_writer_rpc_timeout_sec * 1000); + closure->cntl_->ignore_eovercrowded(); + stub->response_slave_tablet_pull_rowset(closure->cntl_.get(), closure->request_.get(), + closure->response_.get(), closure.get()); + closure.release(); + + pull_rowset_callback->join(); + if (pull_rowset_callback->cntl_->Failed()) { if (!ExecEnv::GetInstance()->brpc_internal_client_cache()->available(stub, remote_host, brpc_port)) { ExecEnv::GetInstance()->brpc_internal_client_cache()->erase( - closure->cntl.remote_side()); + closure->cntl_->remote_side()); } LOG(WARNING) << "failed to response result of slave replica to master replica, error=" - << berror(closure->cntl.ErrorCode()) - << ", error_text=" << closure->cntl.ErrorText() + << berror(pull_rowset_callback->cntl_->ErrorCode()) + << ", error_text=" << pull_rowset_callback->cntl_->ErrorText() << ", master 
host: " << remote_host << ", tablet_id=" << tablet_id << ", txn_id=" << txn_id; } - - if (closure->unref()) { - delete closure; - } - closure = nullptr; VLOG_CRITICAL << "succeed to response the result of slave replica pull rowset to master " "replica. master host: " << remote_host << ". is_succeed=" << is_succeed << ", tablet_id=" << tablet_id @@ -1785,54 +1783,67 @@ void PInternalServiceImpl::group_commit_insert(google::protobuf::RpcController* const PGroupCommitInsertRequest* request, PGroupCommitInsertResponse* response, google::protobuf::Closure* done) { - bool ret = _light_work_pool.try_offer([this, request, response, done]() { + TUniqueId load_id; + load_id.__set_hi(request->load_id().hi()); + load_id.__set_lo(request->load_id().lo()); + bool ret = _light_work_pool.try_offer([this, request, response, done, load_id]() { brpc::ClosureGuard closure_guard(done); - auto table_id = request->table_id(); - Status st = Status::OK(); - TPlan plan; - { - auto& plan_node = request->plan_node(); - const uint8_t* buf = (const uint8_t*)plan_node.data(); - uint32_t len = plan_node.size(); - st = deserialize_thrift_msg(buf, &len, false, &plan); - if (UNLIKELY(!st.ok())) { - LOG(WARNING) << "deserialize plan failed, msg=" << st; - response->mutable_status()->set_status_code(st.code()); - response->mutable_status()->set_error_msgs(0, st.to_string()); - return; - } - } - TDescriptorTable tdesc_tbl; - { - auto& desc_tbl = request->desc_tbl(); - const uint8_t* buf = (const uint8_t*)desc_tbl.data(); - uint32_t len = desc_tbl.size(); - st = deserialize_thrift_msg(buf, &len, false, &tdesc_tbl); - if (UNLIKELY(!st.ok())) { - LOG(WARNING) << "deserialize desc tbl failed, msg=" << st; - response->mutable_status()->set_status_code(st.code()); - response->mutable_status()->set_error_msgs(0, st.to_string()); - return; + std::shared_ptr ctx = std::make_shared(_exec_env); + auto pipe = std::make_shared( + io::kMaxPipeBufferedBytes /* max_buffered_bytes */, 64 * 1024 /* min_chunk_size */, + -1 /* total_length */, true /* use_proto */); + ctx->pipe = pipe; + Status st = _exec_env->new_load_stream_mgr()->put(load_id, ctx); + if (st.ok()) { + doris::Mutex mutex; + doris::ConditionVariable cv; + bool handled = false; + try { + st = _exec_plan_fragment_impl( + request->exec_plan_fragment_request().request(), + request->exec_plan_fragment_request().version(), + request->exec_plan_fragment_request().compact(), + [&](RuntimeState* state, Status* status) { + response->set_label(state->import_label()); + response->set_txn_id(state->wal_id()); + response->set_loaded_rows(state->num_rows_load_success()); + response->set_filtered_rows(state->num_rows_load_filtered()); + st = *status; + std::unique_lock l(mutex); + handled = true; + cv.notify_one(); + }); + } catch (const Exception& e) { + st = e.to_status(); + } catch (...) 
{ + st = Status::Error(ErrorCode::INTERNAL_ERROR, + "_exec_plan_fragment_impl meet unknown error"); } - } - TScanRangeParams tscan_range_params; - { - auto& bytes = request->scan_range_params(); - const uint8_t* buf = (const uint8_t*)bytes.data(); - uint32_t len = bytes.size(); - st = deserialize_thrift_msg(buf, &len, false, &tscan_range_params); - if (UNLIKELY(!st.ok())) { - LOG(WARNING) << "deserialize scan range failed, msg=" << st; - response->mutable_status()->set_status_code(st.code()); - response->mutable_status()->set_error_msgs(0, st.to_string()); - return; + if (!st.ok()) { + LOG(WARNING) << "exec plan fragment failed, errmsg=" << st; + } else { + for (int i = 0; i < request->data().size(); ++i) { + std::unique_ptr<PDataRow> row(new PDataRow()); + row->CopyFrom(request->data(i)); + st = pipe->append(std::move(row)); + if (!st.ok()) { + break; + } + } + if (st.ok()) { + static_cast<void>(pipe->finish()); + std::unique_lock l(mutex); + if (!handled) { + cv.wait(l); + } + } } } - st = _exec_env->group_commit_mgr()->group_commit_insert( - table_id, plan, tdesc_tbl, tscan_range_params, request, response); st.to_protobuf(response->mutable_status()); + _exec_env->new_load_stream_mgr()->remove(load_id); }); if (!ret) { + _exec_env->new_load_stream_mgr()->remove(load_id); offer_failed(response, done, _light_work_pool); return; } diff --git a/be/src/service/internal_service.h b/be/src/service/internal_service.h index d5712e654cfc42..ca28c8b8b06c52 100644 --- a/be/src/service/internal_service.h +++ b/be/src/service/internal_service.h @@ -38,6 +38,7 @@ class ExecEnv; class PHandShakeRequest; class PHandShakeResponse; class LoadStreamMgr; +class RuntimeState; class PInternalServiceImpl : public PBackendService { public: @@ -93,7 +94,7 @@ class PInternalServiceImpl : public PBackendService { google::protobuf::Closure* done) override; void open_load_stream(google::protobuf::RpcController* controller, - const POpenStreamSinkRequest* request, POpenStreamSinkResponse* response, + const POpenLoadStreamRequest* request, POpenLoadStreamResponse* response, google::protobuf::Closure* done) override; void tablet_writer_add_block(google::protobuf::RpcController* controller, @@ -211,7 +212,9 @@ class PInternalServiceImpl : public PBackendService { google::protobuf::Closure* done); Status _exec_plan_fragment_impl(const std::string& s_request, PFragmentRequestVersion version, - bool compact); + bool compact, + const std::function<void(RuntimeState*, Status*)>& cb = + std::function<void(RuntimeState*, Status*)>()); Status _fold_constant_expr(const std::string& ser_request, PConstantExprResult* response); diff --git a/be/src/service/point_query_executor.cpp b/be/src/service/point_query_executor.cpp index ca5b8752e06977..580e0b66df612c 100644 --- a/be/src/service/point_query_executor.cpp +++ b/be/src/service/point_query_executor.cpp @@ -51,7 +51,7 @@ Reusable::~Reusable() {} constexpr static int s_preallocted_blocks_num = 64; Status Reusable::init(const TDescriptorTable& t_desc_tbl, const std::vector<TExpr>& output_exprs, size_t block_size) { - SCOPED_MEM_COUNT(&_mem_size); + SCOPED_MEM_COUNT_BY_HOOK(&_mem_size); _runtime_state = RuntimeState::create_unique(); RETURN_IF_ERROR(DescriptorTbl::create(_runtime_state->obj_pool(), t_desc_tbl, &_desc_tbl)); _runtime_state->set_desc_tbl(_desc_tbl); @@ -254,9 +254,8 @@ Status PointQueryExecutor::_init_keys(const PTabletKeyLookupRequest* request) { RowCursor cursor; RETURN_IF_ERROR(cursor.init_scan_key(_tablet->tablet_schema(), olap_tuples[i].values())); RETURN_IF_ERROR(cursor.from_tuple(olap_tuples[i])); - 
encode_key_with_padding(&_row_read_ctxs[i]._primary_key, cursor, - _tablet->tablet_schema()->num_key_columns(), - true); + encode_key_with_padding(&_row_read_ctxs[i]._primary_key, cursor, + _tablet->tablet_schema()->num_key_columns(), true); } return Status::OK(); } diff --git a/be/src/udf/udf.cpp b/be/src/udf/udf.cpp index ecabc905ab8335..40f8e70ed0948d 100644 --- a/be/src/udf/udf.cpp +++ b/be/src/udf/udf.cpp @@ -57,6 +57,8 @@ std::unique_ptr FunctionContext::clone() { new_context->_constant_cols = _constant_cols; new_context->_fragment_local_fn_state = _fragment_local_fn_state; new_context->_check_overflow_for_decimal = _check_overflow_for_decimal; + new_context->_string_as_jsonb_string = _string_as_jsonb_string; + new_context->_jsonb_string_as_string = _jsonb_string_as_string; return new_context; } diff --git a/be/src/udf/udf.h b/be/src/udf/udf.h index a1413a5ab89047..e383904c5b4a25 100644 --- a/be/src/udf/udf.h +++ b/be/src/udf/udf.h @@ -80,6 +80,22 @@ class FunctionContext { return _check_overflow_for_decimal = check_overflow_for_decimal; } + void set_string_as_jsonb_string(bool string_as_jsonb_string) { + _string_as_jsonb_string = string_as_jsonb_string; + } + + void set_jsonb_string_as_string(bool jsonb_string_as_string) { + _jsonb_string_as_string = jsonb_string_as_string; + } + + // Cast flag, when enable string_as_jsonb_string, string casting to jsonb will not parse string + // instead just insert a string literal + bool string_as_jsonb_string() const { return _string_as_jsonb_string; } + + // Cast flag, when enable jsonb_string_as_string, jsonb string casting to string will not parse string + // instead just insert a string literal + bool jsonb_string_as_string() const { return _jsonb_string_as_string; } + // Sets an error for this UDF. If this is called, this will trigger the // query to fail. 
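Aside: a hedged illustration of the two cast flags added above (cast_string_to_jsonb is a hypothetical stand-in, not the real serde entry point). With string_as_jsonb_string set, CAST('abc' AS JSONB) keeps the input as the JSON string literal "abc" instead of attempting, and failing, to parse abc as a JSON document.

#include <optional>
#include <string>

// Illustrative only: the flag decides between "wrap as string literal" and
// "parse as a JSON document" (stubbed out here).
std::optional<std::string> cast_string_to_jsonb(const std::string& s,
                                                bool string_as_jsonb_string) {
    if (string_as_jsonb_string) {
        return '"' + s + '"'; // store the input as a JSON string literal
    }
    return std::nullopt; // stand-in for real JSON parsing, which 'abc' would fail
}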
// Note: when you set error for the UDFs used in Data Load, you should @@ -162,6 +178,9 @@ class FunctionContext { bool _check_overflow_for_decimal = false; + bool _string_as_jsonb_string = false; + bool _jsonb_string_as_string = false; + std::string _string_result; vectorized::Arena arena; diff --git a/be/src/util/aligned_new.h b/be/src/util/aligned_new.h index 2800b19e4f9065..1d97010cad4c86 100644 --- a/be/src/util/aligned_new.h +++ b/be/src/util/aligned_new.h @@ -22,7 +22,6 @@ #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/logging.h" diff --git a/be/src/util/arrow/block_convertor.cpp b/be/src/util/arrow/block_convertor.cpp index 7f506d03111aec..2c605c9c96869b 100644 --- a/be/src/util/arrow/block_convertor.cpp +++ b/be/src/util/arrow/block_convertor.cpp @@ -388,8 +388,9 @@ Status FromBlockConverter::convert(std::shared_ptr* out) { return to_doris_status(arrow_st); } _cur_builder = builder.get(); + auto column = _cur_col->convert_to_full_column_if_const(); try { - _cur_type->get_serde()->write_column_to_arrow(*_cur_col, nullptr, _cur_builder, + _cur_type->get_serde()->write_column_to_arrow(*column, nullptr, _cur_builder, _cur_start, _cur_start + _cur_rows); } catch (std::exception& e) { return Status::InternalError("Fail to convert block data to arrow data, error: {}", diff --git a/be/src/util/arrow/row_batch.cpp b/be/src/util/arrow/row_batch.cpp index 6a44da2ec6b642..6662f2e0ba7aee 100644 --- a/be/src/util/arrow/row_batch.cpp +++ b/be/src/util/arrow/row_batch.cpp @@ -39,6 +39,8 @@ #include "runtime/types.h" #include "util/arrow/block_convertor.h" #include "vec/core/block.h" +#include "vec/exprs/vexpr.h" +#include "vec/exprs/vexpr_context.h" namespace doris { @@ -163,6 +165,22 @@ Status convert_to_arrow_schema(const RowDescriptor& row_desc, return Status::OK(); } +Status convert_expr_ctxs_arrow_schema(const vectorized::VExprContextSPtrs& output_vexpr_ctxs, + std::shared_ptr* result) { + std::vector> fields; + for (auto expr_ctx : output_vexpr_ctxs) { + std::shared_ptr arrow_type; + auto root_expr = expr_ctx->root(); + RETURN_IF_ERROR(convert_to_arrow_type(root_expr->type(), &arrow_type)); + auto field_name = root_expr->is_slot_ref() ? 
root_expr->expr_name() + : root_expr->data_type()->get_name(); + fields.push_back( + std::make_shared<arrow::Field>(field_name, arrow_type, root_expr->is_nullable())); + } + *result = arrow::schema(std::move(fields)); + return Status::OK(); +} + Status serialize_record_batch(const arrow::RecordBatch& record_batch, std::string* result) { // create sink memory buffer outputstream with the computed capacity int64_t capacity; @@ -206,15 +224,13 @@ Status serialize_record_batch(const arrow::RecordBatch& record_batch, std::strin return Status::OK(); } -Status serialize_arrow_schema(RowDescriptor row_desc, std::shared_ptr<arrow::Schema>* schema, - std::string* result) { - std::vector<SlotDescriptor*> slots; - for (auto tuple_desc : row_desc.tuple_descriptors()) { - slots.insert(slots.end(), tuple_desc->slots().begin(), tuple_desc->slots().end()); +Status serialize_arrow_schema(std::shared_ptr<arrow::Schema>* schema, std::string* result) { + auto make_empty_result = arrow::RecordBatch::MakeEmpty(*schema); + if (!make_empty_result.ok()) { + return Status::InternalError("serialize_arrow_schema failed, reason: {}", + make_empty_result.status().ToString()); } - auto block = vectorized::Block(slots, 0); - std::shared_ptr<arrow::RecordBatch> batch; - RETURN_IF_ERROR(convert_to_arrow_batch(block, *schema, arrow::default_memory_pool(), &batch)); + auto batch = make_empty_result.ValueOrDie(); return serialize_record_batch(*batch, result); } diff --git a/be/src/util/arrow/row_batch.h b/be/src/util/arrow/row_batch.h index 1bd408754f1b58..ddffc3324d3451 100644 --- a/be/src/util/arrow/row_batch.h +++ b/be/src/util/arrow/row_batch.h @@ -23,6 +23,7 @@ #include "common/status.h" #include "runtime/types.h" #include "vec/core/block.h" +#include "vec/exprs/vexpr_fwd.h" // This file will convert Doris RowBatch to/from Arrow's RecordBatch // RowBatch is used by Doris query engine to exchange data between @@ -49,9 +50,11 @@ Status convert_to_arrow_schema(const RowDescriptor& row_desc, Status convert_block_arrow_schema(const vectorized::Block& block, std::shared_ptr<arrow::Schema>* result); +Status convert_expr_ctxs_arrow_schema(const vectorized::VExprContextSPtrs& output_vexpr_ctxs, + std::shared_ptr<arrow::Schema>* result); + Status serialize_record_batch(const arrow::RecordBatch& record_batch, std::string* result); -Status serialize_arrow_schema(RowDescriptor row_desc, std::shared_ptr<arrow::Schema>* schema, - std::string* result); +Status serialize_arrow_schema(std::shared_ptr<arrow::Schema>* schema, std::string* result); } // namespace doris diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h index 3396b24c9d5c98..230134ade092b6 100644 --- a/be/src/util/bit_util.h +++ b/be/src/util/bit_util.h @@ -24,7 +24,6 @@ #include #endif -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "gutil/bits.h" #include "gutil/endian.h" diff --git a/be/src/util/bitmap_value.h b/be/src/util/bitmap_value.h index 0664735599a8ac..71edf62a8adb11 100644 --- a/be/src/util/bitmap_value.h +++ b/be/src/util/bitmap_value.h @@ -1172,10 +1172,11 @@ class BitmapValue { using SetContainer = phmap::flat_hash_set<T>; // Construct an empty bitmap. - BitmapValue() : _type(EMPTY), _is_shared(false) {} + BitmapValue() : _sv(0), _bitmap(nullptr), _type(EMPTY), _is_shared(false) { _set.clear(); } // Construct a bitmap with one element. - explicit BitmapValue(uint64_t value) : _sv(value), _type(SINGLE), _is_shared(false) {} + explicit BitmapValue(uint64_t value) + : _sv(value), _bitmap(nullptr), _type(SINGLE), _is_shared(false) {} // Construct a bitmap from serialized data. 
explicit BitmapValue(const char* src) : _is_shared(false) { @@ -1199,7 +1200,7 @@ class BitmapValue { break; } - if (other._type != EMPTY) { + if (other._type == BITMAP) { _is_shared = true; // should also set other's state to shared, so that other bitmap value will // create a new bitmap when it wants to modify it. @@ -1229,6 +1230,10 @@ class BitmapValue { } BitmapValue& operator=(const BitmapValue& other) { + if (this == &other) { + return *this; + } + reset(); _type = other._type; switch (other._type) { case EMPTY: @@ -1244,7 +1249,7 @@ class BitmapValue { break; } - if (other._type != EMPTY) { + if (other._type == BITMAP) { _is_shared = true; // should also set other's state to shared, so that other bitmap value will // create a new bitmap when it wants to modify it. @@ -1265,6 +1270,7 @@ class BitmapValue { if (this == &other) { return *this; } + reset(); _type = other._type; switch (other._type) { @@ -1721,8 +1727,7 @@ class BitmapValue { BitmapValue& operator&=(const BitmapValue& rhs) { switch (rhs._type) { case EMPTY: - _type = EMPTY; - _bitmap.reset(); + reset(); // empty & any = empty break; case SINGLE: switch (_type) { @@ -1741,6 +1746,7 @@ class BitmapValue { _sv = rhs._sv; } _bitmap.reset(); + _is_shared = false; break; case SET: if (!_set.contains(rhs._sv)) { @@ -1797,6 +1803,7 @@ class BitmapValue { } _type = SET; _bitmap.reset(); + _is_shared = false; _convert_to_smaller_type(); break; case SET: @@ -1832,7 +1839,6 @@ class BitmapValue { case SINGLE: if (_sv == rhs._sv) { _type = EMPTY; - _bitmap.reset(); } else { add(rhs._sv); } @@ -2162,7 +2168,7 @@ class BitmapValue { // Return how many bytes are required to serialize this bitmap. // See BitmapTypeCode for the serialized format. - size_t getSizeInBytes() { + size_t getSizeInBytes() const { size_t res = 0; switch (_type) { case EMPTY: @@ -2613,12 +2619,13 @@ class BitmapValue { } } - void clear() { + void reset() { _type = EMPTY; - _bitmap.reset(); _sv = 0; + _set.clear(); + _is_shared = false; + _bitmap = nullptr; } - // Implement an iterator for convenience friend class BitmapValueIterator; typedef BitmapValueIterator b_iterator; diff --git a/be/src/util/brpc_client_cache.h b/be/src/util/brpc_client_cache.h index e050ad53cd9aed..ff537850854f9e 100644 --- a/be/src/util/brpc_client_cache.h +++ b/be/src/util/brpc_client_cache.h @@ -38,7 +38,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "util/network_util.h" diff --git a/be/src/util/core_local.cpp b/be/src/util/core_local.cpp index a6c867656076f2..1c4b1dd04715b4 100644 --- a/be/src/util/core_local.cpp +++ b/be/src/util/core_local.cpp @@ -22,7 +22,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/logging.h" #include "util/spinlock.h" diff --git a/be/src/util/core_local.h b/be/src/util/core_local.h index e23a6820d98517..1610ae5a0bb046 100644 --- a/be/src/util/core_local.h +++ b/be/src/util/core_local.h @@ -29,7 +29,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep namespace doris { diff --git a/be/src/util/doris_metrics.cpp b/be/src/util/doris_metrics.cpp index 6b46c0cab6a0b3..21264ed2fa2ad5 100644 --- a/be/src/util/doris_metrics.cpp +++ b/be/src/util/doris_metrics.cpp @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -341,59 +342,52 @@ void DorisMetrics::_update() { } // get num of thread of doris_be process -// from /proc/pid/task +// 
from /proc/self/task void DorisMetrics::_update_process_thread_num() { - int64_t pid = getpid(); - std::stringstream ss; - ss << "/proc/" << pid << "/task/"; - - int64_t count = 0; - auto cb = [&count](const io::FileInfo& file) -> bool { - count += 1; - return true; - }; - Status st = io::global_local_filesystem()->iterate_directory(ss.str(), cb); - if (!st.ok()) { - LOG(WARNING) << "failed to count thread num: " << st; - process_thread_num->set_value(0); + std::error_code ec; + std::filesystem::directory_iterator dict_iter("/proc/self/task/", ec); + if (ec) { + LOG(WARNING) << "failed to count thread num: " << ec.message(); + process_thread_num->set_value(0); return; } + // every entry under /proc/self/task/ is a per-thread directory, so count all entries + int64_t count = std::distance(dict_iter, std::filesystem::end(dict_iter)); process_thread_num->set_value(count); } // get num of file descriptor of doris_be process void DorisMetrics::_update_process_fd_num() { - int64_t pid = getpid(); - // fd used - std::stringstream ss; - ss << "/proc/" << pid << "/fd/"; - int64_t count = 0; - auto cb = [&count](const io::FileInfo& file) -> bool { - count += 1; - return true; - }; - Status st = io::global_local_filesystem()->iterate_directory(ss.str(), cb); - if (!st.ok()) { - LOG(WARNING) << "failed to count fd: " << st; + std::error_code ec; + std::filesystem::directory_iterator dict_iter("/proc/self/fd/", ec); + if (ec) { + LOG(WARNING) << "failed to count fd: " << ec.message(); process_fd_num_used->set_value(0); return; } + // fd entries are symlinks (including sockets and pipes), so count all entries + int64_t count = std::distance(dict_iter, std::filesystem::end(dict_iter)); + process_fd_num_used->set_value(count); // fd limits - std::stringstream ss2; - ss2 << "/proc/" << pid << "/limits"; - FILE* fp = fopen(ss2.str().c_str(), "r"); + FILE* fp = fopen("/proc/self/limits", "r"); if (fp == nullptr) { char buf[64]; - LOG(WARNING) << "open " << ss2.str() << " failed, errno=" << errno + LOG(WARNING) << "open /proc/self/limits failed, errno=" << errno << ", message=" << strerror_r(errno, buf, 64); return; } - // /proc/pid/limits + // /proc/self/limits // Max open files 65536 65536 files int64_t values[2]; size_t line_buf_size = 0; diff --git a/be/src/util/doris_metrics.h b/be/src/util/doris_metrics.h index 60ca3b51072cf5..409e6dae2b6396 100644 --- a/be/src/util/doris_metrics.h +++ b/be/src/util/doris_metrics.h @@ -228,6 +228,18 @@ class DorisMetrics { UIntGauge* heavy_work_max_threads; UIntGauge* light_work_max_threads; + UIntGauge* flush_thread_pool_queue_size; + UIntGauge* flush_thread_pool_thread_num; + + UIntGauge* local_scan_thread_pool_queue_size; + UIntGauge* local_scan_thread_pool_thread_num; + UIntGauge* remote_scan_thread_pool_queue_size; + UIntGauge* remote_scan_thread_pool_thread_num; + UIntGauge* limited_scan_thread_pool_queue_size; + UIntGauge* limited_scan_thread_pool_thread_num; + UIntGauge* group_local_scan_thread_pool_queue_size; + UIntGauge* group_local_scan_thread_pool_thread_num; + static DorisMetrics* instance() { static DorisMetrics instance; return &instance; diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp index 355303087cb071..afa8a1453864aa 100644 --- a/be/src/util/hash_util.hpp +++ b/be/src/util/hash_util.hpp @@ -26,7 +26,6 @@ #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "gutil/hash/hash.h" // IWYU pragma: keep #include
"runtime/define_primitive_type.h" diff --git a/be/src/util/jsonb_document.h b/be/src/util/jsonb_document.h index 657438fefee0d0..05e161e3568a11 100644 --- a/be/src/util/jsonb_document.h +++ b/be/src/util/jsonb_document.h @@ -77,7 +77,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep // #include "util/string_parser.hpp" @@ -197,27 +196,33 @@ class JsonbDocument { public: bool operator==(const JsonbDocument& other) const { - LOG(FATAL) << "comparing between JsonbDocument is not supported"; + assert(false); + return false; } bool operator!=(const JsonbDocument& other) const { - LOG(FATAL) << "comparing between JsonbDocument is not supported"; + assert(false); + return false; } bool operator<=(const JsonbDocument& other) const { - LOG(FATAL) << "comparing between JsonbDocument is not supported"; + assert(false); + return false; } bool operator>=(const JsonbDocument& other) const { - LOG(FATAL) << "comparing between JsonbDocument is not supported"; + assert(false); + return false; } bool operator<(const JsonbDocument& other) const { - LOG(FATAL) << "comparing between JsonbDocument is not supported"; + assert(false); + return false; } bool operator>(const JsonbDocument& other) const { - LOG(FATAL) << "comparing between JsonbDocument is not supported"; + assert(false); + return false; } private: diff --git a/be/src/util/key_util.h b/be/src/util/key_util.h index 3648a45c565ad4..0dbaa39710157c 100644 --- a/be/src/util/key_util.h +++ b/be/src/util/key_util.h @@ -48,8 +48,6 @@ constexpr uint8_t KEY_MINIMAL_MARKER = 0x00; constexpr uint8_t KEY_NULL_FIRST_MARKER = 0x01; // Used to represent a normal field, which content is encoded after this marker constexpr uint8_t KEY_NORMAL_MARKER = 0x02; -// Used to represent -constexpr uint8_t KEY_NULL_LAST_MARKER = 0xFE; // Used to represent maximal value for that field constexpr uint8_t KEY_MAXIMAL_MARKER = 0xFF; @@ -61,7 +59,7 @@ constexpr uint8_t KEY_MAXIMAL_MARKER = 0xFF; // If all num_keys are found in row, no marker will be added. // if padding_minimal is false and padding_normal_marker is true, // KEY_NORMAL_MARKER will be added. -template +template void encode_key_with_padding(std::string* buf, const RowType& row, size_t num_keys, bool padding_minimal, bool padding_normal_marker = false) { for (auto cid = 0; cid < num_keys; cid++) { @@ -80,11 +78,7 @@ void encode_key_with_padding(std::string* buf, const RowType& row, size_t num_ke auto cell = row.cell(cid); if (cell.is_null()) { - if (null_first) { - buf->push_back(KEY_NULL_FIRST_MARKER); - } else { - buf->push_back(KEY_NULL_LAST_MARKER); - } + buf->push_back(KEY_NULL_FIRST_MARKER); continue; } buf->push_back(KEY_NORMAL_MARKER); @@ -99,16 +93,12 @@ void encode_key_with_padding(std::string* buf, const RowType& row, size_t num_ke // Encode one row into binary according given num_keys. // Client call this function must assure that row contains the first // num_keys columns. 
-template +template void encode_key(std::string* buf, const RowType& row, size_t num_keys) { for (auto cid = 0; cid < num_keys; cid++) { auto cell = row.cell(cid); if (cell.is_null()) { - if (null_first) { - buf->push_back(KEY_NULL_FIRST_MARKER); - } else { - buf->push_back(KEY_NULL_LAST_MARKER); - } + buf->push_back(KEY_NULL_FIRST_MARKER); continue; } buf->push_back(KEY_NORMAL_MARKER); diff --git a/be/src/util/perf_counters.cpp b/be/src/util/perf_counters.cpp index 74cae81783b149..67a9c013e1dfda 100644 --- a/be/src/util/perf_counters.cpp +++ b/be/src/util/perf_counters.cpp @@ -48,6 +48,9 @@ static std::unordered_map _process_state; int64_t PerfCounters::_vm_rss = 0; std::string PerfCounters::_vm_rss_str = ""; +int64_t PerfCounters::_vm_hwm = 0; +int64_t PerfCounters::_vm_size = 0; +int64_t PerfCounters::_vm_peak = 0; // This is the order of the counters in /proc/self/io enum PERF_IO_IDX { @@ -579,14 +582,18 @@ void PerfCounters::refresh_proc_status() { if (statusinfo.is_open()) statusinfo.close(); + _vm_size = parse_bytes("status/VmSize"); + _vm_peak = parse_bytes("status/VmPeak"); _vm_rss = parse_bytes("status/VmRSS"); _vm_rss_str = PrettyPrinter::print(_vm_rss, TUnit::BYTES); + _vm_hwm = parse_bytes("status/VmHWM"); } void PerfCounters::get_proc_status(ProcStatus* out) { out->vm_size = parse_bytes("status/VmSize"); out->vm_peak = parse_bytes("status/VmPeak"); out->vm_rss = parse_bytes("status/VmRSS"); + out->vm_hwm = parse_bytes("status/VmHWM"); } } // namespace doris diff --git a/be/src/util/perf_counters.h b/be/src/util/perf_counters.h index 9f0936193d4a96..52fe46acb67eda 100644 --- a/be/src/util/perf_counters.h +++ b/be/src/util/perf_counters.h @@ -104,6 +104,7 @@ class PerfCounters { int64_t vm_size = 0; int64_t vm_peak = 0; int64_t vm_rss = 0; + int64_t vm_hwm = 0; }; static int parse_int(const std::string& state_key); @@ -118,6 +119,9 @@ class PerfCounters { // Return the process actual physical memory in bytes. static inline int64_t get_vm_rss() { return _vm_rss; } static inline std::string get_vm_rss_str() { return _vm_rss_str; } + static inline int64_t get_vm_hwm() { return _vm_hwm; } + static inline int64_t get_vm_size() { return _vm_size; } + static inline int64_t get_vm_peak() { return _vm_peak; } private: // Copy constructor and assignment not allowed @@ -164,6 +168,9 @@ class PerfCounters { static int64_t _vm_rss; static std::string _vm_rss_str; + static int64_t _vm_hwm; + static int64_t _vm_size; + static int64_t _vm_peak; }; } // namespace doris diff --git a/be/src/util/perf_counters_mac.cpp b/be/src/util/perf_counters_mac.cpp index 99bc76e033f06f..31b5ef682c8484 100644 --- a/be/src/util/perf_counters_mac.cpp +++ b/be/src/util/perf_counters_mac.cpp @@ -21,6 +21,9 @@ namespace doris { int64_t PerfCounters::_vm_rss = 0; std::string PerfCounters::_vm_rss_str = ""; +int64_t PerfCounters::_vm_hwm = 0; +int64_t PerfCounters::_vm_peak = 0; +int64_t PerfCounters::_vm_size = 0; void PerfCounters::refresh_proc_status() {} diff --git a/be/src/util/proto_util.h b/be/src/util/proto_util.h index 2b8b524da543b2..26779977642288 100644 --- a/be/src/util/proto_util.h +++ b/be/src/util/proto_util.h @@ -38,9 +38,10 @@ constexpr size_t MIN_HTTP_BRPC_SIZE = (1ULL << 31); // Embed column_values and brpc request serialization string in controller attachment. 
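The key_util.h hunks above drop the null_first template flag and the KEY_NULL_LAST_MARKER (0xFE): NULL cells now always encode as KEY_NULL_FIRST_MARKER (0x01), which memcmp-sorts below any KEY_NORMAL_MARKER (0x02) prefixed value and below KEY_MAXIMAL_MARKER (0xFF) padding. A standalone sketch of that ordering property; the encode helper is an illustrative stand-in, not the actual encode_key shown in this patch:

    #include <cstdint>
    #include <iostream>
    #include <string>

    constexpr uint8_t KEY_NULL_FIRST_MARKER = 0x01;
    constexpr uint8_t KEY_NORMAL_MARKER = 0x02;
    constexpr uint8_t KEY_MAXIMAL_MARKER = 0xFF;

    // Illustrative stand-in for encoding one nullable uint32 key cell.
    std::string encode_cell(bool is_null, uint32_t v) {
        std::string buf;
        if (is_null) {
            buf.push_back(static_cast<char>(KEY_NULL_FIRST_MARKER));
            return buf; // nothing follows a NULL marker
        }
        buf.push_back(static_cast<char>(KEY_NORMAL_MARKER));
        for (int shift = 24; shift >= 0; shift -= 8) {
            // big-endian bytes make memcmp order match numeric order
            buf.push_back(static_cast<char>((v >> shift) & 0xFF));
        }
        return buf;
    }

    int main() {
        std::string null_key = encode_cell(true, 0);
        std::string zero_key = encode_cell(false, 0);
        std::string max_pad(1, static_cast<char>(KEY_MAXIMAL_MARKER));
        // NULL < any normal value < maximal padding, purely by byte order.
        std::cout << (null_key < zero_key) << (zero_key < max_pad) << "\n"; // prints 11
    }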
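The perf_counters hunks above start caching VmSize, VmPeak, and VmHWM next to the existing VmRSS; all four come from /proc/self/status, read through parse_bytes. The diff does not show parse_bytes itself, so the following is only an assumed equivalent of that helper (Linux-only; the kernel reports these fields in kB):

    #include <cstdint>
    #include <fstream>
    #include <sstream>
    #include <string>

    // Assumed equivalent of parse_bytes("status/VmHWM") and friends: scan
    // /proc/self/status for a "Key:" line and convert the kB value to bytes.
    int64_t read_proc_status_bytes(const std::string& key) {
        std::ifstream status("/proc/self/status");
        const std::string prefix = key + ":";
        std::string line;
        while (std::getline(status, line)) {
            if (line.compare(0, prefix.size(), prefix) == 0) {
                std::istringstream iss(line.substr(prefix.size()));
                int64_t kb = 0;
                iss >> kb;        // e.g. "VmHWM:     123456 kB"
                return kb * 1024; // normalize to bytes
            }
        }
        return -1; // field not present
    }
    // usage: int64_t hwm = read_proc_status_bytes("VmHWM"); // peak resident set size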
template -Status request_embed_attachment_contain_block(Params* brpc_request, Closure* closure) { +Status request_embed_attachment_contain_blockv2(Params* brpc_request, + std::unique_ptr& closure) { auto block = brpc_request->block(); - Status st = request_embed_attachment(brpc_request, block.column_values(), closure); + Status st = request_embed_attachmentv2(brpc_request, block.column_values(), closure); block.set_column_values(""); return st; } @@ -59,32 +60,37 @@ inline bool enable_http_send_block(const PTransmitDataParams& request) { } template -void transmit_block(PBackendService_Stub& stub, Closure* closure, - const PTransmitDataParams& params) { - closure->cntl.http_request().Clear(); - stub.transmit_block(&closure->cntl, ¶ms, &closure->result, closure); +void transmit_blockv2(PBackendService_Stub& stub, std::unique_ptr closure) { + closure->cntl_->http_request().Clear(); + stub.transmit_block(closure->cntl_.get(), closure->request_.get(), closure->response_.get(), + closure.get()); + closure.release(); } template -Status transmit_block_http(ExecEnv* exec_env, Closure* closure, PTransmitDataParams& params, - TNetworkAddress brpc_dest_addr) { - RETURN_IF_ERROR(request_embed_attachment_contain_block(¶ms, closure)); +Status transmit_block_httpv2(ExecEnv* exec_env, std::unique_ptr closure, + TNetworkAddress brpc_dest_addr) { + RETURN_IF_ERROR(request_embed_attachment_contain_blockv2(closure->request_.get(), closure)); //format an ipv6 address std::string brpc_url = get_brpc_http_url(brpc_dest_addr.hostname, brpc_dest_addr.port); std::shared_ptr brpc_http_stub = exec_env->brpc_internal_client_cache()->get_new_client_no_cache(brpc_url, "http"); - closure->cntl.http_request().uri() = brpc_url + "/PInternalServiceImpl/transmit_block_by_http"; - closure->cntl.http_request().set_method(brpc::HTTP_METHOD_POST); - closure->cntl.http_request().set_content_type("application/json"); - brpc_http_stub->transmit_block_by_http(&closure->cntl, nullptr, &closure->result, closure); + closure->cntl_->http_request().uri() = + brpc_url + "/PInternalServiceImpl/transmit_block_by_http"; + closure->cntl_->http_request().set_method(brpc::HTTP_METHOD_POST); + closure->cntl_->http_request().set_content_type("application/json"); + brpc_http_stub->transmit_block_by_http(closure->cntl_.get(), nullptr, closure->response_.get(), + closure.get()); + closure.release(); return Status::OK(); } template -Status request_embed_attachment(Params* brpc_request, const std::string& data, Closure* closure) { +Status request_embed_attachmentv2(Params* brpc_request, const std::string& data, + std::unique_ptr& closure) { butil::IOBuf attachment; // step1: serialize brpc_request to string, and append to attachment. @@ -106,7 +112,7 @@ Status request_embed_attachment(Params* brpc_request, const std::string& data, C data_size); } // step3: attachment add to closure. - closure->cntl.request_attachment().swap(attachment); + closure->cntl_->request_attachment().swap(attachment); return Status::OK(); } diff --git a/be/src/util/radix_sort.h b/be/src/util/radix_sort.h index 463af1c772a18c..1251f6edf1432d 100644 --- a/be/src/util/radix_sort.h +++ b/be/src/util/radix_sort.h @@ -37,7 +37,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep namespace doris { @@ -248,13 +247,15 @@ struct RadixSort { /// Transform the array and calculate the histogram. /// NOTE This is slightly suboptimal. 
Look at https://github.com/powturbo/TurboHist for (size_t i = 0; i < size; ++i) { - if (!Traits::Transform::transform_is_simple) + if (!Traits::Transform::transform_is_simple) { Traits::extractKey(arr[i]) = bitsToKey( Traits::Transform::forward(keyToBits(Traits::extractKey(arr[i])))); + } - for (size_t pass = 0; pass < NUM_PASSES; ++pass) + for (size_t pass = 0; pass < NUM_PASSES; ++pass) { ++histograms[pass * HISTOGRAM_SIZE + getPart(pass, keyToBits(Traits::extractKey(arr[i])))]; + } } { @@ -297,12 +298,4 @@ struct RadixSort { } }; -/// Helper functions for numeric types. -/// Use RadixSort with custom traits for complex types instead. - -template -void radixSortLSD(T* arr, size_t size) { - RadixSort>::executeLSD(arr, size); -} - } // namespace doris diff --git a/be/src/util/ref_count_closure.h b/be/src/util/ref_count_closure.h index d2fbd2fd14e863..14a136fcfd0c6e 100644 --- a/be/src/util/ref_count_closure.h +++ b/be/src/util/ref_count_closure.h @@ -21,37 +21,91 @@ #include -#include "runtime/exec_env.h" #include "runtime/thread_context.h" #include "service/brpc.h" namespace doris { -template -class RefCountClosure : public google::protobuf::Closure { +template +class DummyBrpcCallback { + ENABLE_FACTORY_CREATOR(DummyBrpcCallback); + public: - RefCountClosure() : _refs(0) {} - ~RefCountClosure() override = default; + using ResponseType = Response; + DummyBrpcCallback() { + cntl_ = std::make_shared(); + response_ = std::make_shared(); + } + + virtual ~DummyBrpcCallback() = default; + + virtual void call() {} + + virtual void join() { brpc::Join(cntl_->call_id()); } + + // controller has to be the same lifecycle with the closure, because brpc may use + // it in any stage of the rpc. + std::shared_ptr cntl_; + // We do not know if brpc will use request or response after brpc method returns. + // So that we need keep a shared ptr here to ensure that brpc could use req/rep + // at any stage. + std::shared_ptr response_; +}; - void ref() { _refs.fetch_add(1); } +// The closure will be deleted after callback. +// It could only be created by using shared ptr or unique ptr. +// It will hold a weak ptr of T and call run of T +// Callback() { +// xxxx; +// public +// void run() { +// logxxx +// } +// } +// +// std::shared_ptr b; +// +// std::unique_ptr a(b); +// brpc_call(a.release()); - // If unref() returns true, this object should be delete - bool unref() { return _refs.fetch_sub(1) == 1; } +template +class AutoReleaseClosure : public google::protobuf::Closure { + using Weak = typename std::shared_ptr::weak_type; + using ResponseType = typename Callback::ResponseType; + ENABLE_FACTORY_CREATOR(AutoReleaseClosure); +public: + AutoReleaseClosure(std::shared_ptr req, std::shared_ptr callback) + : request_(req), callback_(callback) { + this->cntl_ = callback->cntl_; + this->response_ = callback->response_; + } + + ~AutoReleaseClosure() override = default; + + // Will delete itself void Run() override { - SCOPED_TRACK_MEMORY_TO_UNKNOWN(); - if (unref()) { - delete this; + Defer defer {[&]() { delete this; }}; + // If lock failed, it means the callback object is deconstructed, then no need + // to deal with the callback any more. + if (auto tmp = callback_.lock()) { + tmp->call(); } } - void join() { brpc::Join(cntl.call_id()); } - - brpc::Controller cntl; - T result; + // controller has to be the same lifecycle with the closure, because brpc may use + // it in any stage of the rpc. + std::shared_ptr cntl_; + // We do not know if brpc will use request or response after brpc method returns. 
+ // So that we need keep a shared ptr here to ensure that brpc could use req/rep + // at any stage. + std::shared_ptr request_; + std::shared_ptr response_; private: - std::atomic _refs; + // Use a weak ptr to keep the callback, so that the callback can be deleted if the main + // thread is freed. + Weak callback_; }; } // namespace doris diff --git a/be/src/util/runtime_profile.cpp b/be/src/util/runtime_profile.cpp index 1eb3a6b1393638..345035a5c99684 100644 --- a/be/src/util/runtime_profile.cpp +++ b/be/src/util/runtime_profile.cpp @@ -21,9 +21,6 @@ #include "util/runtime_profile.h" #include -#include -#include -#include #include #include #include @@ -553,76 +550,6 @@ void RuntimeProfile::pretty_print(std::ostream* s, const std::string& prefix) co } } -void RuntimeProfile::add_to_span(OpentelemetrySpan span) { - if (!span || !span->IsRecording() || _added_to_span) { - return; - } - _added_to_span = true; - - CounterMap counter_map; - ChildCounterMap child_counter_map; - { - std::lock_guard l(_counter_map_lock); - counter_map = _counter_map; - child_counter_map = _child_counter_map; - } - - auto total_time = counter_map.find("TotalTime"); - DCHECK(total_time != counter_map.end()); - - // profile name like "VDataBufferSender (dst_fragment_instance_id=-2608c96868f3b77d--713968f450bfbe0d):" - // to "VDataBufferSender" - auto i = _name.find_first_of("(: "); - auto short_name = _name.substr(0, i); - span->SetAttribute(short_name + SPAN_ATTRIBUTE_KEY_SEPARATOR + "TotalTime", - print_counter(total_time->second)); - - { - std::lock_guard l(_info_strings_lock); - for (const std::string& key : _info_strings_display_order) { - // nlohmann json will core dump when serializing 'KeyRanges', here temporarily skip it. - if (key.compare("KeyRanges") == 0) { - continue; - } - span->SetAttribute(short_name + SPAN_ATTRIBUTE_KEY_SEPARATOR + key, - _info_strings.find(key)->second); - } - } - - RuntimeProfile::add_child_counters_to_span(span, short_name, ROOT_COUNTER, counter_map, - child_counter_map); - - ChildVector children; - { - std::lock_guard l(_children_lock); - children = _children; - } - - for (auto& [profile, flag] : children) { - profile->add_to_span(span); - } -} - -void RuntimeProfile::add_child_counters_to_span(OpentelemetrySpan span, - const std::string& profile_name, - const std::string& counter_name, - const CounterMap& counter_map, - const ChildCounterMap& child_counter_map) { - ChildCounterMap::const_iterator itr = child_counter_map.find(counter_name); - - if (itr != child_counter_map.end()) { - const std::set& child_counters = itr->second; - for (const std::string& child_counter : child_counters) { - CounterMap::const_iterator iter = counter_map.find(child_counter); - DCHECK(iter != counter_map.end()); - span->SetAttribute(profile_name + SPAN_ATTRIBUTE_KEY_SEPARATOR + iter->first, - print_counter(iter->second)); - RuntimeProfile::add_child_counters_to_span(span, profile_name, child_counter, - counter_map, child_counter_map); - } - } -} - void RuntimeProfile::to_thrift(TRuntimeProfileTree* tree) { tree->nodes.clear(); to_thrift(&tree->nodes); diff --git a/be/src/util/runtime_profile.h b/be/src/util/runtime_profile.h index e4ddee2fe44140..fed0debbbd332b 100644 --- a/be/src/util/runtime_profile.h +++ b/be/src/util/runtime_profile.h @@ -37,12 +37,10 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "util/binary_cast.hpp" #include "util/pretty_printer.h" #include "util/stopwatch.hpp" -#include "util/telemetry/telemetry.h" namespace 
doris { class TRuntimeProfileNode; @@ -339,8 +337,6 @@ class RuntimeProfile { // Does not hold locks when it makes any function calls. void pretty_print(std::ostream* s, const std::string& prefix = "") const; - void add_to_span(OpentelemetrySpan span); - // Serializes profile to thrift. // Does not hold locks when it makes any function calls. void to_thrift(TRuntimeProfileTree* tree); @@ -498,8 +494,6 @@ class RuntimeProfile { // of the total time in the entire profile tree. double _local_time_percent; - bool _added_to_span {false}; - enum PeriodicCounterType { RATE_COUNTER = 0, SAMPLING_COUNTER, @@ -538,11 +532,6 @@ class RuntimeProfile { const CounterMap& counter_map, const ChildCounterMap& child_counter_map, std::ostream* s); - static void add_child_counters_to_span(OpentelemetrySpan span, const std::string& profile_name, - const std::string& counter_name, - const CounterMap& counter_map, - const ChildCounterMap& child_counter_map); - static std::string print_counter(Counter* counter) { return PrettyPrinter::print(counter->value(), counter->type()); } diff --git a/be/src/util/s3_util.cpp b/be/src/util/s3_util.cpp index 07a6a72768b675..442bb7b205ae9d 100644 --- a/be/src/util/s3_util.cpp +++ b/be/src/util/s3_util.cpp @@ -34,7 +34,9 @@ #include "common/config.h" #include "common/logging.h" +#include "runtime/exec_env.h" #include "s3_uri.h" +#include "vec/exec/scan/scanner_scheduler.h" namespace doris { @@ -152,6 +154,9 @@ std::shared_ptr S3ClientFactory::create(const S3Conf& s3_conf Aws::Auth::AWSCredentials aws_cred(s3_conf.ak, s3_conf.sk); DCHECK(!aws_cred.IsExpiredOrEmpty()); + if (!s3_conf.token.empty()) { + aws_cred.SetSessionToken(s3_conf.token); + } Aws::Client::ClientConfiguration aws_config = S3ClientFactory::getClientConfiguration(); aws_config.endpointOverride = s3_conf.endpoint; @@ -159,7 +164,14 @@ std::shared_ptr S3ClientFactory::create(const S3Conf& s3_conf if (s3_conf.max_connections > 0) { aws_config.maxConnections = s3_conf.max_connections; } else { - aws_config.maxConnections = config::doris_remote_scanner_thread_pool_thread_num; +#ifdef BE_TEST + // the S3Client may be shared by many threads, + // so the number of connections needs to be set large enough.
+ aws_config.maxConnections = config::doris_scanner_thread_pool_thread_num; +#else + aws_config.maxConnections = + ExecEnv::GetInstance()->scanner_scheduler()->remote_thread_pool_max_size(); +#endif } if (s3_conf.request_timeout_ms > 0) { @@ -189,6 +201,9 @@ Status S3ClientFactory::convert_properties_to_s3_conf( StringCaseMap properties(prop.begin(), prop.end()); s3_conf->ak = properties.find(S3_AK)->second; s3_conf->sk = properties.find(S3_SK)->second; + if (properties.find(S3_TOKEN) != properties.end()) { + s3_conf->token = properties.find(S3_TOKEN)->second; + } s3_conf->endpoint = properties.find(S3_ENDPOINT)->second; s3_conf->region = properties.find(S3_REGION)->second; diff --git a/be/src/util/s3_util.h b/be/src/util/s3_util.h index 247c6e5ef28475..82f5cff8ff9c55 100644 --- a/be/src/util/s3_util.h +++ b/be/src/util/s3_util.h @@ -69,6 +69,7 @@ const static std::string S3_CONN_TIMEOUT_MS = "AWS_CONNECTION_TIMEOUT_MS"; struct S3Conf { std::string ak; std::string sk; + std::string token; std::string endpoint; std::string region; std::string bucket; @@ -80,9 +81,10 @@ struct S3Conf { std::string to_string() const { return fmt::format( - "(ak={}, sk=*, endpoint={}, region={}, bucket={}, prefix={}, max_connections={}, " - "request_timeout_ms={}, connect_timeout_ms={}, use_virtual_addressing={})", - ak, endpoint, region, bucket, prefix, max_connections, request_timeout_ms, + "(ak={}, sk=*, token={}, endpoint={}, region={}, bucket={}, prefix={}, " + "max_connections={}, request_timeout_ms={}, connect_timeout_ms={}, " + "use_virtual_addressing={})", + ak, token, endpoint, region, bucket, prefix, max_connections, request_timeout_ms, connect_timeout_ms, use_virtual_addressing); } @@ -90,6 +92,7 @@ struct S3Conf { uint64_t hash_code = 0; hash_code += Fingerprint(ak); hash_code += Fingerprint(sk); + hash_code += Fingerprint(token); hash_code += Fingerprint(endpoint); hash_code += Fingerprint(region); hash_code += Fingerprint(bucket); diff --git a/be/src/util/sort_heap.h b/be/src/util/sort_heap.h index f3f6e546b703dd..b6df8f395e29f8 100644 --- a/be/src/util/sort_heap.h +++ b/be/src/util/sort_heap.h @@ -22,7 +22,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep namespace doris { diff --git a/be/src/util/string_parser.hpp b/be/src/util/string_parser.hpp index 83289e73adeed2..623fc23a414e68 100644 --- a/be/src/util/string_parser.hpp +++ b/be/src/util/string_parser.hpp @@ -35,7 +35,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "runtime/large_int_value.h" @@ -93,7 +92,7 @@ class StringParser { template static T get_scale_multiplier(int scale) { static_assert(std::is_same_v || std::is_same_v || - std::is_same_v || std::is_same_v, + std::is_same_v || std::is_same_v, "You can only instantiate as int32_t, int64_t, __int128."); if constexpr (std::is_same_v) { return common::exp10_i32(scale); @@ -101,7 +100,7 @@ class StringParser { return common::exp10_i64(scale); } else if constexpr (std::is_same_v) { return common::exp10_i128(scale); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { return common::exp10_i256(scale); } } @@ -580,6 +579,10 @@ inline int StringParser::StringParseTraits::max_ascii_len() { template T StringParser::string_to_decimal(const char* s, int len, int type_precision, int type_scale, ParseResult* result) { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v, + "Cast string 
to decimal only support target type int32_t, int64_t, __int128 or " + "wide::Int256."); // Special cases: // 1) '' == Fail, an empty string fails to parse. // 2) ' # ' == #, leading and trailing white space is ignored. @@ -608,11 +611,9 @@ T StringParser::string_to_decimal(const char* s, int len, int type_precision, in } // Ignore leading zeros. - bool leading_zero = false; bool found_value = false; while (len > 0 && UNLIKELY(*s == '0')) { found_value = true; - leading_zero = true; ++s; --len; } @@ -636,6 +637,7 @@ T StringParser::string_to_decimal(const char* s, int len, int type_precision, in int precision = 0; int max_digit = type_precision - type_scale; + int cur_digit = 0; bool found_exponent = false; int8_t exponent = 0; T value = 0; @@ -685,31 +687,37 @@ T StringParser::string_to_decimal(const char* s, int len, int type_precision, in } } else { // decimalv3 + bool has_round = false; for (int i = 0; i < len; ++i) { const char& c = s[i]; - // keep a rounding precision to round the decimal value - if (LIKELY('0' <= c && c <= '9') && - ((!leading_zero && LIKELY(type_precision >= precision)) || - (leading_zero && type_precision > precision))) { + if (LIKELY('0' <= c && c <= '9')) { found_value = true; // Ignore digits once the type's precision limit is reached. This avoids // overflowing the underlying storage while handling a string like // 10000000000e-10 into a DECIMAL(1, 0). Adjustments for ignored digits and // an exponent will be made later. - ++precision; - scale += found_dot; - // decimalv3 should make sure the type_scale and type_precision - if (!found_dot && max_digit < (precision - scale)) { - // parse_overflow should only happen when the digit part reached the max + if (LIKELY(type_precision > precision) && !has_round) { + value = (value * 10) + (c - '0'); // Benchmarks are faster with parenthesis... + ++precision; + scale += found_dot; + cur_digit = precision - scale; + } else if (!found_dot && max_digit < (precision - scale)) { *result = StringParser::PARSE_OVERFLOW; value = is_negative ? vectorized::min_decimal_value>( type_precision) : vectorized::max_decimal_value>( type_precision); return value; + } else if (found_dot && scale >= type_scale && !has_round) { + // make rounding cases + if (c > '4') { + value += 1; + } + has_round = true; + continue; + } else if (!found_dot) { + ++cur_digit; } - // keep a rounding precision to round the decimal value - value = (value * 10) + (c - '0'); // Benchmarks are faster with parenthesis... DCHECK(value >= 0); // For some reason //DCHECK_GE doesn't work with __int128. } else if (c == '.' && LIKELY(!found_dot)) { found_dot = 1; @@ -724,7 +732,6 @@ T StringParser::string_to_decimal(const char* s, int len, int type_precision, in } break; } else { - // jump to here: should handle the wrong character of decimal if (value == 0) { *result = StringParser::PARSE_FAILURE; return 0; @@ -737,9 +744,20 @@ T StringParser::string_to_decimal(const char* s, int len, int type_precision, in // the E/e character because we make right user-given type_precision // not max number type_precision if (!is_numeric_ascii(c)) { + if (cur_digit > type_precision) { + *result = StringParser::PARSE_OVERFLOW; + value = is_negative + ? vectorized::min_decimal_value>( + type_precision) + : vectorized::max_decimal_value>( + type_precision); + return value; + } return is_negative ? T(-value) : T(value); } } + + return is_negative ? 
T(-value) : T(value); } } } @@ -779,17 +797,14 @@ T StringParser::string_to_decimal(const char* s, int len, int type_precision, in } else if (UNLIKELY(scale > type_scale)) { *result = StringParser::PARSE_UNDERFLOW; int shift = scale - type_scale; - if (shift > 0) { - T divisor = get_scale_multiplier(shift); - if (LIKELY(divisor > 0)) { - T remainder = value % divisor; - value /= divisor; - if ((remainder > 0 ? T(remainder) : T(-remainder)) >= (divisor >> 1)) { - value += 1; - } - } else { - DCHECK(divisor == -1 || divisor == 0); // //DCHECK_EQ doesn't work with __int128. - value = 0; + T divisor = get_scale_multiplier(shift); + if (UNLIKELY(divisor == std::numeric_limits::max())) { + value = 0; + } else { + T remainder = value % divisor; + value /= divisor; + if ((remainder > 0 ? T(remainder) : T(-remainder)) >= (divisor >> 1)) { + value += 1; } } DCHECK(value >= 0); // //DCHECK_GE doesn't work with __int128. diff --git a/be/src/util/telemetry/brpc_carrier.h b/be/src/util/telemetry/brpc_carrier.h deleted file mode 100644 index 0e6b4d2d478c95..00000000000000 --- a/be/src/util/telemetry/brpc_carrier.h +++ /dev/null @@ -1,80 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
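In the scale-overflow branch of string_to_decimal above, the shift > 0 guard is replaced by an explicit sentinel check on get_scale_multiplier, while the rounding rule itself is unchanged: divide by 10^shift and add one when the discarded remainder is at least half the divisor. Since the parser keeps value non-negative until the sign is applied at the end (note the DCHECK(value >= 0)), a plain remainder comparison suffices. A self-contained sketch of that step on int64_t, with the overflow sentinel omitted for brevity:

    #include <cstdint>
    #include <iostream>

    // Drop `shift` excess fraction digits from a non-negative scaled integer,
    // rounding half away from zero, mirroring the hunk above.
    int64_t round_excess_scale(int64_t value, int shift) {
        int64_t divisor = 1;
        for (int i = 0; i < shift; ++i) divisor *= 10; // 10^shift, assumed in range
        int64_t remainder = value % divisor;
        value /= divisor;
        if (remainder >= (divisor >> 1)) { // discarded at least half a unit
            value += 1;
        }
        return value;
    }

    int main() {
        // "1.2345" parsed with scale 4, then stored as DECIMAL(x, 2):
        std::cout << round_excess_scale(12345, 2) << "\n"; // 123, i.e. 1.23
        std::cout << round_excess_scale(12355, 2) << "\n"; // 124, i.e. 1.24 (rounded up)
    }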
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "opentelemetry/context/propagation/global_propagator.h" -#include "opentelemetry/context/propagation/text_map_propagator.h" -#include "opentelemetry/context/runtime_context.h" -#include "opentelemetry/trace/context.h" -#include "opentelemetry/trace/span_metadata.h" -#include "opentelemetry/trace/span_startoptions.h" -#include "util/telemetry/telemetry.h" - -namespace google { -namespace protobuf { -class RpcController; -} // namespace protobuf -} // namespace google - -namespace doris::telemetry { - -class RpcServerCarrier : public opentelemetry::context::propagation::TextMapCarrier { -public: - explicit RpcServerCarrier(const brpc::Controller* cntl) : cntl_(cntl) {} - - RpcServerCarrier() = default; - - opentelemetry::nostd::string_view Get( - opentelemetry::nostd::string_view key) const noexcept override; - - void Set(opentelemetry::nostd::string_view key, - opentelemetry::nostd::string_view value) noexcept override; - -private: - const brpc::Controller* cntl_ {}; -}; - -inline OpentelemetrySpan start_rpc_server_span(std::string span_name, - google::protobuf::RpcController* cntl_base) { - RpcServerCarrier carrier(static_cast(cntl_base)); - auto current_ctx = opentelemetry::context::RuntimeContext::GetCurrent(); - auto prop = opentelemetry::context::propagation::GlobalTextMapPropagator::GetGlobalPropagator(); - auto parent_context = prop->Extract(carrier, current_ctx); - - if (opentelemetry::trace::GetSpan(parent_context)->GetContext().IsValid()) { - opentelemetry::trace::StartSpanOptions options; - options.kind = opentelemetry::trace::SpanKind::kServer; - options.parent = parent_context; - return telemetry::get_tracer("tracer")->StartSpan(std::move(span_name), options); - } else { - return telemetry::get_noop_tracer()->StartSpan(""); - } -} -} // namespace doris::telemetry diff --git a/be/src/util/telemetry/open_telemetry_scop_wrapper.hpp b/be/src/util/telemetry/open_telemetry_scop_wrapper.hpp deleted file mode 100644 index 2f1d8eb7b2a373..00000000000000 --- a/be/src/util/telemetry/open_telemetry_scop_wrapper.hpp +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
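Stepping back to the ref_count_closure.h rewrite earlier in this patch: AutoReleaseClosure replaces manual ref counting with a closure that deletes itself after Run() and holds only a weak_ptr to its callback, so a caller that has already been destroyed simply loses the RPC result instead of crashing. A minimal model of that lifetime contract, with simplified names and no brpc types:

    #include <iostream>
    #include <memory>

    struct Callback {
        void call() { std::cout << "callback ran\n"; }
    };

    // Simplified model of AutoReleaseClosure: runs once, then frees itself.
    class SelfDeletingClosure {
    public:
        explicit SelfDeletingClosure(std::weak_ptr<Callback> cb) : cb_(std::move(cb)) {}
        void Run() { // brpc would invoke the closure when the RPC finishes
            if (auto cb = cb_.lock()) {
                cb->call(); // owner still alive: deliver the result
            } // else: owner already destroyed, silently drop it
            delete this; // the closure always reclaims itself
        }
    private:
        std::weak_ptr<Callback> cb_;
    };

    int main() {
        auto owner = std::make_shared<Callback>();
        (new SelfDeletingClosure(owner))->Run(); // prints "callback ran"
        owner.reset();
        (new SelfDeletingClosure(std::weak_ptr<Callback>()))->Run(); // no output, no leak
    }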
- -#pragma once - -// IWYU pragma: no_include -#include "common/compiler_util.h" // IWYU pragma: keep -#include "opentelemetry/trace/provider.h" - -namespace doris { - -using OpentelemetryTracer = opentelemetry::nostd::shared_ptr; -using OpentelemetryScope = opentelemetry::trace::Scope; -using OpentelemetrySpan = opentelemetry::nostd::shared_ptr; - -class OpenTelemetryScopeWrapper { -public: - OpenTelemetryScopeWrapper(bool enable, OpentelemetryTracer tracer, const std::string& name) { - if (enable) { - auto span = tracer->StartSpan(name); - _scope.reset(new OpentelemetryScope(span)); - } - } - - OpenTelemetryScopeWrapper(bool enable, OpentelemetryTracer tracer, OpentelemetrySpan span, - const std::string& name) { - if (enable) { - if (UNLIKELY(!span)) { - span = tracer->StartSpan(name); - } - _scope.reset(new OpentelemetryScope(span)); - } - } - -private: - std::unique_ptr _scope; -}; - -} // namespace doris diff --git a/be/src/util/telemetry/telemetry.cpp b/be/src/util/telemetry/telemetry.cpp deleted file mode 100644 index 01351129228b0d..00000000000000 --- a/be/src/util/telemetry/telemetry.cpp +++ /dev/null @@ -1,119 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "telemetry.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -// IWYU pragma: no_include -#include // IWYU pragma: keep -#include -#include -#include - -#include "common/config.h" -#include "common/logging.h" -#include "opentelemetry/context/propagation/global_propagator.h" -#include "opentelemetry/context/propagation/text_map_propagator.h" -#include "opentelemetry/exporters/otlp/otlp_http_exporter.h" -#include "opentelemetry/exporters/zipkin/zipkin_exporter.h" -#include "opentelemetry/nostd/shared_ptr.h" -#include "opentelemetry/sdk/common/attribute_utils.h" -#include "opentelemetry/sdk/common/global_log_handler.h" -#include "opentelemetry/sdk/trace/batch_span_processor.h" -#include "opentelemetry/sdk/trace/tracer_provider.h" -#include "opentelemetry/trace/propagation/http_trace_context.h" -#include "service/backend_options.h" - -namespace trace = opentelemetry::trace; -namespace nostd = opentelemetry::nostd; -namespace trace_sdk = opentelemetry::sdk::trace; -namespace zipkin = opentelemetry::exporter::zipkin; -namespace resource = opentelemetry::sdk::resource; -namespace propagation = opentelemetry::context::propagation; -namespace otlp = opentelemetry::exporter::otlp; -namespace internal_log = opentelemetry::sdk::common::internal_log; - -class OpenTelemetryLogHandler : public internal_log::LogHandler { -public: - void Handle(internal_log::LogLevel level, const char* file, int line, const char* msg, - const opentelemetry::sdk::common::AttributeMap& attributes) noexcept override { - if ((level == internal_log::LogLevel::Error || level == internal_log::LogLevel::Warning) && - file != nullptr && msg != nullptr) { - LOG(WARNING) << fmt::format("OpenTelemetry File: {}:{} {}", file, line, msg); - } - } -}; - -void doris::telemetry::init_tracer() { - if (!doris::config::enable_tracing) { - return; - } - - std::unique_ptr exporter = nullptr; - std::string trace_exporter = config::trace_exporter; - boost::to_lower(trace_exporter); - if (trace_exporter.compare("collector") == 0) { - otlp::OtlpHttpExporterOptions opts {}; - opts.url = doris::config::trace_export_url; - exporter = std::make_unique(opts); - } else if (trace_exporter.compare("zipkin") == 0) { - // ZipkinExporter converts span to zipkin's format and exports span to zipkin. - zipkin::ZipkinExporterOptions opts {}; - opts.endpoint = doris::config::trace_export_url; - exporter = std::make_unique(opts); - } else { - LOG(FATAL) << "unknown value " << trace_exporter << " of trace_exporter in be.conf"; - } - - // BatchSpanProcessor exports span by batch. 
- trace_sdk::BatchSpanProcessorOptions batchOptions; - batchOptions.schedule_delay_millis = - std::chrono::milliseconds(doris::config::export_span_schedule_delay_millis); - batchOptions.max_queue_size = doris::config::max_span_queue_size; - batchOptions.max_export_batch_size = doris::config::max_span_export_batch_size; - auto processor = std::unique_ptr( - new trace_sdk::BatchSpanProcessor(std::move(exporter), batchOptions)); - - std::string service_name = "BACKEND:" + BackendOptions::get_localhost(); - resource::ResourceAttributes attributes = {{"service.name", service_name}}; - auto resource = resource::Resource::Create(attributes); - - auto provider = nostd::shared_ptr( - new trace_sdk::TracerProvider(std::move(processor), resource)); - // Set the global trace provider - trace::Provider::SetTracerProvider(std::move(provider)); - - // Specifies the format for parsing trace and span messages propagated via http or gpc. - propagation::GlobalTextMapPropagator::SetGlobalPropagator( - nostd::shared_ptr( - new opentelemetry::trace::propagation::HttpTraceContext())); - - // Output OpenTelemetry logs by glog - internal_log::GlobalLogHandler::SetLogHandler( - nostd::shared_ptr(new OpenTelemetryLogHandler())); -} diff --git a/be/src/util/telemetry/telemetry.h b/be/src/util/telemetry/telemetry.h deleted file mode 100644 index dc71f3cfe05053..00000000000000 --- a/be/src/util/telemetry/telemetry.h +++ /dev/null @@ -1,87 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include - -// IWYU pragma: no_include -#include "common/compiler_util.h" // IWYU pragma: keep -#include "opentelemetry/trace/provider.h" - -/// A trace represents the execution process of a single request in the system, span represents a -/// logical operation unit with start time and execution duration in the system, and multiple spans -/// form a trace. -namespace doris { - -using OpentelemetryTracer = opentelemetry::nostd::shared_ptr; -using OpentelemetrySpan = opentelemetry::nostd::shared_ptr; -using OpentelemetryScope = opentelemetry::trace::Scope; - -/// Start a span with the specified tracer, name, and variable name, and create a Scope for this -/// span. -/// -/// span represents the execution time of an operation unit, the time of its construction is the -/// start time, the time of its destructuring is the end time, you can also call the End method to -/// let span terminate, like "span->End();" -/// -/// We can add Event and Attribute to span. Event will record the timestamp and event content, -/// and Attribute can record key-value pairs. 
-/// For example: -/// span->AddEvent("event content"); -/// span->SetAttribute("key", "value"); -/// -/// Scope will add the span to a thread-local stack during construction, and remove the span from -/// the stack during destructuring. When starting a span, the top-of-stack span will be the parent -/// span by default, and the top-of-stack span can be obtained via the -/// opentelemetry::trace::Tracer::GetCurrentSpan() method. -#define START_AND_SCOPE_SPAN(tracer, span, name) \ - auto span = tracer->StartSpan(name); \ - OpentelemetryScope scope {span}; - -namespace telemetry { - -void init_tracer(); - -/// Return NoopTracer, the span generated by NoopTracer is NoopSpan, and the method bodies of -/// NoopTracer and NoopSpan are empty. -inline OpentelemetryTracer& get_noop_tracer() { - static OpentelemetryTracer noop_tracer = - opentelemetry::nostd::shared_ptr( - new opentelemetry::trace::NoopTracer); - return noop_tracer; -} - -inline OpentelemetryTracer get_tracer(const std::string& tracer_name) { - return opentelemetry::trace::Provider::GetTracerProvider()->GetTracer(tracer_name); -} - -/// Returns true if the active pan stack is not empty. -inline bool is_current_span_valid() { - return opentelemetry::trace::Tracer::GetCurrentSpan()->GetContext().IsValid(); -} - -} // namespace telemetry -} // namespace doris diff --git a/be/src/util/thread.cpp b/be/src/util/thread.cpp index cf7bd63a878b61..e6d5d10e7370e5 100644 --- a/be/src/util/thread.cpp +++ b/be/src/util/thread.cpp @@ -59,6 +59,7 @@ #include "gutil/stringprintf.h" #include "gutil/strings/substitute.h" #include "http/web_page_handler.h" +#include "runtime/thread_context.h" #include "util/debug/sanitizer_scopes.h" #include "util/easy_json.h" #include "util/os_util.h" @@ -479,6 +480,9 @@ void* Thread::supervise_thread(void* arg) { // already incremented the reference count in StartThread. Thread::_tls = t; + // Create thread context, there is no need to create it when func is executed. + ThreadLocalHandle::create_thread_local_if_not_exits(); + // Publish our tid to '_tid', which unblocks any callers waiting in // WaitForTid(). Release_Store(&t->_tid, system_tid); @@ -514,6 +518,8 @@ void Thread::finish_thread(void* arg) { // NOTE: the above 'Release' call could be the last reference to 'this', // so 'this' could be destructed at this point. Do not add any code // following here! 
+ + ThreadLocalHandle::del_thread_local_if_count_is_zero(); } void Thread::init_threadmgr() { diff --git a/be/src/util/thrift_util.cpp b/be/src/util/thrift_util.cpp index 9356ba8bed3dca..fd141f3c74b73f 100644 --- a/be/src/util/thrift_util.cpp +++ b/be/src/util/thrift_util.cpp @@ -25,7 +25,6 @@ // IWYU pragma: no_include #include // IWYU pragma: keep -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/logging.h" #include "util/thrift_server.h" diff --git a/be/src/util/uid_util.h b/be/src/util/uid_util.h index c187bb9d6d0896..5c0b5fb72fb3cd 100644 --- a/be/src/util/uid_util.h +++ b/be/src/util/uid_util.h @@ -45,14 +45,14 @@ void to_hex(T val, char* buf) { } template -void from_hex(T* ret, const std::string& buf) { +void from_hex(T* ret, std::string_view buf) { T val = 0; - for (int i = 0; i < buf.length(); ++i) { + for (char i : buf) { int buf_val = 0; - if (buf.c_str()[i] >= '0' && buf.c_str()[i] <= '9') { - buf_val = buf.c_str()[i] - '0'; + if (i >= '0' && i <= '9') { + buf_val = i - '0'; } else { - buf_val = buf.c_str()[i] - 'a' + 10; + buf_val = i - 'a' + 10; } val <<= 4; val = val | buf_val; diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp b/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp index 8add4627f200a4..d0dcafd0144ed2 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp @@ -22,6 +22,7 @@ #include "vec/columns/column_array.h" #include "vec/columns/column_decimal.h" #include "vec/columns/column_map.h" +#include "vec/columns/column_object.h" #include "vec/columns/column_string.h" #include "vec/columns/column_struct.h" #include "vec/data_types/data_type.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap.h b/be/src/vec/aggregate_functions/aggregate_function_bitmap.h index b070bae8c1d544..aa167b6571c16d 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap.h +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap.h @@ -25,7 +25,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "util/bitmap_value.h" #include "vec/aggregate_functions/aggregate_function.h" @@ -141,7 +140,10 @@ struct AggregateFunctionBitmapData { void read(BufferReadable& buf) { DataTypeBitMap::deserialize_as_stream(value, buf); } - void reset() { is_first = true; } + void reset() { + is_first = true; + value.reset(); // it's better to call reset function by self firstly. 
+ } BitmapValue& get() { return value; } }; diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h index 382957302ec245..a4c08aefe2ad43 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h @@ -46,7 +46,7 @@ struct AggregateFunctionBitmapAggData { void add(const T& value_) { value.add(value_); } - void reset() { value.clear(); } + void reset() { value.reset(); } void merge(const AggregateFunctionBitmapAggData& other) { value |= other.value; } diff --git a/be/src/vec/aggregate_functions/aggregate_function_combinator.h b/be/src/vec/aggregate_functions/aggregate_function_combinator.h index 9f67df02dbe13b..1593d74ed4e59d 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_combinator.h +++ b/be/src/vec/aggregate_functions/aggregate_function_combinator.h @@ -50,8 +50,6 @@ class IAggregateFunctionCombinator { public: virtual String get_name() const = 0; - virtual bool is_for_internal_usage_only() const { return false; } - /** From the arguments for combined function (ex: UInt64, UInt8 for sumIf), * get the arguments for nested function (ex: UInt64 for sum). * If arguments are not suitable for combined function, throw an exception. diff --git a/be/src/vec/aggregate_functions/aggregate_function_percentile_approx.cpp b/be/src/vec/aggregate_functions/aggregate_function_percentile_approx.cpp index 537022f194ccef..5ffac66f8f2daa 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_percentile_approx.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_percentile_approx.cpp @@ -26,14 +26,21 @@ template AggregateFunctionPtr create_aggregate_function_percentile_approx(const std::string& name, const DataTypes& argument_types, const bool result_is_nullable) { + const DataTypePtr& argument_type = remove_nullable(argument_types[0]); + WhichDataType which(argument_type); + if (which.idx != TypeIndex::Float64) { + return nullptr; + } if (argument_types.size() == 1) { return creator_without_type::create>( remove_nullable(argument_types), result_is_nullable); - } else if (argument_types.size() == 2) { + } + if (argument_types.size() == 2) { return creator_without_type::create< AggregateFunctionPercentileApproxTwoParams>( remove_nullable(argument_types), result_is_nullable); - } else if (argument_types.size() == 3) { + } + if (argument_types.size() == 3) { return creator_without_type::create< AggregateFunctionPercentileApproxThreeParams>( remove_nullable(argument_types), result_is_nullable); diff --git a/be/src/vec/aggregate_functions/aggregate_function_quantile_state.h b/be/src/vec/aggregate_functions/aggregate_function_quantile_state.h index a7c22cab489afc..7c6a9cb9da0926 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_quantile_state.h +++ b/be/src/vec/aggregate_functions/aggregate_function_quantile_state.h @@ -22,7 +22,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "util/quantile_state.h" #include "vec/aggregate_functions/aggregate_function.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_reader_first_last.h b/be/src/vec/aggregate_functions/aggregate_function_reader_first_last.h index b10d69b2d4b382..110e1bcc1b0442 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_reader_first_last.h +++ b/be/src/vec/aggregate_functions/aggregate_function_reader_first_last.h @@ -23,6 +23,7 @@ #include 
"vec/columns/column_array.h" #include "vec/columns/column_map.h" #include "vec/columns/column_nullable.h" +#include "vec/columns/column_object.h" #include "vec/columns/column_struct.h" #include "vec/columns/column_vector.h" #include "vec/data_types/data_type_decimal.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp b/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp index 08e6e44311f3d1..068e2efaac48f0 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp @@ -100,7 +100,6 @@ AggregateFunctionSimpleFactory& AggregateFunctionSimpleFactory::instance() { register_aggregate_function_replace_reader_load(instance); register_aggregate_function_window_lead_lag_first_last(instance); register_aggregate_function_HLL_union_agg(instance); - register_aggregate_function_percentile_approx(instance); }); return instance; } diff --git a/be/src/vec/aggregate_functions/aggregate_function_uniq.h b/be/src/vec/aggregate_functions/aggregate_function_uniq.h index b9445c4ed715ab..3ef0359461b442 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_uniq.h +++ b/be/src/vec/aggregate_functions/aggregate_function_uniq.h @@ -28,7 +28,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "vec/aggregate_functions/aggregate_function.h" #include "vec/columns/column.h" diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index cfb9163820fbe5..5202c51a3daa6c 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -144,6 +144,9 @@ class IColumn : public COW { return nullptr; } + /// Some columns may require finalization before using of other operations. + virtual void finalize() {} + // Only used on ColumnDictionary virtual void set_rowset_segment_id(std::pair rowset_segment_id) {} @@ -590,6 +593,8 @@ class IColumn : public COW { virtual bool is_hll() const { return false; } + virtual bool is_variant() const { return false; } + virtual bool is_quantile_state() const { return false; } // true if column has null element @@ -641,6 +646,18 @@ class IColumn : public COW { return 0; } + /// Returns ratio of values in column, that are equal to default value of column. + /// Checks only @sample_ratio ratio of rows. + virtual double get_ratio_of_default_rows(double sample_ratio = 1.0) const { + LOG(FATAL) << fmt::format("get_ratio_of_default_rows of column {} are not implemented.", + get_name()); + return 0.0; + } + + /// Template is to devirtualize calls to 'isDefaultAt' method. + template + double get_ratio_of_default_rows_impl(double sample_ratio) const; + /// Column is ColumnVector of numbers or ColumnConst of it. Note that Nullable columns are not numeric. /// Implies is_fixed_and_contiguous. virtual bool is_numeric() const { return false; } @@ -649,8 +666,6 @@ class IColumn : public COW { virtual bool is_column_decimal() const { return false; } - virtual bool is_predicate_column() const { return false; } - virtual bool is_column_dictionary() const { return false; } virtual bool is_column_array() const { return false; } @@ -662,9 +677,6 @@ class IColumn : public COW { /// If the only value column can contain is NULL. virtual bool only_null() const { return false; } - /// Can be inside ColumnNullable. 
- virtual bool can_be_inside_nullable() const { return false; } - virtual bool low_cardinality() const { return false; } virtual void sort_column(const ColumnSorter* sorter, EqualFlags& flags, diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp index ca3b6c12da5bb9..fa9e048636719f 100644 --- a/be/src/vec/columns/column_array.cpp +++ b/be/src/vec/columns/column_array.cpp @@ -481,6 +481,10 @@ void ColumnArray::insert_range_from(const IColumn& src, size_t start, size_t len } } +double ColumnArray::get_ratio_of_default_rows(double sample_ratio) const { + return get_ratio_of_default_rows_impl(sample_ratio); +} + ColumnPtr ColumnArray::filter(const Filter& filt, ssize_t result_size_hint) const { if (typeid_cast(data.get())) return filter_number(filt, result_size_hint); @@ -829,42 +833,35 @@ ColumnPtr ColumnArray::replicate(const IColumn::Offsets& replicate_offsets) cons return replicate_generic(replicate_offsets); } -void ColumnArray::replicate(const uint32_t* indexs, size_t target_size, IColumn& column) const { +void ColumnArray::replicate(const uint32_t* indices, size_t target_size, IColumn& column) const { if (target_size == 0) { return; } - auto total_size = get_offsets().size(); - // |---------------------|-------------------------|-------------------------| - // [0, begin) [begin, begin + count_sz) [begin + count_sz, size()) - // do not need to copy copy counts[n] times do not need to copy - IColumn::Offsets replicate_offsets(total_size, 0); - // copy original data at offset n counts[n] times - auto begin = 0, end = 0; - while (begin < target_size) { - while (end < target_size && indexs[begin] == indexs[end]) { - end++; - } - long index = indexs[begin]; - replicate_offsets[index] = end - begin; - begin = end; - } - // ignored - for (size_t i = 1; i < total_size; ++i) { - replicate_offsets[i] += replicate_offsets[i - 1]; - } + auto& dst_col = assert_cast(column); + auto& dst_data_col = dst_col.get_data(); + auto& dst_offsets = dst_col.get_offsets(); + dst_offsets.reserve(target_size); - auto rep_res = replicate(replicate_offsets); - if (!rep_res) { - LOG(WARNING) << "ColumnArray replicate failed, replicate_offsets count=" - << replicate_offsets.size() << ", max=" << replicate_offsets.back(); - return; + PODArray data_indices_to_replicate; + + for (size_t i = 0; i < target_size; ++i) { + const auto index = indices[i]; + const auto start = offset_at(index); + const auto length = size_at(index); + dst_offsets.push_back(dst_offsets.back() + length); + if (UNLIKELY(length == 0)) { + continue; + } + + data_indices_to_replicate.reserve(data_indices_to_replicate.size() + length); + for (size_t j = start; j != start + length; ++j) { + data_indices_to_replicate.push_back(j); + } } - auto& rep_res_arr = typeid_cast(*rep_res); - ColumnArray& res_arr = typeid_cast(column); - res_arr.data = rep_res_arr.get_data_ptr(); - res_arr.offsets = rep_res_arr.get_offsets_ptr(); + get_data().replicate(data_indices_to_replicate.data(), data_indices_to_replicate.size(), + dst_data_col); } template diff --git a/be/src/vec/columns/column_array.h b/be/src/vec/columns/column_array.h index c37fb48ba52b20..172815d76511b2 100644 --- a/be/src/vec/columns/column_array.h +++ b/be/src/vec/columns/column_array.h @@ -30,7 +30,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "vec/columns/column.h" @@ -127,7 +126,6 @@ class ColumnArray final : public COWHelper { std::string get_name() const override; const 
char* get_family_name() const override { return "Array"; } bool is_column_array() const override { return true; } - bool can_be_inside_nullable() const override { return true; } MutableColumnPtr clone_resized(size_t size) const override; size_t size() const override; void resize(size_t n) override; @@ -266,6 +264,8 @@ class ColumnArray final : public COWHelper { ColumnPtr index(const IColumn& indexes, size_t limit) const override; + double get_ratio_of_default_rows(double sample_ratio) const override; + private: // [[2,1,5,9,1], [1,2,4]] --> data column [2,1,5,9,1,1,2,4], offset[-1] = 0, offset[0] = 5, offset[1] = 8 // [[[2,1,5],[9,1]], [[1,2]]] --> data column [3 column array], offset[-1] = 0, offset[0] = 2, offset[1] = 3 diff --git a/be/src/vec/columns/column_complex.h b/be/src/vec/columns/column_complex.h index b25ae9f1577291..6c752d082b4044 100644 --- a/be/src/vec/columns/column_complex.h +++ b/be/src/vec/columns/column_complex.h @@ -233,8 +233,6 @@ class ColumnComplexType final : public COWHelper> "compare_at for " + std::string(get_family_name())); } - bool can_be_inside_nullable() const override { return true; } - bool is_fixed_and_contiguous() const override { return true; } size_t size_of_value_if_fixed() const override { return sizeof(T); } diff --git a/be/src/vec/columns/column_decimal.h b/be/src/vec/columns/column_decimal.h index 6c1e8893a3d94a..30c4f1116fd1f0 100644 --- a/be/src/vec/columns/column_decimal.h +++ b/be/src/vec/columns/column_decimal.h @@ -106,7 +106,6 @@ class ColumnDecimal final : public COWHelper> { const char* get_family_name() const override { return "ColumnDictionary"; } - [[noreturn]] MutableColumnPtr clone_resized(size_t size) const override { - LOG(FATAL) << "clone_resized not supported in ColumnDictionary"; + MutableColumnPtr clone_resized(size_t size) const override { + DCHECK(size == 0); + return this->create(); } void insert(const Field& x) override { @@ -145,8 +146,6 @@ class ColumnDictionary final : public COWHelper> { LOG(FATAL) << "compare_at not supported in ColumnDictionary"; } - bool can_be_inside_nullable() const override { return true; } - bool is_fixed_and_contiguous() const override { return true; } void get_indices_of_non_default_rows(IColumn::Offsets64& indices, size_t from, diff --git a/be/src/vec/columns/column_dummy.h b/be/src/vec/columns/column_dummy.h new file mode 100644 index 00000000000000..a152dc9751a0e1 --- /dev/null +++ b/be/src/vec/columns/column_dummy.h @@ -0,0 +1,175 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
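// Editor's sketch (not part of this patch; names are illustrative): the new
// column_dummy.h below implements the "dummy column" pattern, where a column
// stores no per-row payload and only tracks a row count, so most mutations
// reduce to counter arithmetic.
#include <cstddef>

struct DummyColumnSketch {
    size_t rows = 0;                            // the only state kept
    void insert_default() { ++rows; }           // one more identical row
    void insert_range(size_t n) { rows += n; }  // bulk insert is plain addition
    size_t byte_size() const { return 0; }      // nothing is allocated per row
};
// ColumnNothing, added later in this patch, builds on exactly this idea: it
// only gives a size and a type identity to expressions that carry no values.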
+// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/AggregateFunctions/IColumnDummy.h +// and modified by Doris + +#pragma once + +#include "vec/columns/column.h" +#include "vec/columns/columns_common.h" +#include "vec/common/arena.h" +#include "vec/common/pod_array.h" + +namespace doris::vectorized { + +/** Base class for columns-constants that contain a value that is not in the `Field`. + * Not a full-fledged column and is used in a special way. + */ +class IColumnDummy : public IColumn { +public: + IColumnDummy() : s(0) {} + IColumnDummy(size_t s_) : s(s_) {} + +public: + virtual MutableColumnPtr clone_dummy(size_t s_) const = 0; + + MutableColumnPtr clone_resized(size_t s) const override { return clone_dummy(s); } + size_t size() const override { return s; } + void insert_default() override { ++s; } + void pop_back(size_t n) override { s -= n; } + size_t byte_size() const override { return 0; } + size_t allocated_bytes() const override { return 0; } + int compare_at(size_t, size_t, const IColumn&, int) const override { return 0; } + + [[noreturn]] Field operator[](size_t) const override { + LOG(FATAL) << "Cannot get value from " << get_name(); + } + + void get(size_t, Field&) const override { + LOG(FATAL) << "Cannot get value from " << get_name(); + } + + void insert(const Field&) override { + LOG(FATAL) << "Cannot insert element into " << get_name(); + } + + StringRef get_data_at(size_t) const override { return {}; } + + void insert_data(const char*, size_t) override { ++s; } + + StringRef serialize_value_into_arena(size_t /*n*/, Arena& arena, + char const*& begin) const override { + return {arena.alloc_continue(0, begin), 0}; + } + + const char* deserialize_and_insert_from_arena(const char* pos) override { + ++s; + return pos; + } + + void insert_from(const IColumn&, size_t) override { ++s; } + + void insert_range_from(const IColumn& /*src*/, size_t /*start*/, size_t length) override { + s += length; + } + + void insert_indices_from(const IColumn& src, const int* indices_begin, + const int* indices_end) override { + s += (indices_end - indices_begin); + } + + ColumnPtr filter(const Filter& filt, ssize_t /*result_size_hint*/) const override { + return clone_dummy(count_bytes_in_filter(filt)); + } + + size_t filter(const Filter& filter) override { + const auto result_size = count_bytes_in_filter(filter); + s = result_size; + return result_size; + } + + ColumnPtr permute(const Permutation& perm, size_t limit) const override { + if (s != perm.size()) { + LOG(FATAL) << "Size of permutation doesn't match size of column."; + } + + return clone_dummy(limit ? 
std::min(s, limit) : s); + } + + void get_permutation(bool /*reverse*/, size_t /*limit*/, int /*nan_direction_hint*/, + Permutation& res) const override { + res.resize(s); + for (size_t i = 0; i < s; ++i) res[i] = i; + } + + ColumnPtr replicate(const Offsets& offsets) const override { + column_match_offsets_size(s, offsets.size()); + + return clone_dummy(offsets.back()); + } + + void replicate(const uint32_t* indexs, size_t target_size, IColumn& column) const override { + LOG(FATAL) << "Not implemented"; + } + + MutableColumns scatter(ColumnIndex num_columns, const Selector& selector) const override { + if (s != selector.size()) { + LOG(FATAL) << "Size of selector doesn't match size of column."; + } + + std::vector counts(num_columns); + for (auto idx : selector) ++counts[idx]; + + MutableColumns res(num_columns); + for (size_t i = 0; i < num_columns; ++i) res[i] = clone_resized(counts[i]); + + return res; + } + + void append_data_by_selector(MutableColumnPtr& res, + const IColumn::Selector& selector) const override { + size_t num_rows = size(); + + if (num_rows < selector.size()) { + LOG(FATAL) << fmt::format("Size of selector: {}, is larger than size of column:{}", + selector.size(), num_rows); + } + + res->reserve(num_rows); + + for (size_t i = 0; i < selector.size(); ++i) res->insert_from(*this, selector[i]); + } + + void addSize(size_t delta) { s += delta; } + + bool is_dummy() const override { return true; } + + void replace_column_data(const IColumn& rhs, size_t row, size_t self_row = 0) override { + LOG(FATAL) << "should not call the method in column dummy"; + } + + void replace_column_data_default(size_t self_row = 0) override { + LOG(FATAL) << "should not call the method in column dummy"; + } + + void get_indices_of_non_default_rows(Offsets64&, size_t, size_t) const override { + LOG(FATAL) << "should not call the method in column dummy"; + } + + ColumnPtr index(const IColumn& indexes, size_t limit) const override { + if (indexes.size() < limit) { + LOG(FATAL) << "Size of indexes is less than required."; + } + return clone_dummy(limit ? 
limit : s); + } + +protected: + size_t s; +}; + +} // namespace doris::vectorized diff --git a/be/src/vec/columns/column_fixed_length_object.h b/be/src/vec/columns/column_fixed_length_object.h index dce6666f132f91..5b9733748d09fd 100644 --- a/be/src/vec/columns/column_fixed_length_object.h +++ b/be/src/vec/columns/column_fixed_length_object.h @@ -50,8 +50,6 @@ class ColumnFixedLengthObject final : public COWHelper +double IColumn::get_ratio_of_default_rows_impl(double sample_ratio) const { + if (sample_ratio <= 0.0 || sample_ratio > 1.0) { + LOG(FATAL) << "Value of 'sample_ratio' must be in interval (0.0; 1.0], but got: " + << sample_ratio; + } + static constexpr auto max_number_of_rows_for_full_search = 1000; + size_t num_rows = size(); + size_t num_sampled_rows = std::min(static_cast(num_rows * sample_ratio), num_rows); + size_t num_checked_rows = 0; + size_t res = 0; + if (num_sampled_rows == num_rows || num_rows <= max_number_of_rows_for_full_search) { + for (size_t i = 0; i < num_rows; ++i) + res += static_cast(*this).is_default_at(i); + num_checked_rows = num_rows; + } else if (num_sampled_rows != 0) { + for (size_t i = 0; i < num_rows; ++i) { + if (num_checked_rows * num_rows <= i * num_sampled_rows) { + res += static_cast(*this).is_default_at(i); + ++num_checked_rows; + } + } + } + if (num_checked_rows == 0) { + return 0.0; + } + return static_cast(res) / num_checked_rows; +} + } // namespace doris::vectorized diff --git a/be/src/vec/columns/column_map.cpp b/be/src/vec/columns/column_map.cpp index 47a41c3dcfe571..f3fa5ab6c26479 100644 --- a/be/src/vec/columns/column_map.cpp +++ b/be/src/vec/columns/column_map.cpp @@ -433,14 +433,24 @@ ColumnPtr ColumnMap::replicate(const Offsets& offsets) const { return res; } -void ColumnMap::replicate(const uint32_t* indexs, size_t target_size, IColumn& column) const { +void ColumnMap::replicate(const uint32_t* indices, size_t target_size, IColumn& column) const { auto& res = reinterpret_cast(column); - // Make a temp column array for reusing its replicate function - ColumnArray::create(keys_column->assume_mutable(), offsets_column->assume_mutable()) - ->replicate(indexs, target_size, res.keys_column->assume_mutable_ref()); - ColumnArray::create(values_column->assume_mutable(), offsets_column->assume_mutable()) - ->replicate(indexs, target_size, res.values_column->assume_mutable_ref()); + auto keys_array = + ColumnArray::create(keys_column->assume_mutable(), offsets_column->assume_mutable()); + + auto result_array = ColumnArray::create(res.keys_column->assume_mutable(), + res.offsets_column->assume_mutable()); + keys_array->replicate(indices, target_size, result_array->assume_mutable_ref()); + + result_array = ColumnArray::create(res.values_column->assume_mutable(), + res.offsets_column->clone_empty()); + + auto values_array = + ColumnArray::create(values_column->assume_mutable(), offsets_column->assume_mutable()); + + /// FIXME: To reuse the replicate of ColumnArray, the offsets column was replicated twice + values_array->replicate(indices, target_size, result_array->assume_mutable_ref()); } MutableColumnPtr ColumnMap::get_shrinked_column() { diff --git a/be/src/vec/columns/column_map.h b/be/src/vec/columns/column_map.h index 551b6f9bfd6328..f6fd313208501e 100644 --- a/be/src/vec/columns/column_map.h +++ b/be/src/vec/columns/column_map.h @@ -30,7 +30,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "vec/columns/column.h" @@ -94,8 +93,6 @@ class ColumnMap 
final : public COWHelper { MutableColumnPtr clone_resized(size_t size) const override; - bool can_be_inside_nullable() const override { return true; } - Field operator[](size_t n) const override; void get(size_t n, Field& res) const override; StringRef get_data_at(size_t n) const override; diff --git a/be/src/vec/columns/column_nothing.h b/be/src/vec/columns/column_nothing.h new file mode 100644 index 00000000000000..8a10eec8b6f36d --- /dev/null +++ b/be/src/vec/columns/column_nothing.h @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/AggregateFunctions/ColumnNothing.h +// and modified by Doris + +#pragma once + +#include "vec/columns/column_dummy.h" + +namespace doris::vectorized { + +class ColumnNothing final : public COWHelper { +private: + friend class COWHelper; + + ColumnNothing(size_t s_) { s = s_; } + + ColumnNothing(const ColumnNothing&) = default; + +public: + const char* get_family_name() const override { return "Nothing"; } + MutableColumnPtr clone_dummy(size_t s_) const override { return ColumnNothing::create(s_); } + + bool structure_equals(const IColumn& rhs) const override { + return typeid(rhs) == typeid(ColumnNothing); + } +}; + +} // namespace doris::vectorized diff --git a/be/src/vec/columns/column_nullable.cpp b/be/src/vec/columns/column_nullable.cpp index 42b88ac7ae9af4..4f25a3f4b159a3 100644 --- a/be/src/vec/columns/column_nullable.cpp +++ b/be/src/vec/columns/column_nullable.cpp @@ -44,10 +44,6 @@ ColumnNullable::ColumnNullable(MutableColumnPtr&& nested_column_, MutableColumnP nested_column = assert_cast(*nested_column).get_nested_column_ptr(); } - if (!get_nested_column().can_be_inside_nullable()) { - LOG(FATAL) << get_nested_column().get_name() << " cannot be inside Nullable column"; - } - if (is_column_const(*null_map)) { LOG(FATAL) << "ColumnNullable cannot have constant null map"; } @@ -564,9 +560,7 @@ bool ColumnNullable::has_null(size_t size) const { } ColumnPtr make_nullable(const ColumnPtr& column, bool is_nullable) { - if (is_column_nullable(*column)) { - return column; - } + if (is_column_nullable(*column)) return column; if (is_column_const(*column)) { return ColumnConst::create( diff --git a/be/src/vec/columns/column_nullable.h b/be/src/vec/columns/column_nullable.h index 3b97e708d1155b..8a45c51d236549 100644 --- a/be/src/vec/columns/column_nullable.h +++ b/be/src/vec/columns/column_nullable.h @@ -27,7 +27,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "olap/olap_common.h" @@ -93,6 +92,7 @@ class ColumnNullable final : public COWHelper { bool is_null_at(size_t n) const override { return 
assert_cast(*null_map).get_data()[n] != 0; } + bool is_default_at(size_t n) const override { return is_null_at(n); } Field operator[](size_t n) const override; void get(size_t n, Field& res) const override; bool get_bool(size_t n) const override { @@ -354,6 +354,10 @@ class ColumnNullable final : public COWHelper { return get_ptr(); } + double get_ratio_of_default_rows(double sample_ratio) const override { + return get_ratio_of_default_rows_impl(sample_ratio); + } + void convert_dict_codes_if_necessary() override { get_nested_column().convert_dict_codes_if_necessary(); } diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp index f3571c8ba29674..cf11b47947582b 100644 --- a/be/src/vec/columns/column_object.cpp +++ b/be/src/vec/columns/column_object.cpp @@ -23,125 +23,102 @@ #include #include #include +#include +#include #include #include #include +#include #include +#include "common/compiler_util.h" // IWYU pragma: keep #include "common/exception.h" +#include "common/logging.h" #include "common/status.h" +#include "exprs/json_functions.h" +#include "olap/olap_common.h" +#include "util/defer_op.h" +#include "util/simd/bits.h" +#include "vec/aggregate_functions/aggregate_function.h" +#include "vec/columns/column.h" #include "vec/columns/column_array.h" #include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_vector.h" #include "vec/columns/columns_number.h" +#include "vec/common/assert_cast.h" #include "vec/common/field_visitors.h" #include "vec/common/schema_util.h" +#include "vec/common/string_buffer.hpp" +#include "vec/common/typeid_cast.h" +#include "vec/core/column_with_type_and_name.h" #include "vec/core/field.h" +#include "vec/core/types.h" #include "vec/data_types/convert_field_to_type.h" +#include "vec/data_types/data_type.h" #include "vec/data_types/data_type_array.h" +#include "vec/data_types/data_type_decimal.h" #include "vec/data_types/data_type_factory.hpp" +#include "vec/data_types/data_type_jsonb.h" #include "vec/data_types/data_type_nothing.h" -#include "vec/data_types/get_least_supertype.h" - -// IWYU pragma: no_include -#include "common/compiler_util.h" // IWYU pragma: keep -#include "common/logging.h" -#include "vec/aggregate_functions/aggregate_function.h" -#include "vec/columns/column.h" -#include "vec/columns/column_vector.h" -#include "vec/common/assert_cast.h" -#include "vec/common/typeid_cast.h" -#include "vec/data_types/data_type.h" -#include "vec/data_types/data_type_decimal.h" #include "vec/data_types/data_type_nullable.h" +#include "vec/data_types/get_least_supertype.h" namespace doris::vectorized { namespace { -DataTypePtr create_array_of_type(DataTypePtr type, size_t num_dimensions) { +DataTypePtr create_array_of_type(DataTypePtr type, size_t num_dimensions, bool is_nullable) { + const DataTypeNullable* nullable = typeid_cast(type.get()); + if ((nullable && + typeid_cast(nullable->get_nested_type().get())) || + typeid_cast(type.get())) { + // JSONB type MUST NOT be wrapped in an ARRAY column; it should stay top level. + // So we ignore num_dimensions. + return type; + } for (size_t i = 0; i < num_dimensions; ++i) { type = std::make_shared(std::move(type)); + if (is_nullable) { + // wrap array with nullable + type = make_nullable(type); + } } return type; } -DataTypePtr getBaseTypeOfArray(const DataTypePtr& type) { +DataTypePtr get_base_type_of_array(const DataTypePtr& type) { /// Get raw pointers to avoid extra copying of type pointers. 
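// Worked example (editor's note, assuming the wrapping done by
// create_array_of_type above with is_nullable = true): a scalar type
// Nullable(Int32) with num_dimensions = 2 is stored as
// Nullable(Array(Nullable(Array(Nullable(Int32))))). The loop below peels the
// outer Nullable once, then one Nullable per Array level, so the base type
// returned from the innermost Array is Nullable(Int32).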
const DataTypeArray* last_array = nullptr; const auto* current_type = type.get(); + if (const auto* nullable = typeid_cast(current_type)) { + current_type = nullable->get_nested_type().get(); + } while (const auto* type_array = typeid_cast(current_type)) { current_type = type_array->get_nested_type().get(); last_array = type_array; + if (const auto* nullable = typeid_cast(current_type)) { + current_type = nullable->get_nested_type().get(); + } } return last_array ? last_array->get_nested_type() : type; } -size_t getNumberOfDimensions(const IDataType& type) { - if (const auto* type_array = typeid_cast(&type)) { - return type_array->get_number_of_dimensions(); - } - return 0; -} - -DataTypePtr get_data_type_by_column(const IColumn& column) { - // Removed in the future PR - // assert(false); - return nullptr; -} - -/// Recreates column with default scalar values and keeps sizes of arrays. -ColumnPtr recreate_column_with_default_value(const ColumnPtr& column, - const DataTypePtr& scalar_type, - size_t num_dimensions) { - const auto* column_array = check_and_get_column(column.get()); - if (column_array && num_dimensions) { - return ColumnArray::create( - recreate_column_with_default_value(column_array->get_data_ptr(), scalar_type, - num_dimensions - 1), - IColumn::mutate(column_array->get_offsets_ptr())); +size_t get_number_of_dimensions(const IDataType& type) { + int num_dimensions = 0; + const auto* current_type = &type; + if (const auto* nullable = typeid_cast(current_type)) { + current_type = nullable->get_nested_type().get(); } - return create_array_of_type(scalar_type, num_dimensions) - ->create_column() - ->clone_resized(column->size()); -} - -Array create_empty_array_field(size_t num_dimensions) { - assert(num_dimensions != 0); - Array array; - Array* current_array = &array; - for (size_t i = 1; i < num_dimensions; ++i) { - current_array->push_back(Array()); - current_array = ¤t_array->back().get(); - } - return array; -} - -/// Replaces NULL fields to given field or empty array. -class FieldVisitorReplaceNull : public StaticVisitor { -public: - explicit FieldVisitorReplaceNull(const Field& replacement_, size_t num_dimensions_) - : replacement(replacement_), num_dimensions(num_dimensions_) {} - Field operator()(const Null&) const { - return num_dimensions ? create_empty_array_field(num_dimensions) : replacement; - } - Field operator()(const Array& x) const { - assert(num_dimensions > 0); - const size_t size = x.size(); - Array res(size); - for (size_t i = 0; i < size; ++i) { - res[i] = apply_visitor(FieldVisitorReplaceNull(replacement, num_dimensions - 1), x[i]); + while (const auto* type_array = typeid_cast(current_type)) { + current_type = type_array->get_nested_type().get(); + num_dimensions += 1; + if (const auto* nullable = typeid_cast(current_type)) { + current_type = nullable->get_nested_type().get(); } - return res; } - template - Field operator()(const T& x) const { - return x; - } - -private: - const Field& replacement; - size_t num_dimensions; -}; + return num_dimensions; +} /// Calculates number of dimensions in array field. /// Returns 0 for scalar fields. @@ -149,23 +126,12 @@ class FieldVisitorToNumberOfDimensions : public StaticVisitor { public: size_t operator()(const Array& x) const { const size_t size = x.size(); - std::optional dimensions; + size_t dimensions = 0; for (size_t i = 0; i < size; ++i) { - /// Do not count Nulls, because they will be replaced by default - /// values with proper number of dimensions. 
- if (x[i].is_null()) { - continue; - } - size_t current_dimensions = apply_visitor(*this, x[i]); - if (!dimensions) { - dimensions = current_dimensions; - } else if (current_dimensions != *dimensions) { - throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, - "Number of dimensions mismatched among array elements"); - return 0; - } + size_t element_dimensions = apply_visitor(*this, x[i]); + dimensions = std::max(dimensions, element_dimensions); } - return 1 + dimensions.value_or(0); + return 1 + dimensions; } template size_t operator()(const T&) const { @@ -202,18 +168,25 @@ class FieldVisitorToScalarType : public StaticVisitor { return 0; } size_t operator()(const Int64& x) { - // // Only Int64 | Int32 at present - // field_types.insert(FieldType::Int64); - // type_indexes.insert(TypeIndex::Int64); - // return 0; field_types.insert(FieldType::Int64); - if (x <= std::numeric_limits::max() && x >= std::numeric_limits::min()) { + if (x <= std::numeric_limits::max() && x >= std::numeric_limits::min()) { + type_indexes.insert(TypeIndex::Int8); + } else if (x <= std::numeric_limits::max() && + x >= std::numeric_limits::min()) { + type_indexes.insert(TypeIndex::Int16); + } else if (x <= std::numeric_limits::max() && + x >= std::numeric_limits::min()) { type_indexes.insert(TypeIndex::Int32); } else { type_indexes.insert(TypeIndex::Int64); } return 0; } + size_t operator()(const JsonbField& x) { + field_types.insert(FieldType::JSONB); + type_indexes.insert(TypeIndex::JSONB); + return 0; + } size_t operator()(const Null&) { have_nulls = true; return 0; @@ -226,7 +199,7 @@ class FieldVisitorToScalarType : public StaticVisitor { return 0; } void get_scalar_type(DataTypePtr* type) const { - get_least_supertype(type_indexes, type, true /*compatible with string type*/); + get_least_supertype(type_indexes, type); } bool contain_nulls() const { return have_nulls; } bool need_convert_field() const { return field_types.size() > 1; } @@ -252,15 +225,18 @@ void get_field_info(const Field& field, FieldInfo* info) { }; } -ColumnObject::Subcolumn::Subcolumn(MutableColumnPtr&& data_, bool is_nullable_) - : least_common_type(get_data_type_by_column(*data_)), is_nullable(is_nullable_) { +ColumnObject::Subcolumn::Subcolumn(MutableColumnPtr&& data_, DataTypePtr type, bool is_nullable_, + bool is_root_) + : least_common_type(type), is_nullable(is_nullable_), is_root(is_root_) { data.push_back(std::move(data_)); + data_types.push_back(type); } -ColumnObject::Subcolumn::Subcolumn(size_t size_, bool is_nullable_) +ColumnObject::Subcolumn::Subcolumn(size_t size_, bool is_nullable_, bool is_root_) : least_common_type(std::make_shared()), is_nullable(is_nullable_), - num_of_defaults_in_prefix(size_) {} + num_of_defaults_in_prefix(size_), + is_root(is_root_) {} size_t ColumnObject::Subcolumn::Subcolumn::size() const { size_t res = num_of_defaults_in_prefix; @@ -294,7 +270,8 @@ void ColumnObject::Subcolumn::insert(Field field) { void ColumnObject::Subcolumn::add_new_column_part(DataTypePtr type) { data.push_back(type->create_column()); - least_common_type = LeastCommonType {std::move(type)}; + least_common_type = LeastCommonType {type}; + data_types.push_back(type); } void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) { @@ -305,45 +282,42 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) { } auto column_dim = least_common_type.get_dimensions(); auto value_dim = info.num_dimensions; - if (is_nothing(least_common_type.getBase())) { + if (is_nothing(least_common_type.get_base())) { 
column_dim = value_dim; } if (is_nothing(base_type)) { value_dim = column_dim; } - if (value_dim != column_dim) { - throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, - "Dimension of types mismatched between inserted value and column, " - "expected:{}, but meet:{} for type:{}", - column_dim, value_dim, least_common_type.get()->get_name()); + bool type_changed = false; + if (value_dim != column_dim || info.num_dimensions >= 2) { + // Deduce to JSONB + VLOG_DEBUG << fmt::format( + "Dimension of types mismatched between inserted value and column, " + "expected:{}, but meet:{} for type:{}", + column_dim, value_dim, least_common_type.get()->get_name()); + base_type = std::make_shared(); + value_dim = 0; + type_changed = true; } if (is_nullable && !is_nothing(base_type)) { base_type = make_nullable(base_type); } - // alawys nullable at present - if (!is_nullable && info.have_nulls) { - field = apply_visitor(FieldVisitorReplaceNull(base_type->get_default(), value_dim), - std::move(field)); - } - // need replace muli dimensions array which contains null. eg. [[1, 2, 3], null] -> [[1, 2, 3], []] - // since column array doesnt known null's dimension - if (info.num_dimensions >= 2 && info.have_nulls) { - field = apply_visitor(FieldVisitorReplaceNull(base_type->get_default(), value_dim), - std::move(field)); - } - bool type_changed = false; - const auto& least_common_base_type = least_common_type.getBase(); + const auto& least_common_base_type = least_common_type.get_base(); if (data.empty()) { - add_new_column_part(create_array_of_type(std::move(base_type), value_dim)); + add_new_column_part(create_array_of_type(std::move(base_type), value_dim, is_nullable)); } else if (!least_common_base_type->equals(*base_type) && !is_nothing(base_type)) { if (!schema_util::is_conversion_required_between_integers(*base_type, *least_common_base_type)) { - get_least_supertype(DataTypes {std::move(base_type), least_common_base_type}, - &base_type, true /*compatible with string type*/); + get_least_supertype( + DataTypes {std::move(base_type), least_common_base_type}, &base_type); type_changed = true; + if (is_nullable) { + base_type = make_nullable(base_type); + } if (!least_common_base_type->equals(*base_type)) { - add_new_column_part(create_array_of_type(std::move(base_type), value_dim)); + add_new_column_part( + create_array_of_type(std::move(base_type), value_dim, is_nullable)); } } } @@ -358,53 +332,121 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) { } void ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn& src, size_t start, size_t length) { - assert(src.is_finalized()); - const auto& src_column = src.data.back(); - const auto& src_type = src.least_common_type.get(); + assert(start + length <= src.size()); + size_t end = start + length; + // num_rows += length; if (data.empty()) { - add_new_column_part(src.least_common_type.get()); - data.back()->insert_range_from(*src_column, start, length); - } else if (least_common_type.get()->equals(*src_type)) { - data.back()->insert_range_from(*src_column, start, length); - } else { - DataTypePtr new_least_common_type = nullptr; - get_least_supertype(DataTypes {least_common_type.get(), src_type}, &new_least_common_type, - true /*compatible with string type*/); - ColumnPtr casted_column; - Status st = schema_util::cast_column({src_column, src_type, ""}, new_least_common_type, - &casted_column); + add_new_column_part(src.get_least_common_type()); + } else if (!least_common_type.get()->equals(*src.get_least_common_type())) { + DataTypePtr 
new_least_common_type; + get_least_supertype( + DataTypes {least_common_type.get(), src.get_least_common_type()}, + &new_least_common_type); + if (!new_least_common_type->equals(*least_common_type.get())) { + add_new_column_part(std::move(new_least_common_type)); + } + } + if (end <= src.num_of_defaults_in_prefix) { + data.back()->insert_many_defaults(length); + return; + } + if (start < src.num_of_defaults_in_prefix) { + data.back()->insert_many_defaults(src.num_of_defaults_in_prefix - start); + } + auto insert_from_part = [&](const auto& column, const auto& column_type, size_t from, + size_t n) { + assert(from + n <= column->size()); + if (column_type->equals(*least_common_type.get())) { + data.back()->insert_range_from(*column, from, n); + return; + } + /// If we need to insert large range, there is no sense to cut part of column and cast it. + /// Casting of all column and inserting from it can be faster. + /// Threshold is just a guess. + if (n * 3 >= column->size()) { + ColumnPtr casted_column; + Status st = schema_util::cast_column({column, column_type, ""}, least_common_type.get(), + &casted_column); + if (!st.ok()) { + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, + st.to_string() + ", real_code:{}", st.code()); + } + data.back()->insert_range_from(*casted_column, from, n); + return; + } + auto casted_column = column->cut(from, n); + Status st = schema_util::cast_column({casted_column, column_type, ""}, + least_common_type.get(), &casted_column); if (!st.ok()) { throw doris::Exception(ErrorCode::INVALID_ARGUMENT, st.to_string() + ", real_code:{}", st.code()); } - if (!least_common_type.get()->equals(*new_least_common_type)) { - add_new_column_part(std::move(new_least_common_type)); - } - data.back()->insert_range_from(*casted_column, start, length); + data.back()->insert_range_from(*casted_column, 0, n); + }; + size_t pos = 0; + size_t processed_rows = src.num_of_defaults_in_prefix; + /// Find the first part of the column that intersects the range. + while (pos < src.data.size() && processed_rows + src.data[pos]->size() < start) { + processed_rows += src.data[pos]->size(); + ++pos; + } + /// Insert from the first part of column. + if (pos < src.data.size() && processed_rows < start) { + size_t part_start = start - processed_rows; + size_t part_length = std::min(src.data[pos]->size() - part_start, end - start); + insert_from_part(src.data[pos], src.data_types[pos], part_start, part_length); + processed_rows += src.data[pos]->size(); + ++pos; + } + /// Insert from the parts of column in the middle of range. + while (pos < src.data.size() && processed_rows + src.data[pos]->size() < end) { + insert_from_part(src.data[pos], src.data_types[pos], 0, src.data[pos]->size()); + processed_rows += src.data[pos]->size(); + ++pos; + } + /// Insert from the last part of column if needed. 
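// Worked example (editor's note): suppose src has num_of_defaults_in_prefix = 2
// and two parts of sizes 3 and 4, i.e. global rows 0-1 are defaults, rows 2-4
// live in part 0 and rows 5-8 in part 1. For start = 3, length = 4 (end = 7):
// no defaults are emitted, the "first part" branch copies part-0 rows 1..2
// (global rows 3-4), the middle loop matches nothing, and the branch below
// copies part-1 rows 0..1 (global rows 5-6), exactly `length` rows in total.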
+ if (pos < src.data.size() && processed_rows < end) { + size_t part_end = end - processed_rows; + insert_from_part(src.data[pos], src.data_types[pos], 0, part_end); } } bool ColumnObject::Subcolumn::is_finalized() const { - return data.empty() || (data.size() == 1 && num_of_defaults_in_prefix == 0); + return num_of_defaults_in_prefix == 0 && (data.empty() || (data.size() == 1)); } template -ColumnPtr ColumnObject::apply_for_subcolumns(Func&& func, std::string_view func_name) const { +MutableColumnPtr ColumnObject::apply_for_subcolumns(Func&& func) const { if (!is_finalized()) { - // LOG(FATAL) << "Cannot " << func_name << " non-finalized ColumnObject"; - throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, - "Cannot {} non-finalized ColumnObject", func_name); + auto finalized = clone_finalized(); + auto& finalized_object = assert_cast(*finalized); + return finalized_object.apply_for_subcolumns(std::forward(func)); } - auto res = ColumnObject::create(is_nullable); + auto res = ColumnObject::create(is_nullable, false); for (const auto& subcolumn : subcolumns) { auto new_subcolumn = func(subcolumn->data.get_finalized_column()); - res->add_sub_column(subcolumn->path, new_subcolumn->assume_mutable()); + res->add_sub_column(subcolumn->path, new_subcolumn->assume_mutable(), + subcolumn->data.get_least_common_type()); } return res; } ColumnPtr ColumnObject::index(const IColumn& indexes, size_t limit) const { return apply_for_subcolumns( - [&](const auto& subcolumn) { return subcolumn.index(indexes, limit); }, "index"); + [&](const auto& subcolumn) { return subcolumn.index(indexes, limit); }); +} + +bool ColumnObject::Subcolumn::check_if_sparse_column(size_t num_rows) { + constexpr static size_t s_threshold_rows_estimate_sparse_column = 1000; + if (num_rows < s_threshold_rows_estimate_sparse_column) { + return false; + } + std::vector defaults_ratio; + for (size_t i = 0; i < data.size(); ++i) { + defaults_ratio.push_back(data[i]->get_ratio_of_default_rows()); + } + double default_ratio = std::accumulate(defaults_ratio.begin(), defaults_ratio.end(), 0.0) / + defaults_ratio.size(); + return default_ratio >= config::ratio_of_defaults_as_sparse_column; } void ColumnObject::Subcolumn::finalize() { @@ -415,37 +457,35 @@ void ColumnObject::Subcolumn::finalize() { data[0] = data[0]->convert_to_full_column_if_const(); return; } - const auto& to_type = least_common_type.get(); + DataTypePtr to_type = least_common_type.get(); + if (is_root) { + // Root always JSONB type + to_type = is_nullable ? make_nullable(std::make_shared()) + : std::make_shared(); + least_common_type = LeastCommonType {to_type}; + } auto result_column = to_type->create_column(); if (num_of_defaults_in_prefix) { result_column->insert_many_defaults(num_of_defaults_in_prefix); } - for (auto& part : data) { + for (size_t i = 0; i < data.size(); ++i) { + auto& part = data[i]; + auto from_type = data_types[i]; part = part->convert_to_full_column_if_const(); - auto from_type = get_data_type_by_column(*part); size_t part_size = part->size(); if (!from_type->equals(*to_type)) { - auto offsets = ColumnUInt64::create(); - auto& offsets_data = offsets->get_data(); - /// We need to convert only non-default values and then recreate column - /// with default value of new type, because default values (which represents misses in data) - /// may be inconsistent between types (e.g "0" in UInt64 and empty string in String). 
- part->get_indices_of_non_default_rows(offsets_data, 0, part_size); - if (offsets->size() == part_size) { - ColumnPtr ptr; - static_cast(schema_util::cast_column({part, from_type, ""}, to_type, &ptr)); - part = ptr; - } else { - auto values = part->index(*offsets, offsets->size()); - static_cast( - schema_util::cast_column({values, from_type, ""}, to_type, &values)); - part = values->create_with_offsets(offsets_data, to_type->get_default(), part_size, - /*shift=*/0); + ColumnPtr ptr; + Status st = schema_util::cast_column({part, from_type, ""}, to_type, &ptr); + if (!st.ok()) { + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, + st.to_string() + ", real_code:{}", st.code()); } + part = ptr; } result_column->insert_range_from(*part, 0, part_size); } data = {std::move(result_column)}; + data_types = {std::move(to_type)}; num_of_defaults_in_prefix = 0; } @@ -481,7 +521,9 @@ void ColumnObject::Subcolumn::pop_back(size_t n) { n -= column->size(); } } - data.resize(data.size() - num_removed); + size_t sz = data.size() - num_removed; + data.resize(sz); + data_types.resize(sz); num_of_defaults_in_prefix -= n; } @@ -494,25 +536,6 @@ Field ColumnObject::Subcolumn::get_last_field() const { return (*last_part)[last_part->size() - 1]; } -ColumnObject::Subcolumn ColumnObject::Subcolumn::recreate_with_default_values( - const FieldInfo& field_info) const { - auto scalar_type = field_info.scalar_type; - if (is_nullable) { - scalar_type = make_nullable(scalar_type); - } - Subcolumn new_subcolumn; - new_subcolumn.least_common_type = - LeastCommonType {create_array_of_type(scalar_type, field_info.num_dimensions)}; - new_subcolumn.is_nullable = is_nullable; - new_subcolumn.num_of_defaults_in_prefix = num_of_defaults_in_prefix; - new_subcolumn.data.reserve(data.size()); - for (const auto& part : data) { - new_subcolumn.data.push_back( - recreate_column_with_default_value(part, scalar_type, field_info.num_dimensions)); - } - return new_subcolumn; -} - IColumn& ColumnObject::Subcolumn::get_finalized_column() { assert(is_finalized()); return *data[0]; @@ -528,6 +551,11 @@ const ColumnPtr& ColumnObject::Subcolumn::get_finalized_column_ptr() const { return data[0]; } +ColumnPtr& ColumnObject::Subcolumn::get_finalized_column_ptr() { + assert(is_finalized()); + return data[0]; +} + void ColumnObject::Subcolumn::remove_nullable() { assert(is_finalized()); data[0] = doris::vectorized::remove_nullable(data[0]); @@ -536,10 +564,19 @@ void ColumnObject::Subcolumn::remove_nullable() { ColumnObject::Subcolumn::LeastCommonType::LeastCommonType(DataTypePtr type_) : type(std::move(type_)), - base_type(getBaseTypeOfArray(type)), - num_dimensions(getNumberOfDimensions(*type)) {} + base_type(get_base_type_of_array(type)), + num_dimensions(get_number_of_dimensions(*type)) { + if (!WhichDataType(type).is_nothing()) { + least_common_type_serder = type->get_serde(); + } +} -ColumnObject::ColumnObject(bool is_nullable_) : is_nullable(is_nullable_), num_rows(0) {} +ColumnObject::ColumnObject(bool is_nullable_, bool create_root_) + : is_nullable(is_nullable_), num_rows(0) { + if (create_root_) { + subcolumns.create_root(Subcolumn(0, is_nullable, true /*root*/)); + } +} ColumnObject::ColumnObject(Subcolumns&& subcolumns_, bool is_nullable_) : is_nullable(is_nullable_), @@ -571,12 +608,11 @@ size_t ColumnObject::size() const { } MutableColumnPtr ColumnObject::clone_resized(size_t new_size) const { - /// cloneResized with new_size == 0 is used for cloneEmpty(). 
- if (new_size != 0) { - throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, - "ColumnObject doesn't support resize to non-zero length"); + if (new_size == 0) { + return ColumnObject::create(is_nullable); } - return ColumnObject::create(is_nullable); + return apply_for_subcolumns( + [&](const auto& subcolumn) { return subcolumn.clone_resized(new_size); }); } size_t ColumnObject::byte_size() const { @@ -596,24 +632,43 @@ size_t ColumnObject::allocated_bytes() const { } void ColumnObject::for_each_subcolumn(ColumnCallback callback) { - if (!is_finalized()) { - assert(false); - } for (auto& entry : subcolumns) { - callback(entry->data.data.back()); + for (auto& part : entry->data.data) { + callback(part); + } } } -void ColumnObject::try_insert_from(const IColumn& src, size_t n) { +void ColumnObject::insert_from(const IColumn& src, size_t n) { + const auto& src_v = assert_cast(src); + // optimize when src and this column are scalar variants, since try_insert is inefficient + if (src_v.is_scalar_variant() && is_scalar_variant() && + src_v.get_root_type()->equals(*get_root_type()) && src_v.is_finalized() && is_finalized()) { + assert_cast(*get_root()).insert_from(*src_v.get_root(), n); + ++num_rows; + return; + } return try_insert(src[n]); } void ColumnObject::try_insert(const Field& field) { + if (field.get_type() != Field::Types::VariantMap) { + auto* root = get_subcolumn({}); + if (!root) { + throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, "Failed to find root column_path"); + } + root->insert(field); + ++num_rows; + return; + } const auto& object = field.get(); phmap::flat_hash_set inserted; size_t old_size = size(); for (const auto& [key_str, value] : object) { - PathInData key(key_str); + PathInData key; + if (!key_str.empty()) { + key = PathInData(key_str); + } inserted.insert(key_str); if (!has_subcolumn(key)) { bool succ = add_sub_column(key, old_size); @@ -646,10 +701,17 @@ void ColumnObject::insert_default() { Field ColumnObject::operator[](size_t n) const { if (!is_finalized()) { - assert(false); + const_cast(this)->finalize(); } VariantMap map; for (const auto& entry : subcolumns) { + if (WhichDataType(remove_nullable(entry->data.data_types.back())).is_json()) { + // JsonbField is a special case + Field f = JsonbField(); + (*entry->data.data.back()).get(n, f); + map[entry->path.get_path()] = std::move(f); + continue; + } map[entry->path.get_path()] = (*entry->data.data.back())[n]; } return map; @@ -657,11 +719,15 @@ Field ColumnObject::operator[](size_t n) const { void ColumnObject::get(size_t n, Field& res) const { if (!is_finalized()) { - assert(false); + const_cast(this)->finalize(); } auto& map = res.get(); for (const auto& entry : subcolumns) { auto it = map.try_emplace(entry->path.get_path()).first; + if (WhichDataType(remove_nullable(entry->data.data_types.back())).is_json()) { + // JsonbField is a special case + it->second = JsonbField(); + } entry->data.data.back()->get(n, it->second); } } @@ -672,59 +738,52 @@ Status ColumnObject::try_insert_indices_from(const IColumn& src, const int* indi if (*x == -1) { ColumnObject::insert_default(); } else { - ColumnObject::try_insert_from(src, *x); + ColumnObject::insert_from(src, *x); } } finalize(); return Status::OK(); } -void ColumnObject::try_insert_range_from(const IColumn& src, size_t start, size_t length) { +FieldInfo ColumnObject::Subcolumn::get_subcolumn_field_info() const { + const auto& base_type = least_common_type.get_base(); + return FieldInfo { + .scalar_type = base_type, + .have_nulls = base_type->is_nullable(), 
+ .need_convert = false, + .num_dimensions = least_common_type.get_dimensions(), + }; +} + +void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t length) { const auto& src_object = assert_cast(src); - if (UNLIKELY(src_object.empty())) { - return; + for (const auto& entry : src_object.subcolumns) { + if (!has_subcolumn(entry->path)) { + add_sub_column(entry->path, num_rows); + } + auto* subcolumn = get_subcolumn(entry->path); + subcolumn->insertRangeFrom(entry->data, start, length); } for (auto& entry : subcolumns) { - if (src_object.has_subcolumn(entry->path)) { - auto* subcolumn = src_object.get_subcolumn(entry->path); - if (!subcolumn) { - throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, - "Failed to find sub column {}", entry->path.get_path()); - } - entry->data.insertRangeFrom(*subcolumn, start, length); - } else { + if (!src_object.has_subcolumn(entry->path)) { entry->data.insertManyDefaults(length); } } - for (const auto& entry : src_object.subcolumns) { - if (!has_subcolumn(entry->path)) { - bool succ = false; - if (entry->path.has_nested_part()) { - const auto& base_type = entry->data.get_least_common_typeBase(); - FieldInfo field_info { - .scalar_type = base_type, - .have_nulls = base_type->is_nullable(), - .need_convert = false, - .num_dimensions = entry->data.get_dimensions(), - }; - succ = add_nested_subcolumn(entry->path, field_info, num_rows); - } else { - succ = add_sub_column(entry->path, num_rows); - } - if (!succ) { - throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, - "Failed to add column {}", entry->path.get_path()); - } - auto* subcolumn = get_subcolumn(entry->path); - if (!subcolumn) { - throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, - "Failed to find sub column {}", entry->path.get_path()); - } - subcolumn->insertRangeFrom(entry->data, start, length); - } - } num_rows += length; finalize(); +#ifndef NDEBUG + check_consistency(); +#endif +} + +ColumnPtr ColumnObject::replicate(const Offsets& offsets) const { + return apply_for_subcolumns( + [&](const auto& subcolumn) { return subcolumn.replicate(offsets); }); +} + +ColumnPtr ColumnObject::permute(const Permutation& perm, size_t limit) const { + return apply_for_subcolumns( + [&](const auto& subcolumn) { return subcolumn.permute(perm, limit); }); } void ColumnObject::pop_back(size_t length) { @@ -756,9 +815,27 @@ bool ColumnObject::has_subcolumn(const PathInData& key) const { return subcolumns.find_leaf(key) != nullptr; } -bool ColumnObject::add_sub_column(const PathInData& key, MutableColumnPtr&& subcolumn) { +bool ColumnObject::add_sub_column(const PathInData& key, MutableColumnPtr&& subcolumn, + DataTypePtr type) { size_t new_size = subcolumn->size(); - bool inserted = subcolumns.add(key, Subcolumn(std::move(subcolumn), is_nullable)); + doc_structure = nullptr; + if (key.empty() && subcolumns.empty()) { + // create root + subcolumns.create_root(Subcolumn(std::move(subcolumn), type, is_nullable, true)); + num_rows = new_size; + return true; + } + if (key.empty() && ((!subcolumns.get_root()->is_scalar()) || + is_nothing(subcolumns.get_root()->data.get_least_common_type()))) { + // update root + subcolumns.get_mutable_root()->modify_to_scalar( + Subcolumn(std::move(subcolumn), type, is_nullable, true)); + if (num_rows == 0) { + num_rows = new_size; + } + return true; + } + bool inserted = subcolumns.add(key, Subcolumn(std::move(subcolumn), type, is_nullable)); if (!inserted) { VLOG_DEBUG << "Duplicated sub column " << key.get_path(); return false; @@ -773,6 
+850,21 @@ bool ColumnObject::add_sub_column(const PathInData& key, MutableColumnPtr&& subc } bool ColumnObject::add_sub_column(const PathInData& key, size_t new_size) { + if (key.empty() && subcolumns.empty()) { + // create root + subcolumns.create_root(Subcolumn(new_size, is_nullable, true)); + num_rows = new_size; + return true; + } + if (key.empty() && (!subcolumns.get_root()->is_scalar())) { + // update a non-scalar root column to a scalar node + subcolumns.get_mutable_root()->modify_to_scalar(Subcolumn(new_size, is_nullable, true)); + if (num_rows == 0) { + num_rows = new_size; + } + return true; + } + doc_structure = nullptr; bool inserted = subcolumns.add(key, Subcolumn(new_size, is_nullable)); if (!inserted) { VLOG_DEBUG << "Duplicated sub column " << key.get_path(); @@ -787,40 +879,6 @@ bool ColumnObject::add_sub_column(const PathInData& key, size_t new_size) { return true; } -bool ColumnObject::add_nested_subcolumn(const PathInData& key, const FieldInfo& field_info, - size_t new_size) { - assert(key.has_nested_part()); - bool inserted = false; - /// We find node that represents the same Nested type as @key. - const auto* nested_node = subcolumns.find_best_match(key); - if (nested_node) { - /// Find any leaf of Nested subcolumn. - const auto* leaf = doris::vectorized::ColumnObject::Subcolumns::find_leaf( - nested_node, [&](const auto&) { return true; }); - assert(leaf); - /// Recreate subcolumn with default values and the same sizes of arrays. - auto new_subcolumn = leaf->data.recreate_with_default_values(field_info); - /// It's possible that we have already inserted value from current row - /// to this subcolumn. So, adjust size to expected. - if (new_subcolumn.size() > new_size) { - new_subcolumn.pop_back(new_subcolumn.size() - new_size); - } - assert(new_subcolumn.size() == new_size); - inserted = subcolumns.add(key, new_subcolumn); - } else { - /// If node was not found just add subcolumn with empty arrays. 
- inserted = subcolumns.add(key, Subcolumn(new_size, is_nullable)); - } - if (!inserted) { - VLOG_DEBUG << "Subcolumn already exists"; - return false; - } - if (num_rows == 0) { - num_rows = new_size; - } - return true; -} - PathsInData ColumnObject::getKeys() const { PathsInData keys; keys.reserve(subcolumns.size()); @@ -845,28 +903,318 @@ bool ColumnObject::is_finalized() const { [](const auto& entry) { return entry->data.is_finalized(); }); } -void ColumnObject::finalize() { +static bool check_if_valid_column_name(const PathInData& path) { + static const std::regex COLUMN_NAME_REGEX("^[_a-zA-Z@0-9][.a-zA-Z0-9_+-/>is_column_array() && !result_column->is_nullable()) { + auto new_null_map = ColumnUInt8::create(); + new_null_map->reserve(result_column->size()); + auto& null_map_data = new_null_map->get_data(); + auto array = static_cast(result_column.get()); + for (size_t i = 0; i < array->size(); ++i) { + null_map_data.push_back(array->is_default_at(i)); + } + result_column = ColumnNullable::create(std::move(result_column), std::move(new_null_map)); + data_types[0] = make_nullable(data_types[0]); + least_common_type = LeastCommonType {data_types[0]}; + } +} + +rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const PathInData& path, + int idx = 0) { + if (idx >= path.get_parts().size()) { + return &json; + } + + std::string_view current_key = path.get_parts()[idx].key; + if (!json.IsObject()) { + return nullptr; + } + rapidjson::Value name(current_key.data(), current_key.size()); + auto it = json.FindMember(name); + if (it == json.MemberEnd()) { + return nullptr; + } + rapidjson::Value& current = it->value; + // if (idx == path.get_parts().size() - 1) { + // return &current; + // } + return find_leaf_node_by_path(current, path, idx + 1); +} + +void find_and_set_leave_value(const IColumn* column, const PathInData& path, + const DataTypeSerDeSPtr& type, rapidjson::Value& root, + rapidjson::Document::AllocatorType& allocator, int row) { + const auto* nullable = assert_cast(column); + if (nullable->is_null_at(row)) { + return; + } + // TODO could cache the result of leaf nodes with its path info + rapidjson::Value* target = find_leaf_node_by_path(root, path); + if (UNLIKELY(!target)) { + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + root.Accept(writer); + LOG(FATAL) << "could not find path " << path.get_path() + << ", root: " << std::string(buffer.GetString(), buffer.GetSize()); + } + type->write_one_cell_to_json(*column, *target, allocator, row); +} + +// compact null values +// e.g. {"a" : {"b" : {"n" : null}, "e" : null}, "c" : 10} +// after compact -> {"c" : 10} +void compact_null_values(rapidjson::Value& json, rapidjson::Document::AllocatorType& allocator) { + if (!json.IsObject() || json.IsNull()) { + return; + } + + rapidjson::Value::MemberIterator it = json.MemberBegin(); + while (it != json.MemberEnd()) { + rapidjson::Value& value = it->value; + if (value.IsNull()) { + it = json.EraseMember(it); + continue; + } + compact_null_values(value, allocator); + if (value.IsObject() && value.ObjectEmpty()) { + it = json.EraseMember(it); + continue; + } + ++it; + } +} + +// Construct rapidjson value from Subcolumns +void get_json_by_column_tree(rapidjson::Value& root, rapidjson::Document::AllocatorType& allocator, + const ColumnObject::Subcolumns::Node* node_root) { + if (node_root == nullptr || node_root->children.empty()) { + root.SetNull(); + return; + } + root.SetObject(); + for (auto it = node_root->children.begin(); it != 
node_root->children.end(); ++it) { + auto child = it->get_second(); + rapidjson::Value value(rapidjson::kObjectType); + get_json_by_column_tree(value, allocator, child.get()); + root.AddMember(rapidjson::StringRef(it->get_first().data, it->get_first().size), value, + allocator); + } +} + +bool ColumnObject::serialize_one_row_to_string(int row, std::string* output) const { + if (!is_finalized()) { + const_cast(this)->finalize(); + } + rapidjson::StringBuffer buf; + if (is_scalar_variant()) { + auto type = get_root_type(); + *output = type->to_string(*get_root(), row); + return true; + } + bool res = serialize_one_row_to_json_format(row, &buf, nullptr); + if (res) { + // TODO avoid copy + *output = std::string(buf.GetString(), buf.GetSize()); + } + return res; +} + +bool ColumnObject::serialize_one_row_to_string(int row, BufferWritable& output) const { + if (!is_finalized()) { + const_cast(this)->finalize(); + } + if (is_scalar_variant()) { + auto type = get_root_type(); + type->to_string(*get_root(), row, output); + return true; + } + rapidjson::StringBuffer buf; + bool res = serialize_one_row_to_json_format(row, &buf, nullptr); + if (res) { + output.write(buf.GetString(), buf.GetLength()); + } + return res; +} + +bool ColumnObject::serialize_one_row_to_json_format(int row, rapidjson::StringBuffer* output, + bool* is_null) const { + CHECK(is_finalized()); + if (subcolumns.empty()) { + if (is_null != nullptr) { + *is_null = true; + } else { + rapidjson::Value root(rapidjson::kNullType); + rapidjson::Writer writer(*output); + return root.Accept(writer); + } + return true; + } + CHECK(size() > row); + rapidjson::StringBuffer buffer; + rapidjson::Value root(rapidjson::kNullType); + if (doc_structure == nullptr) { + doc_structure = std::make_shared(); + rapidjson::Document::AllocatorType& allocator = doc_structure->GetAllocator(); + get_json_by_column_tree(*doc_structure, allocator, subcolumns.get_root()); + } + if (!doc_structure->IsNull()) { + root.CopyFrom(*doc_structure, doc_structure->GetAllocator()); + } +#ifndef NDEBUG + VLOG_DEBUG << "dump structure " << JsonFunctions::print_json_value(*doc_structure); +#endif + for (const auto& subcolumn : subcolumns) { + find_and_set_leave_value(subcolumn->data.get_finalized_column_ptr(), subcolumn->path, + subcolumn->data.get_least_common_type_serde(), root, + doc_structure->GetAllocator(), row); + } + compact_null_values(root, doc_structure->GetAllocator()); + if (root.IsNull() && is_null != nullptr) { + // Fast path + *is_null = true; + } else { + output->Clear(); + rapidjson::Writer writer(*output); + return root.Accept(writer); + } + return true; +} + +void ColumnObject::merge_sparse_to_root_column() { + CHECK(is_finalized()); + if (sparse_columns.empty()) { + return; + } + ColumnPtr src = subcolumns.get_mutable_root()->data.get_finalized_column_ptr(); + MutableColumnPtr mresult = src->clone_empty(); + const ColumnNullable* src_null = assert_cast(src.get()); + const ColumnString* src_column_ptr = + assert_cast(&src_null->get_nested_column()); + rapidjson::StringBuffer buffer; + doc_structure = std::make_shared(); + rapidjson::Document::AllocatorType& allocator = doc_structure->GetAllocator(); + get_json_by_column_tree(*doc_structure, allocator, sparse_columns.get_root()); + +#ifndef NDEBUG + VLOG_DEBUG << "dump structure " << JsonFunctions::print_json_value(*doc_structure); +#endif + + ColumnNullable* result_column_nullable = + assert_cast(mresult->assume_mutable().get()); + ColumnString* result_column_ptr = + 
assert_cast(&result_column_nullable->get_nested_column()); + result_column_nullable->reserve(num_rows); + // parse each row to jsonb + for (size_t i = 0; i < num_rows; ++i) { + // root is not null, store original value, e.g. the root is a scalar type like '[1]' + if (!src_null->empty() && !src_null->is_null_at(i)) { + result_column_ptr->insert_data(src_column_ptr->get_data_at(i).data, + src_column_ptr->get_data_at(i).size); + result_column_nullable->get_null_map_data().push_back(0); + continue; + } + + // parse and encode sparse columns + buffer.Clear(); + rapidjson::Value root(rapidjson::kNullType); + if (!doc_structure->IsNull()) { + root.CopyFrom(*doc_structure, doc_structure->GetAllocator()); + } + size_t null_count = 0; + for (const auto& subcolumn : sparse_columns) { + auto& column = subcolumn->data.get_finalized_column_ptr(); + if (assert_cast(*column).is_null_at(i)) { + ++null_count; + continue; + } + find_and_set_leave_value(column, subcolumn->path, + subcolumn->data.get_least_common_type_serde(), root, + doc_structure->GetAllocator(), i); + } + + // all null values, store null to sparse root + if (null_count == sparse_columns.size()) { + result_column_ptr->insert_default(); + result_column_nullable->get_null_map_data().push_back(1); + continue; + } + + // encode sparse columns into jsonb format + compact_null_values(root, doc_structure->GetAllocator()); + // parse as jsonb value and put back to the root node + // TODO, we could convert to jsonb directly from rapidjson::Value for better performance, instead of parsing + JsonbParser parser; + rapidjson::Writer writer(buffer); + root.Accept(writer); + bool res = parser.parse(buffer.GetString(), buffer.GetSize()); + CHECK(res) << "buffer:" << std::string(buffer.GetString(), buffer.GetSize()) + << ", row_num:" << i; + result_column_ptr->insert_data(parser.getWriter().getOutput()->getBuffer(), + parser.getWriter().getOutput()->getSize()); + result_column_nullable->get_null_map_data().push_back(0); + } + + // assign merged column + subcolumns.get_mutable_root()->data.get_finalized_column_ptr() = mresult->get_ptr(); +} + +void ColumnObject::finalize(bool ignore_sparse) { Subcolumns new_subcolumns; + // finalize root first + if (!ignore_sparse || !is_null_root()) { + new_subcolumns.create_root(subcolumns.get_root()->data); + new_subcolumns.get_mutable_root()->data.finalize(); + } for (auto&& entry : subcolumns) { const auto& least_common_type = entry->data.get_least_common_type(); - /// Do not add subcolumns, which consists only from NULLs. - if (is_nothing(getBaseTypeOfArray(least_common_type))) { + /// Do not add subcolumns which consist only of NULLs if (is_nothing(get_base_type_of_array(least_common_type))) { continue; } - if (!entry->data.data.empty()) { - entry->data.finalize(); - new_subcolumns.add(entry->path, entry->data); + entry->data.finalize(); + entry->data.wrapp_array_nullable(); + + if (entry->data.is_root) { + continue; } + + // Check and split sparse subcolumns + if (!ignore_sparse && (entry->data.check_if_sparse_column(num_rows) || + !check_if_valid_column_name(entry->path))) { + // TODO separate ambiguous paths + sparse_columns.add(entry->path, entry->data); + continue; + } + + new_subcolumns.add(entry->path, entry->data); } - /// If all subcolumns were skipped add a dummy subcolumn, /// because Tuple type must have at least one element. 
- // if (new_subcolumns.empty()) { - // new_subcolumns.add( - // PathInData {COLUMN_NAME_DUMMY}, - // Subcolumn {static_cast(ColumnUInt8::create(old_size, 0)), - // is_nullable}); - // } std::swap(subcolumns, new_subcolumns); + doc_structure = nullptr; +} + +void ColumnObject::finalize() { + finalize(true); +} + +void ColumnObject::ensure_root_node_type(const DataTypePtr& expected_root_type) { + auto& root = subcolumns.get_mutable_root()->data; + if (!root.get_least_common_type()->equals(*expected_root_type)) { + // make sure the root type is always as expected + ColumnPtr casted_column; + static_cast<void>( + schema_util::cast_column(ColumnWithTypeAndName {root.get_finalized_column_ptr(), + root.get_least_common_type(), ""}, + expected_root_type, &casted_column)); + root.data[0] = casted_column; + root.data_types[0] = expected_root_type; + root.least_common_type = Subcolumn::LeastCommonType {expected_root_type}; + } } bool ColumnObject::empty() const { @@ -888,33 +1236,77 @@ void ColumnObject::strip_outer_array() { new_subcolumns.add(entry->path, Subcolumn {base_column->assume_mutable(), is_nullable}); num_rows = base_column->size(); } - /// If all subcolumns were skipped add a dummy subcolumn, - /// because Tuple type must have at least one element. - // if (new_subcolumns.empty()) { - // new_subcolumns.add( - // PathInData {COLUMN_NAME_DUMMY}, - // Subcolumn {static_cast(ColumnUInt8::create(old_size, 0)), - // is_nullable}); - // } std::swap(subcolumns, new_subcolumns); } ColumnPtr ColumnObject::filter(const Filter& filter, ssize_t count) const { - DCHECK(is_finalized()); - auto new_column = ColumnObject::create(true); + if (!is_finalized()) { + const_cast<ColumnObject*>(this)->finalize(); + } + auto new_column = ColumnObject::create(true, false); for (auto& entry : subcolumns) { auto subcolumn = entry->data.get_finalized_column().filter(filter, count); - new_column->add_sub_column(entry->path, std::move(subcolumn)); + new_column->add_sub_column(entry->path, subcolumn->assume_mutable(), + entry->data.get_least_common_type()); } return new_column; } +Status ColumnObject::filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) { + if (!is_finalized()) { + finalize(); + } + auto* res = assert_cast<ColumnObject*>(col_ptr); + for (const auto& subcolumn : subcolumns) { + auto new_subcolumn = subcolumn->data.get_least_common_type()->create_column(); + RETURN_IF_ERROR(subcolumn->data.get_finalized_column().filter_by_selector( + sel, sel_size, new_subcolumn.get())); + res->add_sub_column(subcolumn->path, new_subcolumn->assume_mutable(), + subcolumn->data.get_least_common_type()); + } + return Status::OK(); +} + size_t ColumnObject::filter(const Filter& filter) { - DCHECK(is_finalized()); + if (!is_finalized()) { + finalize(); + } + size_t count = filter.size() - simd::count_zero_num((int8_t*)filter.data(), filter.size()); + if (count == 0) { + for_each_subcolumn([](auto& part) { part->clear(); }); + } else { + for_each_subcolumn([&](auto& part) { + if (part->size() != count) { + if (part->is_exclusive()) { + const auto result_size = part->filter(filter); + if (result_size != count) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "result_size not equal to filter_size, result_size={}, " + "filter_size={}", + result_size, count); + } + CHECK_EQ(result_size, count); + } else { + part = part->filter(filter, count); + } + } + }); + } + num_rows = count; +#ifndef NDEBUG + check_consistency(); +#endif + return count; +} + +void ColumnObject::clear() { for (auto& entry : subcolumns) { - num_rows = 
entry->data.get_finalized_column().filter(filter); + for (auto& part : entry->data.data) { + part->clear(); + } + entry->data.num_of_defaults_in_prefix = 0; } - return num_rows; + num_rows = 0; } void ColumnObject::revise_to(int target_num_rows) { @@ -926,10 +1318,85 @@ void ColumnObject::revise_to(int target_num_rows) { num_rows = target_num_rows; } +void ColumnObject::create_root() { + auto type = is_nullable ? make_nullable(std::make_shared<MostCommonType>()) + : std::make_shared<MostCommonType>(); + add_sub_column({}, type->create_column(), type); +} + +void ColumnObject::create_root(const DataTypePtr& type, MutableColumnPtr&& column) { + if (num_rows == 0) { + num_rows = column->size(); + } + add_sub_column({}, std::move(column), type); +} + +bool ColumnObject::is_null_root() const { + auto* root = subcolumns.get_root(); + if (root == nullptr) { + return true; + } + if (root->data.num_of_defaults_in_prefix == 0 && + (root->data.data.empty() || is_nothing(root->data.get_least_common_type()))) { + return true; + } + return false; +} + +bool ColumnObject::is_scalar_variant() const { + // Only root itself + return !is_null_root() && subcolumns.get_leaves().size() == 1; +} + +DataTypePtr ColumnObject::get_root_type() const { + return subcolumns.get_root()->data.get_least_common_type(); +} + +#define SANITIZE_ROOT() \ + if (is_null_root()) { \ + return Status::InternalError("No root column, path {}", path.get_path()); \ + } \ + if (!WhichDataType(remove_nullable(subcolumns.get_root()->data.get_least_common_type())) \ + .is_json()) { \ + return Status::InternalError( \ + "Root column is not jsonb type but {}, path {}", \ + subcolumns.get_root()->data.get_least_common_type()->get_name(), path.get_path()); \ + } + +Status ColumnObject::extract_root(const PathInData& path) { + SANITIZE_ROOT(); + if (!path.empty()) { + MutableColumnPtr extracted; + RETURN_IF_ERROR(schema_util::extract(subcolumns.get_root()->data.get_finalized_column_ptr(), + path, extracted)); + subcolumns.get_mutable_root()->data.data[0] = extracted->get_ptr(); + } + return Status::OK(); +} + +Status ColumnObject::extract_root(const PathInData& path, MutableColumnPtr& dst) const { + SANITIZE_ROOT(); + if (!path.empty()) { + RETURN_IF_ERROR(schema_util::extract(subcolumns.get_root()->data.get_finalized_column_ptr(), + path, dst)); + } else { + if (!dst) { + dst = subcolumns.get_root()->data.get_finalized_column_ptr()->clone_empty(); + dst->reserve(num_rows); + } + dst->insert_range_from(*subcolumns.get_root()->data.get_finalized_column_ptr(), 0, + num_rows); + } + return Status::OK(); +} + template <typename ColumnInserterFn> void align_variant_by_name_and_type(ColumnObject& dst, const ColumnObject& src, size_t row_cnt, ColumnInserterFn inserter) { - CHECK(dst.is_finalized() && src.is_finalized()); + CHECK(dst.is_finalized()); + if (!src.is_finalized()) { + const_cast<ColumnObject&>(src).finalize(); + } // Use rows() here instead of size(), since size() will check_consistency // but we could not check_consistency since num_rows will be upgraded even // if src and dst is empty, we just increase the num_rows of dst and fill @@ -940,6 +1407,10 @@ void align_variant_by_name_and_type(ColumnObject& dst, const ColumnObject& src, if (src_subcol == nullptr) { entry->data.get_finalized_column().insert_many_defaults(row_cnt); } else { + // This is the first time we align, so we should build the column part + if (entry->data.get_least_common_type()->get_type_id() == TypeIndex::Nothing) { + entry->data.add_new_column_part(src_subcol->get_least_common_type()); + } // TODO handle type conflict here, like ColumnObject before 
CHECK(entry->data.get_least_common_type()->equals( *src_subcol->get_least_common_type())); @@ -972,15 +1443,6 @@ void align_variant_by_name_and_type(ColumnObject& dst, const ColumnObject& src, #endif } -void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t length) { - // insert_range_from with alignment - const ColumnObject& src_column = *check_and_get_column<ColumnObject>(src); - align_variant_by_name_and_type(*this, src_column, length, - [start, length](const IColumn& src, IColumn* dst) { - dst->insert_range_from(src, start, length); - }); -} - void ColumnObject::append_data_by_selector(MutableColumnPtr& res, const IColumn::Selector& selector) const { // append by selector with alignment diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h index 6bff69b1e67ddc..cade1342b63afb 100644 --- a/be/src/vec/columns/column_object.h +++ b/be/src/vec/columns/column_object.h @@ -20,6 +20,8 @@ #pragma once #include +#include <rapidjson/document.h> +#include <rapidjson/stringbuffer.h> #include #include @@ -39,7 +41,10 @@ #include "vec/core/field.h" #include "vec/core/types.h" #include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_jsonb.h" #include "vec/data_types/data_type_nullable.h" +#include "vec/data_types/serde/data_type_serde.h" +#include "vec/io/reader_buffer.h" #include "vec/json/path_in_data.h" class SipHash; @@ -82,13 +87,17 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> { * After insertion of all values subcolumn should be finalized * for writing and other operations. */ + + // Use jsonb as the most common type, since it can represent all kinds of json values + using MostCommonType = DataTypeJsonb; class Subcolumn { public: Subcolumn() = default; - Subcolumn(size_t size_, bool is_nullable_); + Subcolumn(size_t size_, bool is_nullable_, bool is_root = false); - Subcolumn(MutableColumnPtr&& data_, bool is_nullable_); + Subcolumn(MutableColumnPtr&& data_, DataTypePtr type, bool is_nullable_, + bool is_root = false); size_t size() const; @@ -100,7 +109,13 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> { const DataTypePtr& get_least_common_type() const { return least_common_type.get(); } - const DataTypePtr& get_least_common_typeBase() const { return least_common_type.getBase(); } + const DataTypePtr& get_least_common_typeBase() const { + return least_common_type.get_base(); + } + + const DataTypeSerDeSPtr& get_least_common_type_serde() const { + return least_common_type.get_serde(); + } size_t get_dimensions() const { return least_common_type.get_dimensions(); } @@ -126,12 +141,12 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> { /// creates a single column that stores all values. void finalize(); + bool check_if_sparse_column(size_t num_rows); + /// Returns last inserted field. Field get_last_field() const; - /// Recreates subcolumn with default scalar values and keeps sizes of arrays. - /// Used to create columns of type Nested with consistent array sizes. - Subcolumn recreate_with_default_values(const FieldInfo& field_info) const; + FieldInfo get_subcolumn_field_info() const; /// Returns single column if subcolumn is finalized. /// Otherwise -- undefined behaviour. 
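A minimal illustrative sketch (not part of this patch) of how the row-serialization API introduced above might be driven from calling code; the dump_variant helper and the glog output are hypothetical assumptions: void dump_variant(const doris::vectorized::ColumnObject& col) { std::string json; for (size_t i = 0; i < col.rows(); ++i) { /* serialize_one_row_to_string finalizes lazily inside and takes the scalar-root fast path when is_scalar_variant() is true */ if (col.serialize_one_row_to_string(static_cast<int>(i), &json)) { LOG(INFO) << "row " << i << ": " << json; } } }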
@@ -141,8 +156,12 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> { const ColumnPtr& get_finalized_column_ptr() const; + ColumnPtr& get_finalized_column_ptr(); + void remove_nullable(); + void add_new_column_part(DataTypePtr type); + friend class ColumnObject; private: @@ -154,18 +173,22 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> { const DataTypePtr& get() const { return type; } - const DataTypePtr& getBase() const { return base_type; } + const DataTypePtr& get_base() const { return base_type; } size_t get_dimensions() const { return num_dimensions; } void remove_nullable() { type = doris::vectorized::remove_nullable(type); } + const DataTypeSerDeSPtr& get_serde() const { return least_common_type_serder; } + private: DataTypePtr type; DataTypePtr base_type; size_t num_dimensions = 0; + DataTypeSerDeSPtr least_common_type_serder; }; - void add_new_column_part(DataTypePtr type); + + void wrapp_array_nullable(); /// Current least common type of all values inserted to this subcolumn. LeastCommonType least_common_type; @@ -176,10 +199,14 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> { /// That means that the least common type for i-th prefix is the type of i-th part /// and it's the supertype for all type of column from 0 to i-1. std::vector<WrappedPtr> data; + std::vector<DataTypePtr> data_types; /// Until we insert any non-default field we don't know further /// least common type and we count number of defaults in prefix, /// which will be converted to the default type of final common type. size_t num_of_defaults_in_prefix = 0; + // If it is the root subcolumn of the SubcolumnsTree, + // the root Node should be JSONB type when finalized + bool is_root = false; }; using Subcolumns = SubcolumnsTree<Subcolumn>; @@ -188,23 +215,66 @@ private: const bool is_nullable; Subcolumns subcolumns; size_t num_rows; + // sparse columns will be merged and encoded into the root column + Subcolumns sparse_columns; + // The rapidjson document format of the Subcolumns tree structure; + // the leaves are null. In order to display the whole document, copy + // this structure and fill it with Subcolumns sub-items + mutable std::shared_ptr<rapidjson::Document> doc_structure; public: static constexpr auto COLUMN_NAME_DUMMY = "_dummy"; - explicit ColumnObject(bool is_nullable_); + explicit ColumnObject(bool is_nullable_, bool create_root = true); ColumnObject(Subcolumns&& subcolumns_, bool is_nullable_); ~ColumnObject() override = default; - bool can_be_inside_nullable() const override { return true; } - /// Checks that all subcolumns have consistent sizes. 
void check_consistency() const; + MutableColumnPtr get_root() { + if (subcolumns.empty() || is_nothing(subcolumns.get_root()->data.get_least_common_type())) { + return nullptr; + } + return subcolumns.get_mutable_root()->data.get_finalized_column_ptr()->assume_mutable(); + } + + bool serialize_one_row_to_string(int row, std::string* output) const; + + bool serialize_one_row_to_string(int row, BufferWritable& output) const; + + // serialize one row to json format + bool serialize_one_row_to_json_format(int row, rapidjson::StringBuffer* output, + bool* is_null) const; + + // merge multiple sub sparse columns into root + void merge_sparse_to_root_column(); + + // ensure root node is a certain type + void ensure_root_node_type(const DataTypePtr& type); + + // create jsonb root if missing + void create_root(); + + // create root with type and column if missing + void create_root(const DataTypePtr& type, MutableColumnPtr&& column); + + // root is null or type nothing + bool is_null_root() const; + + // Only single scalar root column + bool is_scalar_variant() const; + + ColumnPtr get_root() const { return subcolumns.get_root()->data.get_finalized_column_ptr(); } + bool has_subcolumn(const PathInData& key) const; + DataTypePtr get_root_type() const; + + bool is_variant() const override { return true; } + // return null if not found const Subcolumn* get_subcolumn(const PathInData& key) const; @@ -226,15 +296,11 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> { size_t rows() const { return num_rows; } /// Adds a subcolumn from existing IColumn. - bool add_sub_column(const PathInData& key, MutableColumnPtr&& subcolumn); + bool add_sub_column(const PathInData& key, MutableColumnPtr&& subcolumn, DataTypePtr type); /// Adds a subcolumn of specific size with default values. bool add_sub_column(const PathInData& key, size_t new_size); - /// Adds a subcolumn of type Nested of specific size with default values. - /// It cares about consistency of sizes of Nested arrays. - bool add_nested_subcolumn(const PathInData& key, const FieldInfo& field_info, size_t new_size); - const Subcolumns& get_subcolumns() const { return subcolumns; } Subcolumns& get_subcolumns() { return subcolumns; } @@ -258,11 +324,21 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> { void remove_subcolumns(const std::unordered_set<std::string>& keys); + void finalize(bool ignore_sparse); + /// Finalizes all subcolumns. 
- void finalize(); + void finalize() override; bool is_finalized() const; + MutableColumnPtr clone_finalized() const { + auto finalized = IColumn::mutate(get_ptr()); + static_cast<ColumnObject*>(finalized.get())->finalize(); + return finalized; + } + + void clear() override; + /// Part of interface const char* get_family_name() const override { return "Variant"; } @@ -279,8 +355,6 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> { // Do nothing, call try_insert instead void insert(const Field& field) override { try_insert(field); } - void insert_range_from(const IColumn& src, size_t start, size_t length) override; - void append_data_by_selector(MutableColumnPtr& res, const IColumn::Selector& selector) const override; @@ -290,18 +364,16 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> { // May throw exception void try_insert(const Field& field); - void try_insert_from(const IColumn& src, size_t n); + void insert_from(const IColumn& src, size_t n) override; - void try_insert_range_from(const IColumn& src, size_t start, size_t length); + void insert_range_from(const IColumn& src, size_t start, size_t length) override; void insert_default() override; // Revise this column to specified num_rows void revise_to(int num_rows); - [[noreturn]] ColumnPtr replicate(const Offsets& offsets) const override { - LOG(FATAL) << "should not call the method replicate in column object"; - } + ColumnPtr replicate(const Offsets& offsets) const override; void pop_back(size_t length) override; @@ -339,12 +411,11 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> { ColumnPtr filter(const Filter&, ssize_t) const override; + Status filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) override; + size_t filter(const Filter&) override; - ColumnPtr permute(const Permutation&, size_t) const override { - LOG(FATAL) << "should not call the method in column object"; - return nullptr; - } + ColumnPtr permute(const Permutation&, size_t) const override; int compare_at(size_t n, size_t m, const IColumn& rhs, int nan_direction_hint) const override { LOG(FATAL) << "should not call the method in column object"; @@ -378,10 +449,17 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> { } template <typename Func> - ColumnPtr apply_for_subcolumns(Func&& func, std::string_view func_name) const; + MutableColumnPtr apply_for_subcolumns(Func&& func) const; ColumnPtr index(const IColumn& indexes, size_t limit) const override; + // Extract path from root column and replace root with new extracted column, + // root must be jsonb type + Status extract_root(const PathInData& path); + + // Extract path from root column and output to dst + Status extract_root(const PathInData& path, MutableColumnPtr& dst) const; + void strip_outer_array(); bool empty() const; diff --git a/be/src/vec/columns/column_set.h b/be/src/vec/columns/column_set.h new file mode 100644 index 00000000000000..3b330f82284838 --- /dev/null +++ b/be/src/vec/columns/column_set.h @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Columns/ColumnSet.h +// and modified by Doris + +#pragma once + +#include "exprs/hybrid_set.h" +#include "vec/columns/column_dummy.h" + +namespace doris::vectorized { + +using ConstSetPtr = std::shared_ptr<HybridSetBase>; + +/** A column containing multiple values in the `IN` section. + * Behaves like a constant column (because there is a single shared set, not one per row). + * This column has a nonstandard value, so it cannot be obtained via a normal interface. + */ +class ColumnSet final : public COWHelper<IColumnDummy, ColumnSet> { +public: + friend class COWHelper<IColumnDummy, ColumnSet>; + + ColumnSet(size_t s_, const ConstSetPtr& data_) : data(data_) { s = s_; } + ColumnSet(const ColumnSet&) = default; + + const char* get_family_name() const override { return "Set"; } + MutableColumnPtr clone_dummy(size_t s_) const override { return ColumnSet::create(s_, data); } + + ConstSetPtr get_data() const { return data; } + +private: + ConstSetPtr data; +}; + +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 5d5abd64349ecb..d6a3a514994067 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -207,6 +207,19 @@ size_t ColumnString::filter(const Filter& filter) { return filter_arrays_impl(chars, offsets, filter); } +Status ColumnString::filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) { + auto* col = static_cast<ColumnString*>(col_ptr); + Chars& res_chars = col->chars; + Offsets& res_offsets = col->offsets; + Filter filter; + filter.resize_fill(offsets.size(), 0); + for (size_t i = 0; i < sel_size; i++) { + filter[sel[i]] = 1; + } + filter_arrays_impl(chars, offsets, res_chars, res_offsets, filter, sel_size); + return Status::OK(); +} + ColumnPtr ColumnString::permute(const Permutation& perm, size_t limit) const { size_t size = offsets.size(); diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index ae2bb9d25f98f1..e3108170203b4a 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -29,7 +29,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/exception.h" #include "common/status.h" @@ -129,6 +128,11 @@ class ColumnString final : public COWHelper<IColumn, ColumnString> { void get(size_t n, Field& res) const override { assert(n < size()); + if (res.get_type() == Field::Types::JSONB) { + // Handle JsonbField + res = JsonbField(reinterpret_cast<const char*>(&chars[offset_at(n)]), size_at(n)); + return; + } res.assign_string(&chars[offset_at(n)], size_at(n)); } @@ -138,15 +142,23 @@ } void insert(const Field& x) override { - const String& s = vectorized::get<const String&>(x); + StringRef s; + if (x.get_type() == Field::Types::JSONB) { + // Handle JsonbField + const auto& real_field = vectorized::get<const JsonbField&>(x); + s = StringRef(real_field.get_value(), real_field.get_size()); + } else { + s.data = vectorized::get<const String&>(x).data(); + s.size = vectorized::get<const String&>(x).size(); + } const 
size_t old_size = chars.size(); - const size_t size_to_append = s.size(); + const size_t size_to_append = s.size; const size_t new_size = old_size + size_to_append; check_chars_length(new_size, old_size + 1); chars.resize(new_size); - memcpy(chars.data() + old_size, s.c_str(), size_to_append); + memcpy(chars.data() + old_size, s.data, size_to_append); offsets.push_back(new_size); } @@ -478,6 +490,8 @@ class ColumnString final : public COWHelper { ColumnPtr filter(const Filter& filt, ssize_t result_size_hint) const override; size_t filter(const Filter& filter) override; + Status filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) override; + ColumnPtr permute(const Permutation& perm, size_t limit) const override; void sort_column(const ColumnSorter* sorter, EqualFlags& flags, IColumn::Permutation& perms, @@ -523,8 +537,6 @@ class ColumnString final : public COWHelper { void resize(size_t n) override; - bool can_be_inside_nullable() const override { return true; } - bool is_column_string() const override { return true; } bool structure_equals(const IColumn& rhs) const override { @@ -589,6 +601,10 @@ class ColumnString final : public COWHelper { } ColumnPtr index(const IColumn& indexes, size_t limit) const override; + + double get_ratio_of_default_rows(double sample_ratio) const override { + return get_ratio_of_default_rows_impl(sample_ratio); + } }; } // namespace doris::vectorized diff --git a/be/src/vec/columns/column_struct.h b/be/src/vec/columns/column_struct.h index 36854824198dd2..919b971b5ac5fa 100644 --- a/be/src/vec/columns/column_struct.h +++ b/be/src/vec/columns/column_struct.h @@ -84,7 +84,6 @@ class ColumnStruct final : public COWHelper { std::string get_name() const override; bool is_column_struct() const override { return true; } const char* get_family_name() const override { return "Struct"; } - bool can_be_inside_nullable() const override { return true; } MutableColumnPtr clone_empty() const override; MutableColumnPtr clone_resized(size_t size) const override; size_t size() const override { return columns.at(0)->size(); } diff --git a/be/src/vec/columns/column_vector.h b/be/src/vec/columns/column_vector.h index 85d64740d19161..ffb91494c26c10 100644 --- a/be/src/vec/columns/column_vector.h +++ b/be/src/vec/columns/column_vector.h @@ -34,7 +34,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "gutil/integral_types.h" @@ -412,6 +411,10 @@ class ColumnVector final : public COWHelper> template ColumnPtr index_impl(const PaddedPODArray& indexes, size_t limit) const; + double get_ratio_of_default_rows(double sample_ratio) const override { + return this->template get_ratio_of_default_rows_impl(sample_ratio); + } + ColumnPtr replicate(const IColumn::Offsets& offsets) const override; void replicate(const uint32_t* indexs, size_t target_size, IColumn& column) const override; @@ -428,8 +431,6 @@ class ColumnVector final : public COWHelper> // void gather(ColumnGathererStream & gatherer_stream) override; - bool can_be_inside_nullable() const override { return true; } - bool is_fixed_and_contiguous() const override { return true; } size_t size_of_value_if_fixed() const override { return sizeof(T); } StringRef get_raw_data() const override { diff --git a/be/src/vec/columns/predicate_column.h b/be/src/vec/columns/predicate_column.h index 3aec3b1540f229..87d1993e2747e2 100644 --- a/be/src/vec/columns/predicate_column.h +++ b/be/src/vec/columns/predicate_column.h @@ -137,8 +137,6 
@@ class PredicateColumnType final : public COWHelper::get(); } - [[noreturn]] MutableColumnPtr clone_resized(size_t size) const override { - LOG(FATAL) << "clone_resized not supported in PredicateColumnType"; + MutableColumnPtr clone_resized(size_t size) const override { + DCHECK(size == 0); + return this->create(); } void insert(const Field& x) override { @@ -401,8 +400,6 @@ class PredicateColumnType final : public COWHelper class SubcolumnsTree { public: struct Node { - enum Kind { - TUPLE, - NESTED, - SCALAR, - }; + enum Kind { TUPLE, NESTED, SCALAR }; explicit Node(Kind kind_) : kind(kind_) {} Node(Kind kind_, const NodeData& data_) : kind(kind_), data(data_) {} Node(Kind kind_, const NodeData& data_, const PathInData& path_) : kind(kind_), data(data_), path(path_) {} + Node(Kind kind_, NodeData&& data_) : kind(kind_), data(std::move(data_)) {} + Node(Kind kind_, NodeData&& data_, const PathInData& path_) + : kind(kind_), data(std::move(data_)), path(path_) {} Kind kind = TUPLE; const Node* parent = nullptr; @@ -53,6 +52,20 @@ class SubcolumnsTree { bool is_nested() const { return kind == NESTED; } bool is_scalar() const { return kind == SCALAR; } + bool is_scalar_without_children() const { return kind == SCALAR && children.empty(); } + + // Modify data, kind and path + void modify(std::shared_ptr<Node>&& other) { + data = std::move(other->data); + kind = other->kind; + path = other->path; + } + + // Modify data and kind only + void modify_to_scalar(NodeData&& other_data) { + data = std::move(other_data); + kind = Kind::SCALAR; + } void add_child(std::string_view key, std::shared_ptr<Node> next_node) { next_node->parent = this; @@ -79,10 +92,30 @@ class SubcolumnsTree { }); } + bool add(const PathInData& path, NodeData&& leaf_data) { + return add(path, [&](NodeKind kind, bool exists) -> NodePtr { + if (exists) { + return nullptr; + } + + if (kind == Node::SCALAR) { + return std::make_shared<Node>(kind, std::move(leaf_data), path); + } + + return std::make_shared<Node>(kind); + }); + } + /// Callback for creation of node. Receives kind of node and /// flag, which is true if node already exists. 
using NodeCreator = std::function<NodePtr(NodeKind, bool)>; + // create root as SCALAR node + void create_root(const NodeData& leaf_data) { + root = std::make_shared<Node>(Node::SCALAR, leaf_data); + leaves.push_back(root); + } + bool add(const PathInData& path, const NodeCreator& node_creator) { const auto& parts = path.get_parts(); @@ -96,8 +129,6 @@ class SubcolumnsTree { Node* current_node = root.get(); for (size_t i = 0; i < parts.size() - 1; ++i) { - // assert(current_node->kind != Node::SCALAR); - auto it = current_node->children.find( StringRef {parts[i].key.data(), parts[i].key.size()}); if (it != current_node->children.end()) { @@ -118,7 +149,11 @@ class SubcolumnsTree { auto it = current_node->children.find( StringRef {parts.back().key.data(), parts.back().key.size()}); if (it != current_node->children.end()) { - return false; + // Modify this node to Node::SCALAR + auto new_node = node_creator(Node::SCALAR, false); + it->get_second()->modify(std::move(new_node)); + leaves.push_back(it->get_second()); + return true; } auto next_node = node_creator(Node::SCALAR, false); @@ -183,6 +218,19 @@ class SubcolumnsTree { const Nodes& get_leaves() const { return leaves; } const Node* get_root() const { return root.get(); } + Node* get_mutable_root() { return root.get(); } + + static void get_leaves_of_node(const Node* node, std::vector<const Node*>& nodes, + vectorized::PathsInData& paths) { + if (node->is_scalar()) { + nodes.push_back(node); + paths.push_back(node->path); + } + for (auto it = node->children.begin(); it != node->children.end(); ++it) { + auto child = it->get_second(); + get_leaves_of_node(child.get(), nodes, paths); + } + } using iterator = typename Nodes::iterator; using const_iterator = typename Nodes::const_iterator; diff --git a/be/src/vec/common/allocator.cpp b/be/src/vec/common/allocator.cpp index b4025807be6ed3..3d6d2a7a11e623 100644 --- a/be/src/vec/common/allocator.cpp +++ b/be/src/vec/common/allocator.cpp @@ -38,33 +38,45 @@ template void Allocator::sys_memory_check(size_t size) const { - if (doris::thread_context()->skip_memory_check) return; + if (doris::is_thread_context_init() && doris::thread_context()->skip_memory_check != 0) { + return; + } if (doris::MemTrackerLimiter::sys_mem_exceed_limit_check(size)) { // Only threads attached to a query, which have not yet waited the full thread_wait_gc_max_milliseconds, // will wait for gc, asynchronously cancel, or throw bad::alloc. // Otherwise, if the exception is caught externally, directly throw bad::alloc. 
- auto err_msg = fmt::format( - "Allocator sys memory check failed: Cannot alloc:{}, consuming " - "tracker:<{}>, peak used {}, current used {}, exec node:<{}>, {}.", - size, doris::thread_context()->thread_mem_tracker()->label(), - doris::thread_context()->thread_mem_tracker()->peak_consumption(), - doris::thread_context()->thread_mem_tracker()->consumption(), - doris::thread_context()->thread_mem_tracker_mgr->last_consumer_tracker(), - doris::MemTrackerLimiter::process_limit_exceeded_errmsg_str()); + std::string err_msg; + if (doris::is_thread_context_init()) { + err_msg += fmt::format( + "Allocator sys memory check failed: Cannot alloc:{}, consuming " + "tracker:<{}>, peak used {}, current used {}, exec node:<{}>, {}.", + size, doris::thread_context()->thread_mem_tracker()->label(), + doris::thread_context()->thread_mem_tracker()->peak_consumption(), + doris::thread_context()->thread_mem_tracker()->consumption(), + doris::thread_context()->thread_mem_tracker_mgr->last_consumer_tracker(), + doris::MemTrackerLimiter::process_limit_exceeded_errmsg_str()); + } else { + err_msg += fmt::format( + "Allocator sys memory check failed: Cannot alloc:{}, consuming " + "tracker:<{}>, {}.", + size, "Orphan", doris::MemTrackerLimiter::process_limit_exceeded_errmsg_str()); + } + if (size > 1024l * 1024 * 1024 && !doris::enable_thread_catch_bad_alloc && !doris::config::disable_memory_gc) { // 1G err_msg += "\nAlloc Stacktrace:\n" + doris::get_stack_trace(); } // TODO, Save the query context in the thread context, instead of finding whether the query id is canceled in fragment_mgr. - if (doris::ExecEnv::GetInstance()->fragment_mgr()->query_is_canceled( + if (doris::is_thread_context_init() && + doris::ExecEnv::GetInstance()->fragment_mgr()->query_is_canceled( doris::thread_context()->task_id())) { if (doris::enable_thread_catch_bad_alloc) { throw doris::Exception(doris::ErrorCode::MEM_ALLOC_FAILED, err_msg); } return; } - if (!doris::config::disable_memory_gc && + if (doris::is_thread_context_init() && !doris::config::disable_memory_gc && doris::thread_context()->thread_mem_tracker_mgr->is_attach_query() && doris::thread_context()->thread_mem_tracker_mgr->wait_gc()) { int64_t wait_milliseconds = 0; @@ -120,7 +132,12 @@ void Allocator::sys_memory_check(size_t template void Allocator::memory_tracker_check(size_t size) const { - if (doris::thread_context()->skip_memory_check) return; + if (doris::is_thread_context_init() && doris::thread_context()->skip_memory_check != 0) { + return; + } + if (!doris::is_thread_context_init()) { + return; + } auto st = doris::thread_context()->thread_mem_tracker()->check_limit(size); if (!st) { auto err_msg = fmt::format("Allocator mem tracker check failed, {}", st.to_string()); diff --git a/be/src/vec/common/allocator.h b/be/src/vec/common/allocator.h index 24fef16290bc99..8162c94fca283a 100644 --- a/be/src/vec/common/allocator.h +++ b/be/src/vec/common/allocator.h @@ -47,7 +47,6 @@ #include #include -// IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #ifdef THREAD_SANITIZER /// Thread sanitizer does not intercept mremap. The usage of mremap will lead to false positives. @@ -98,6 +97,7 @@ class Allocator { /// Allocate memory range. 
void* alloc_impl(size_t size, size_t alignment = 0) { memory_check(size); + consume_memory(size); void* buf; if (use_mmap && size >= doris::config::mmap_threshold) { @@ -107,7 +107,6 @@ class Allocator { "Too large alignment {}: more than page size when allocating {}.", alignment, size); - consume_memory(size); buf = mmap(nullptr, size, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); if (MAP_FAILED == buf) { release_memory(size); @@ -123,6 +122,7 @@ class Allocator { buf = ::malloc(size); if (nullptr == buf) { + release_memory(size); throw_bad_alloc(fmt::format("Allocator: Cannot malloc {}.", size)); } } else { @@ -130,6 +130,7 @@ class Allocator { int res = posix_memalign(&buf, alignment, size); if (0 != res) { + release_memory(size); throw_bad_alloc( fmt::format("Cannot allocate memory (posix_memalign) {}.", size)); } @@ -146,12 +147,11 @@ class Allocator { DCHECK(size != -1); if (0 != munmap(buf, size)) { throw_bad_alloc(fmt::format("Allocator: Cannot munmap {}.", size)); - } else { - release_memory(size); } } else { ::free(buf); } + release_memory(size); } /** Enlarge memory range. @@ -162,13 +162,18 @@ class Allocator { if (old_size == new_size) { /// nothing to do. /// BTW, it's not possible to change alignment while doing realloc. - } else if (!use_mmap || (old_size < doris::config::mmap_threshold && - new_size < doris::config::mmap_threshold && - alignment <= MALLOC_MIN_ALIGNMENT)) { - memory_check(new_size); + return buf; + } + memory_check(new_size); + consume_memory(new_size - old_size); + + if (!use_mmap || + (old_size < doris::config::mmap_threshold && new_size < doris::config::mmap_threshold && + alignment <= MALLOC_MIN_ALIGNMENT)) { /// Resize malloc'd memory region with no special alignment requirement. void* new_buf = ::realloc(buf, new_size); if (nullptr == new_buf) { + release_memory(new_size - old_size); throw_bad_alloc(fmt::format("Allocator: Cannot realloc from {} to {}.", old_size, new_size)); } @@ -179,9 +184,7 @@ class Allocator { memset(reinterpret_cast(buf) + old_size, 0, new_size - old_size); } else if (old_size >= doris::config::mmap_threshold && new_size >= doris::config::mmap_threshold) { - memory_check(new_size); /// Resize mmap'd memory region. - consume_memory(new_size - old_size); // On apple and freebsd self-implemented mremap used (common/mremap.h) buf = clickhouse_mremap(buf, old_size, new_size, MREMAP_MAYMOVE, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); @@ -200,7 +203,6 @@ class Allocator { memset(reinterpret_cast(buf) + old_size, 0, new_size - old_size); } } else { - memory_check(new_size); // Big allocs that requires a copy. 
void* new_buf = alloc(new_size, alignment); memcpy(new_buf, buf, std::min(old_size, new_size)); diff --git a/be/src/vec/common/cow.h b/be/src/vec/common/cow.h index d3ab0ebd681af2..5cd701e6badd45 100644 --- a/be/src/vec/common/cow.h +++ b/be/src/vec/common/cow.h @@ -410,4 +410,4 @@ class COWHelper : public Base { MutablePtr shallow_mutate() const { return MutablePtr(static_cast<Derived*>(Base::shallow_mutate().get())); } -}; +}; \ No newline at end of file diff --git a/be/src/vec/common/field_visitors.h b/be/src/vec/common/field_visitors.h index 8434483b7721ea..2066a94960a3d5 100644 --- a/be/src/vec/common/field_visitors.h +++ b/be/src/vec/common/field_visitors.h @@ -65,6 +65,8 @@ typename std::decay_t<Visitor>::ResultType apply_visitor(Visitor&& visitor, F&& return visitor(field.template get>()); case Field::Types::Decimal256: return visitor(field.template get<DecimalField<Decimal256>>()); + case Field::Types::JSONB: + return visitor(field.template get<JsonbField>()); default: LOG(FATAL) << "Bad type of Field"; return {}; diff --git a/be/src/vec/common/hash_table/hash_map_context.h b/be/src/vec/common/hash_table/hash_map_context.h index 0464380b18d43b..35df772b16dbca 100644 --- a/be/src/vec/common/hash_table/hash_map_context.h +++ b/be/src/vec/common/hash_table/hash_map_context.h @@ -20,6 +20,7 @@ #include #include +#include "common/compiler_util.h" #include "runtime/descriptors.h" #include "util/stack_util.h" #include "vec/columns/column_nullable.h" @@ -85,6 +86,7 @@ struct MethodBase { hash_values[k] = hash_table->hash(keys[k]); } } + void init_hash_values(size_t num_rows) { hash_values.resize(num_rows); for (size_t k = 0; k < num_rows; ++k) { @@ -93,21 +95,22 @@ struct MethodBase { } template <bool read> - void prefetch(int current) { - if (LIKELY(current + HASH_MAP_PREFETCH_DIST < hash_values.size())) { - hash_table->template prefetch<read>(keys[current + HASH_MAP_PREFETCH_DIST], - hash_values[current + HASH_MAP_PREFETCH_DIST]); + ALWAYS_INLINE void prefetch(size_t i) { + if (LIKELY(i + HASH_MAP_PREFETCH_DIST < hash_values.size())) { + hash_table->template prefetch<read>(keys[i + HASH_MAP_PREFETCH_DIST], + hash_values[i + HASH_MAP_PREFETCH_DIST]); } } template <typename State> - auto find(State& state, size_t i) { + ALWAYS_INLINE auto find(State& state, size_t i) { prefetch(i); return state.find_key_with_hash(*hash_table, hash_values[i], keys[i]); } template <typename State, typename F, typename FF> - auto& lazy_emplace(State& state, size_t i, F&& creator, FF&& creator_for_null_key) { + ALWAYS_INLINE auto& lazy_emplace(State& state, size_t i, F&& creator, + FF&& creator_for_null_key) { prefetch(i); return state.lazy_emplace_key(*hash_table, i, keys[i], hash_values[i], creator, creator_for_null_key); diff --git a/be/src/vec/common/hash_table/partitioned_hash_map.h b/be/src/vec/common/hash_table/partitioned_hash_map.h index 89958608c7337b..f23b0a347de111 100644 --- a/be/src/vec/common/hash_table/partitioned_hash_map.h +++ b/be/src/vec/common/hash_table/partitioned_hash_map.h @@ -55,18 +55,5 @@ template > using PartitionedHashMap = PartitionedHashMapTable>>; -template > -using PHPartitionedHashMap = PartitionedHashMapTable>; - template > using PHNormalHashMap = PHHashMap; - -template -struct HashTableTraits> { - static constexpr bool is_phmap = true; -}; - -template