From 6473d8b3e27f9f41a4152d7632bbae548bcc6e96 Mon Sep 17 00:00:00 2001 From: Yang Xiufeng Date: Fri, 27 Sep 2024 16:46:52 +0800 Subject: [PATCH 01/14] feat: http handler add X-DATABEND-VERSION in each response. (#16518) * refactor: split middleware.rs. * small refactors. * not check type of databend_token when in management_mode. * http handler add X-DATABEND-VERSION in each response. * small refactor. * add license. * update tests. * test X-DATABEND-VERSION. * test X-DATABEND-VERSION. --- src/common/base/src/headers.rs | 1 + .../src/servers/http/middleware/metrics.rs | 70 +++++++++++ .../src/servers/http/middleware/mod.rs | 26 ++++ .../servers/http/middleware/panic_handler.rs | 36 ++++++ .../{middleware.rs => middleware/session.rs} | 116 +++++------------- .../servers/http/v1/http_query_handlers.rs | 2 +- src/query/service/src/servers/http/v1/mod.rs | 3 +- .../service/src/servers/http/v1/query/mod.rs | 1 + .../src/servers/http/v1/query/page_manager.rs | 4 +- .../http/v1/{ => query}/string_block.rs | 4 - .../it/servers/http/http_query_handlers.rs | 17 +-- .../tests/it/servers/http/json_block.rs | 2 +- 12 files changed, 179 insertions(+), 103 deletions(-) create mode 100644 src/query/service/src/servers/http/middleware/metrics.rs create mode 100644 src/query/service/src/servers/http/middleware/mod.rs create mode 100644 src/query/service/src/servers/http/middleware/panic_handler.rs rename src/query/service/src/servers/http/{middleware.rs => middleware/session.rs} (85%) rename src/query/service/src/servers/http/v1/{ => query}/string_block.rs (98%) diff --git a/src/common/base/src/headers.rs b/src/common/base/src/headers.rs index db18c883948c..823e28e9e0a1 100644 --- a/src/common/base/src/headers.rs +++ b/src/common/base/src/headers.rs @@ -25,6 +25,7 @@ pub const HEADER_NODE_ID: &str = "X-DATABEND-NODE-ID"; pub const HEADER_QUERY_STATE: &str = "X-DATABEND-QUERY-STATE"; pub const HEADER_QUERY_PAGE_ROWS: &str = "X-DATABEND-QUERY-PAGE-ROWS"; +pub const HEADER_VERSION: &str = "X-DATABEND-VERSION"; pub const HEADER_SIGNATURE: &str = "X-DATABEND-SIGNATURE"; pub const HEADER_AUTH_METHOD: &str = "X-DATABEND-AUTH-METHOD"; diff --git a/src/query/service/src/servers/http/middleware/metrics.rs b/src/query/service/src/servers/http/middleware/metrics.rs new file mode 100644 index 000000000000..60afddc44f06 --- /dev/null +++ b/src/query/service/src/servers/http/middleware/metrics.rs @@ -0,0 +1,70 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+use std::time::Instant;
+
+use databend_common_metrics::http::metrics_incr_http_request_count;
+use databend_common_metrics::http::metrics_incr_http_slow_request_count;
+use databend_common_metrics::http::metrics_observe_http_response_duration;
+use poem::Endpoint;
+use poem::IntoResponse;
+use poem::Middleware;
+use poem::Request;
+use poem::Response;
+
+pub struct MetricsMiddleware {
+    api: String,
+}
+
+impl MetricsMiddleware {
+    pub fn new(api: impl Into<String>) -> Self {
+        Self { api: api.into() }
+    }
+}
+
+impl<E: Endpoint> Middleware<E> for MetricsMiddleware {
+    type Output = MetricsMiddlewareEndpoint<E>;
+
+    fn transform(&self, ep: E) -> Self::Output {
+        MetricsMiddlewareEndpoint {
+            ep,
+            api: self.api.clone(),
+        }
+    }
+}
+
+pub struct MetricsMiddlewareEndpoint<E> {
+    api: String,
+    ep: E,
+}
+
+impl<E: Endpoint> Endpoint for MetricsMiddlewareEndpoint<E> {
+    type Output = Response;
+
+    async fn call(&self, req: Request) -> poem::error::Result<Self::Output> {
+        let start_time = Instant::now();
+        let method = req.method().to_string();
+        let output = self.ep.call(req).await?;
+        let resp = output.into_response();
+        let status_code = resp.status().to_string();
+        let duration = start_time.elapsed();
+        metrics_incr_http_request_count(method.clone(), self.api.clone(), status_code.clone());
+        metrics_observe_http_response_duration(method.clone(), self.api.clone(), duration);
+        if duration.as_secs_f64() > 60.0 {
+            // TODO: replace this into histogram
+            metrics_incr_http_slow_request_count(method, self.api.clone(), status_code);
+        }
+        Ok(resp)
+    }
+}
diff --git a/src/query/service/src/servers/http/middleware/mod.rs b/src/query/service/src/servers/http/middleware/mod.rs
new file mode 100644
index 000000000000..5da4f1c621b4
--- /dev/null
+++ b/src/query/service/src/servers/http/middleware/mod.rs
@@ -0,0 +1,26 @@
+// Copyright 2021 Datafuse Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+mod metrics;
+mod panic_handler;
+mod session;
+
+pub(crate) use metrics::MetricsMiddleware;
+pub(crate) use panic_handler::PanicHandler;
+pub use session::json_response;
+pub(crate) use session::sanitize_request_headers;
+pub use session::EndpointKind;
+// for it tests only
+pub use session::HTTPSessionEndpoint;
+pub use session::HTTPSessionMiddleware;
diff --git a/src/query/service/src/servers/http/middleware/panic_handler.rs b/src/query/service/src/servers/http/middleware/panic_handler.rs
new file mode 100644
index 000000000000..cbff37083ef8
--- /dev/null
+++ b/src/query/service/src/servers/http/middleware/panic_handler.rs
@@ -0,0 +1,36 @@
+// Copyright 2021 Datafuse Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use databend_common_metrics::http::metrics_incr_http_response_panics_count; +use http::StatusCode; + +#[derive(Clone, Debug)] +pub(crate) struct PanicHandler {} + +impl PanicHandler { + pub fn new() -> Self { + Self {} + } +} + +impl poem::middleware::PanicHandler for PanicHandler { + type Response = (StatusCode, &'static str); + + fn get_response(&self, _err: Box) -> Self::Response { + metrics_incr_http_response_panics_count(); + (StatusCode::INTERNAL_SERVER_ERROR, "internal server error") + } +} diff --git a/src/query/service/src/servers/http/middleware.rs b/src/query/service/src/servers/http/middleware/session.rs similarity index 85% rename from src/query/service/src/servers/http/middleware.rs rename to src/query/service/src/servers/http/middleware/session.rs index a0474310b346..3a95456f4f36 100644 --- a/src/query/service/src/servers/http/middleware.rs +++ b/src/query/service/src/servers/http/middleware/session.rs @@ -12,26 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::any::Any; use std::collections::HashMap; use std::sync::Arc; -use std::time::Instant; use databend_common_base::headers::HEADER_DEDUPLICATE_LABEL; use databend_common_base::headers::HEADER_NODE_ID; use databend_common_base::headers::HEADER_QUERY_ID; use databend_common_base::headers::HEADER_SESSION_ID; use databend_common_base::headers::HEADER_TENANT; +use databend_common_base::headers::HEADER_VERSION; use databend_common_base::runtime::ThreadTracker; use databend_common_config::GlobalConfig; +use databend_common_config::QUERY_SEMVER; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_meta_app::principal::user_token::TokenType; use databend_common_meta_app::tenant::Tenant; -use databend_common_metrics::http::metrics_incr_http_request_count; -use databend_common_metrics::http::metrics_incr_http_response_panics_count; -use databend_common_metrics::http::metrics_incr_http_slow_request_count; -use databend_common_metrics::http::metrics_observe_http_response_duration; use fastrace::func_name; use headers::authorization::Basic; use headers::authorization::Bearer; @@ -57,17 +53,18 @@ use poem::Request; use poem::Response; use uuid::Uuid; -use super::v1::HttpQueryContext; -use super::v1::SessionClaim; use crate::auth::AuthMgr; use crate::auth::Credential; use crate::servers::http::error::HttpErrorCode; use crate::servers::http::error::JsonErrorOnly; use crate::servers::http::error::QueryError; +use crate::servers::http::v1::HttpQueryContext; +use crate::servers::http::v1::SessionClaim; use crate::servers::HttpHandlerKind; use crate::sessions::SessionManager; use crate::sessions::SessionType; - +const USER_AGENT: &str = "User-Agent"; +const TRACE_PARENT: &str = "traceparent"; #[derive(Debug, Copy, Clone)] pub enum EndpointKind { Login, @@ -81,6 +78,7 @@ pub enum EndpointKind { } impl EndpointKind { + /// avoid the cost of get user from meta pub fn need_user_info(&self) -> bool { !matches!(self, EndpointKind::NoAuth | EndpointKind::PollQuery) } @@ -89,7 +87,11 @@ impl EndpointKind { EndpointKind::Verify => Ok(None), EndpointKind::Refresh => Ok(Some(TokenType::Refresh)), EndpointKind::StartQuery | EndpointKind::PollQuery | EndpointKind::Logout => { - Ok(Some(TokenType::Session)) + if GlobalConfig::instance().query.management_mode { + Ok(None) + } else { + 
Ok(Some(TokenType::Session)) + } } _ => Err(ErrorCode::AuthenticateFailure(format!( "should not use databend token for {self:?}", @@ -98,9 +100,6 @@ impl EndpointKind { } } -const USER_AGENT: &str = "User-Agent"; -const TRACE_PARENT: &str = "traceparent"; - pub struct HTTPSessionMiddleware { pub kind: HttpHandlerKind, pub endpoint_kind: EndpointKind, @@ -165,14 +164,14 @@ fn get_credential( let client_ip = get_client_ip(req); if std_auth_headers.is_empty() { if matches!(kind, HttpHandlerKind::Clickhouse) { - auth_clickhouse_name_password(req, client_ip) + get_clickhouse_name_password(req, client_ip) } else { Err(ErrorCode::AuthenticateFailure( "No authorization header detected", )) } } else { - auth_by_header(&std_auth_headers, client_ip, endpoint_kind) + get_credential_from_header(&std_auth_headers, client_ip, endpoint_kind) } } @@ -180,7 +179,7 @@ fn get_credential( /// not found, fallback to the remote address, which might be local proxy's ip address. /// please note that when it comes with network policy, we need make sure the incoming /// traffic comes from a trustworthy proxy instance. -pub fn get_client_ip(req: &Request) -> Option { +fn get_client_ip(req: &Request) -> Option { let headers = ["X-Real-IP", "X-Forwarded-For", "CF-Connecting-IP"]; for &header in headers.iter() { if let Some(value) = req.headers().get(header) { @@ -203,7 +202,7 @@ pub fn get_client_ip(req: &Request) -> Option { client_ip } -fn auth_by_header( +fn get_credential_from_header( std_auth_headers: &[&HeaderValue], client_ip: Option, endpoint_kind: EndpointKind, @@ -246,7 +245,7 @@ fn auth_by_header( } } -fn auth_clickhouse_name_password(req: &Request, client_ip: Option) -> Result { +fn get_clickhouse_name_password(req: &Request, client_ip: Option) -> Result { let (user, key) = ( req.headers().get("X-CLICKHOUSE-USER"), req.headers().get("X-CLICKHOUSE-KEY"), @@ -436,71 +435,8 @@ pub fn sanitize_request_headers(headers: &poem::http::HeaderMap) -> HashMap) -> Self { - Self { api: api.into() } - } -} - -impl Middleware for MetricsMiddleware { - type Output = MetricsMiddlewareEndpoint; - - fn transform(&self, ep: E) -> Self::Output { - MetricsMiddlewareEndpoint { - ep, - api: self.api.clone(), - } - } -} - -pub struct MetricsMiddlewareEndpoint { - api: String, - ep: E, -} - -impl Endpoint for MetricsMiddlewareEndpoint { - type Output = Response; - - async fn call(&self, req: Request) -> poem::error::Result { - let start_time = Instant::now(); - let method = req.method().to_string(); - let output = self.ep.call(req).await?; - let resp = output.into_response(); - let status_code = resp.status().to_string(); - let duration = start_time.elapsed(); - metrics_incr_http_request_count(method.clone(), self.api.clone(), status_code.clone()); - metrics_observe_http_response_duration(method.clone(), self.api.clone(), duration); - if duration.as_secs_f64() > 60.0 { - // TODO: replace this into histogram - metrics_incr_http_slow_request_count(method, self.api.clone(), status_code); - } - Ok(resp) - } -} - -#[derive(Clone, Debug)] -pub(crate) struct PanicHandler {} - -impl PanicHandler { - pub fn new() -> Self { - Self {} - } -} - -impl poem::middleware::PanicHandler for PanicHandler { - type Response = (StatusCode, &'static str); - - fn get_response(&self, _err: Box) -> Self::Response { - metrics_incr_http_response_panics_count(); - (StatusCode::INTERNAL_SERVER_ERROR, "internal server error") - } -} pub async fn json_response(next: E, req: Request) -> PoemResult { - let resp = match next.call(req).await { + let mut resp = match 
next.call(req).await { Ok(resp) => resp.into_response(), Err(err) => ( err.status(), @@ -514,5 +450,21 @@ pub async fn json_response(next: E, req: Request) -> PoemResult>>, } -pub type StringBlockRef = Arc; - fn data_is_null(column: &Column, row_index: usize) -> bool { match column { Column::Null { .. } => true, diff --git a/src/query/service/tests/it/servers/http/http_query_handlers.rs b/src/query/service/tests/it/servers/http/http_query_handlers.rs index 23d611edfbc4..2ebd78d7079d 100644 --- a/src/query/service/tests/it/servers/http/http_query_handlers.rs +++ b/src/query/service/tests/it/servers/http/http_query_handlers.rs @@ -21,15 +21,16 @@ use base64::engine::general_purpose; use base64::prelude::*; use databend_common_base::base::get_free_tcp_port; use databend_common_base::base::tokio; +use databend_common_base::headers::HEADER_VERSION; use databend_common_config::UserAuthConfig; use databend_common_config::UserConfig; +use databend_common_config::QUERY_SEMVER; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_meta_app::principal::PasswordHashMethod; use databend_common_users::CustomClaims; use databend_common_users::EnsureUser; use databend_query::servers::http::error::QueryError; -use databend_query::servers::http::middleware::get_client_ip; use databend_query::servers::http::middleware::json_response; use databend_query::servers::http::v1::make_page_uri; use databend_query::servers::http::v1::query_route; @@ -184,6 +185,10 @@ impl TestHttpQueryRequest { .await .map_err(|e| ErrorCode::Internal(e.to_string())) .unwrap(); + assert_eq!( + resp.header(HEADER_VERSION), + Some(QUERY_SEMVER.to_string().as_str()) + ); let status_code = resp.status(); let body = resp.into_body().into_string().await.unwrap(); @@ -1671,16 +1676,6 @@ async fn test_txn_timeout() -> Result<()> { Ok(()) } -#[test] -fn test_parse_ip() -> Result<()> { - let req = poem::Request::builder() - .header("X-Forwarded-For", "1.2.3.4") - .finish(); - let ip = get_client_ip(&req); - assert_eq!(ip, Some("1.2.3.4".to_string())); - Ok(()) -} - #[tokio::test(flavor = "current_thread")] async fn test_has_result_set() -> Result<()> { let _fixture = TestFixture::setup().await?; diff --git a/src/query/service/tests/it/servers/http/json_block.rs b/src/query/service/tests/it/servers/http/json_block.rs index 060728c634cb..0ca921a6ed66 100644 --- a/src/query/service/tests/it/servers/http/json_block.rs +++ b/src/query/service/tests/it/servers/http/json_block.rs @@ -23,7 +23,7 @@ use databend_common_expression::types::StringType; use databend_common_expression::DataBlock; use databend_common_expression::FromData; use databend_common_io::prelude::FormatSettings; -use databend_query::servers::http::v1::string_block::StringBlock; +use databend_query::servers::http::v1::StringBlock; use pretty_assertions::assert_eq; fn test_data_block(is_nullable: bool) -> Result<()> { From 3d11bd976b46290ffb287e979aa95d1e428e4cef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=82=8E=E6=B3=BC?= Date: Fri, 27 Sep 2024 21:07:19 +0800 Subject: [PATCH 02/14] refactor: dropped table listing and GC (#16531) - Replace `TableInfoFilter` with `Range>>` for more flexible filtering - Simplify `ListDroppedTableReq` struct and add builder methods - Update schema_api_impl and tests to use new ListDroppedTableReq API - Adjust vacuum logic in interpreter_vacuum_drop_tables to use new API - Minor code cleanup and documentation improvements --- src/meta/api/src/schema_api_impl.rs | 94 ++++++++++--------- 
src/meta/api/src/schema_api_test_suite.rs | 46 +++------ src/meta/app/src/schema/mod.rs | 1 - src/meta/app/src/schema/table.rs | 91 ++++++++++++++---- .../interpreter_vacuum_drop_tables.rs | 19 ++-- 5 files changed, 143 insertions(+), 108 deletions(-) diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index 8c0991aade57..e3b482986866 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -19,6 +19,7 @@ use std::collections::HashMap; use std::collections::HashSet; use std::convert::Infallible; use std::fmt::Display; +use std::ops::Range; use std::sync::Arc; use std::time::Duration; @@ -140,7 +141,6 @@ use databend_common_meta_app::schema::TableIdToName; use databend_common_meta_app::schema::TableIdent; use databend_common_meta_app::schema::TableIndex; use databend_common_meta_app::schema::TableInfo; -use databend_common_meta_app::schema::TableInfoFilter; use databend_common_meta_app::schema::TableMeta; use databend_common_meta_app::schema::TableNameIdent; use databend_common_meta_app::schema::TruncateTableReply; @@ -2644,11 +2644,13 @@ impl + ?Sized> SchemaApi for KV { let the_limit = req.limit.unwrap_or(usize::MAX); - if let TableInfoFilter::DroppedTableOrDroppedDatabase(retention_boundary) = &req.filter { + let drop_time_range = req.drop_time_range; + + if req.database_name.is_none() { let db_infos = self .get_tenant_history_databases( ListDatabaseReq { - tenant: req.inner.tenant().clone(), + tenant: req.tenant.clone(), }, true, ) @@ -2665,24 +2667,17 @@ impl + ?Sized> SchemaApi for KV { }); } - // If boundary is None, it means choose all tables. - // Thus, we just choose a very large time. - let boundary = retention_boundary.unwrap_or(DateTime::::MAX_UTC); - - let vacuum_db = { - let drop_on = db_info.meta.drop_on; - drop_on.is_some() && drop_on <= Some(boundary) - }; + let vacuum_db = drop_time_range.contains(&db_info.meta.drop_on); // If to vacuum a db, just vacuum all tables. // Otherwise, choose only dropped tables(before retention time). 
- let filter = if vacuum_db { - TableInfoFilter::All + let table_drop_time_range = if vacuum_db { + None..Some(DateTime::::MAX_UTC) } else { - TableInfoFilter::DroppedTables(*retention_boundary) + drop_time_range.clone() }; - let db_filter = (filter, db_info.clone()); + let db_filter = (table_drop_time_range, db_info.clone()); let capacity = the_limit - vacuum_table_infos.len(); let table_infos = do_get_table_history(self, db_filter, capacity).await?; @@ -2723,12 +2718,13 @@ impl + ?Sized> SchemaApi for KV { }); } - let tenant_dbname = &req.inner; + let database_name = req.database_name.clone().unwrap(); + let tenant_dbname = DatabaseNameIdent::new(&req.tenant, database_name); // Get db by name to ensure presence let res = get_db_or_err( self, - tenant_dbname, + &tenant_dbname, format!("get_table_history: {}", tenant_dbname.display()), ) .await; @@ -2742,10 +2738,10 @@ impl + ?Sized> SchemaApi for KV { let db_info = Arc::new(DatabaseInfo { database_id: seq_db_id.data, - name_ident: req.inner.clone(), + name_ident: tenant_dbname.clone(), meta: db_meta, }); - let db_filter = (req.filter, db_info); + let db_filter = (drop_time_range.clone(), db_info); let table_infos = do_get_table_history(self, db_filter, the_limit).await?; let mut drop_ids = vec![]; let mut drop_table_infos = vec![]; @@ -3531,11 +3527,17 @@ fn build_upsert_table_deduplicated_label(deduplicated_label: String) -> TxnOp { ) } +#[allow(clippy::type_complexity)] #[logcall::logcall(input = "")] #[fastrace::trace] async fn batch_filter_table_info( kv_api: &(impl kvapi::KVApi + ?Sized), - args: &[(&TableInfoFilter, &Arc, u64, String)], + args: &[( + Range>>, + &Arc, + u64, + String, + )], filter_tb_infos: &mut Vec<(Arc, u64)>, ) -> Result<(), KVAppError> { let table_id_idents = args @@ -3544,9 +3546,7 @@ async fn batch_filter_table_info( let seq_metas = kv_api.get_pb_values_vec(table_id_idents).await?; - for (seq_meta, (filter, db_info, table_id, table_name)) in - seq_metas.into_iter().zip(args.iter()) - { + for (seq_meta, (rng, db_info, table_id, table_name)) in seq_metas.into_iter().zip(args.iter()) { let Some(seq_meta) = seq_meta else { error!( "batch_filter_table_info cannot find {:?} table_meta", @@ -3555,14 +3555,8 @@ async fn batch_filter_table_info( continue; }; - if let TableInfoFilter::DroppedTables(retention_boundary) = filter { - let Some(meta_drop_on) = seq_meta.drop_on else { - continue; - }; - - if meta_drop_on > retention_boundary.unwrap_or(DateTime::::MAX_UTC) { - continue; - } + if !rng.contains(&seq_meta.data.drop_on) { + continue; } let tb_info = TableInfo { @@ -3583,7 +3577,12 @@ async fn batch_filter_table_info( Ok(()) } -type TableFilterInfoList<'a> = Vec<(&'a TableInfoFilter, &'a Arc, u64, String)>; +type TableFilterInfoList<'a> = Vec<( + Range>>, + &'a Arc, + u64, + String, +)>; #[logcall::logcall(input = "")] #[fastrace::trace] @@ -3607,19 +3606,15 @@ async fn get_gc_table_info( #[fastrace::trace] async fn do_get_table_history( kv_api: &(impl kvapi::KVApi + ?Sized), - db_filter: (TableInfoFilter, Arc), + db_filter: (Range>>, Arc), limit: usize, ) -> Result, u64)>, KVAppError> { let mut filter_tb_infos = vec![]; // step 1: list db table name with db id - let mut filter_db_info_with_table_id_key_list: Vec<( - &TableInfoFilter, - &Arc, - TableIdHistoryIdent, - )> = vec![]; + let mut filter_db_info_with_table_id_key_list: Vec<_> = vec![]; - let (filter, db_info) = db_filter; + let (drop_time_range, db_info) = db_filter; let db_id = db_info.database_id.db_id; // List tables by tenant, db_id, table_name. 
@@ -3633,7 +3628,7 @@ async fn do_get_table_history( let keys = table_id_list_keys .iter() - .map(|table_id_list_key| (&filter, &db_info, table_id_list_key.clone())) + .map(|table_id_list_key| (drop_time_range.clone(), &db_info, table_id_list_key.clone())) .collect::>(); filter_db_info_with_table_id_key_list.extend(keys); @@ -3660,10 +3655,17 @@ async fn do_get_table_history( let (filter, db_info, table_id_list_key) = table_id_list_keys_iter.next().unwrap(); let tb_id_list = seq_table_id_list.data; - let id_list: Vec<(&TableInfoFilter, &Arc, u64, String)> = tb_id_list + let id_list: Vec<_> = tb_id_list .id_list .iter() - .map(|id| (filter, db_info, *id, table_id_list_key.table_name.clone())) + .map(|id| { + ( + filter.clone(), + db_info, + *id, + table_id_list_key.table_name.clone(), + ) + }) .collect(); filter_db_info_with_table_id_list.extend(id_list); @@ -3694,6 +3696,10 @@ async fn do_get_table_history( Ok(filter_tb_infos) } +/// Permanently remove a dropped database from the meta-service. +/// +/// Upon calling this method, the dropped database must be already marked as `gc_in_progress`, +/// then remove all **dropped and non-dropped** tables in the database. async fn gc_dropped_db_by_id( kv_api: &(impl kvapi::KVApi + ?Sized), db_id: u64, @@ -3892,6 +3898,10 @@ async fn update_txn_to_remove_table_history( Ok(()) } +/// Update TxnRequest to remove a dropped table's own data. +/// +/// This function returns the updated TxnRequest, +/// or Err of the reason in string if it can not proceed. async fn remove_data_for_dropped_table( kv_api: &(impl kvapi::KVApi + ?Sized), table_id: &TableId, diff --git a/src/meta/api/src/schema_api_test_suite.rs b/src/meta/api/src/schema_api_test_suite.rs index cc87b3e3a707..e17b6b752288 100644 --- a/src/meta/api/src/schema_api_test_suite.rs +++ b/src/meta/api/src/schema_api_test_suite.rs @@ -110,7 +110,6 @@ use databend_common_meta_app::schema::TableIdList; use databend_common_meta_app::schema::TableIdToName; use databend_common_meta_app::schema::TableIdent; use databend_common_meta_app::schema::TableInfo; -use databend_common_meta_app::schema::TableInfoFilter; use databend_common_meta_app::schema::TableMeta; use databend_common_meta_app::schema::TableNameIdent; use databend_common_meta_app::schema::TableStatistics; @@ -3472,11 +3471,7 @@ impl SchemaApiTestSuite { assert_eq!(id_list.len(), 2); { - let req = ListDroppedTableReq { - inner: DatabaseNameIdent::new(&tenant, ""), - filter: TableInfoFilter::DroppedTableOrDroppedDatabase(None), - limit: None, - }; + let req = ListDroppedTableReq::new(&tenant); let resp = mt.get_drop_table_infos(req).await?; let req = GcDroppedTableReq { @@ -3682,11 +3677,7 @@ impl SchemaApiTestSuite { // gc the drop tables { - let req = ListDroppedTableReq { - inner: DatabaseNameIdent::new(&tenant, ""), - filter: TableInfoFilter::DroppedTableOrDroppedDatabase(None), - limit: None, - }; + let req = ListDroppedTableReq::new(&tenant); let resp = mt.get_drop_table_infos(req).await?; let req = GcDroppedTableReq { @@ -3879,11 +3870,7 @@ impl SchemaApiTestSuite { // gc the data { - let req = ListDroppedTableReq { - inner: DatabaseNameIdent::new(&tenant, ""), - filter: TableInfoFilter::DroppedTableOrDroppedDatabase(None), - limit: None, - }; + let req = ListDroppedTableReq::new(&tenant); let resp = mt.get_drop_table_infos(req).await?; let req = GcDroppedTableReq { @@ -4352,11 +4339,7 @@ impl SchemaApiTestSuite { // case 1: test AllDroppedTables with filter time { let now = Utc::now(); - let req = ListDroppedTableReq { - inner: 
DatabaseNameIdent::new(&tenant, ""), - filter: TableInfoFilter::DroppedTableOrDroppedDatabase(Some(now)), - limit: None, - }; + let req = ListDroppedTableReq::new(&tenant).with_retention_boundary(now); let resp = mt.get_drop_table_infos(req).await?; let got = resp @@ -4390,11 +4373,7 @@ impl SchemaApiTestSuite { // case 2: test AllDroppedTables without filter time { - let req = ListDroppedTableReq { - inner: DatabaseNameIdent::new(&tenant, ""), - filter: TableInfoFilter::DroppedTableOrDroppedDatabase(None), - limit: None, - }; + let req = ListDroppedTableReq::new(&tenant); let resp = mt.get_drop_table_infos(req).await?; let got = resp @@ -4609,10 +4588,11 @@ impl SchemaApiTestSuite { ), ]; for (limit, number, drop_ids) in limit_and_drop_ids { - let req = ListDroppedTableReq { - inner: DatabaseNameIdent::new(&tenant, ""), - filter: TableInfoFilter::DroppedTableOrDroppedDatabase(None), - limit, + let req = ListDroppedTableReq::new(&tenant); + let req = if let Some(limit) = limit { + req.with_limit(limit) + } else { + req }; let resp = mt.get_drop_table_infos(req).await?; assert_eq!(resp.drop_ids.len(), number); @@ -5290,11 +5270,7 @@ impl SchemaApiTestSuite { assert!(seqv.is_some() && seqv.unwrap().seq != 0); // vacuum drop table - let req = ListDroppedTableReq { - inner: DatabaseNameIdent::new(&tenant, ""), - filter: TableInfoFilter::DroppedTableOrDroppedDatabase(None), - limit: None, - }; + let req = ListDroppedTableReq::new(&tenant); let resp = mt.get_drop_table_infos(req).await?; assert!(!resp.drop_ids.is_empty()); diff --git a/src/meta/app/src/schema/mod.rs b/src/meta/app/src/schema/mod.rs index 095f8e67f43b..e7e12d9e7bce 100644 --- a/src/meta/app/src/schema/mod.rs +++ b/src/meta/app/src/schema/mod.rs @@ -115,7 +115,6 @@ pub use table::TableIdToName; pub use table::TableIdent; pub use table::TableIndex; pub use table::TableInfo; -pub use table::TableInfoFilter; pub use table::TableMeta; pub use table::TableNameIdent; pub use table::TableStatistics; diff --git a/src/meta/app/src/schema/table.rs b/src/meta/app/src/schema/table.rs index 9a0384a9f5ec..7542b8177ddd 100644 --- a/src/meta/app/src/schema/table.rs +++ b/src/meta/app/src/schema/table.rs @@ -19,6 +19,7 @@ use std::fmt; use std::fmt::Display; use std::fmt::Formatter; use std::ops::Deref; +use std::ops::Range; use std::sync::Arc; use std::time::Duration; @@ -906,31 +907,81 @@ impl ListTableReq { } #[derive(Clone, Debug, PartialEq, Eq)] -pub enum TableInfoFilter { - /// Choose only dropped tables. - /// - /// If the arg `retention_boundary` time is Some, choose only tables dropped before this boundary time. - DroppedTables(Option>), - /// Choose dropped table or all table in dropped databases. - /// - /// In this case, `ListTableReq`.db_name will be ignored. - /// - /// If the `retention_boundary` time is Some, - /// choose the table dropped before this time - /// or choose the database before this time. - DroppedTableOrDroppedDatabase(Option>), +pub struct ListDroppedTableReq { + pub tenant: Tenant, - /// return all tables, ignore drop on time. - All, -} + /// If `database_name` is None, choose all tables in all databases. + /// Otherwise, choose only tables in this database. + pub database_name: Option, + + /// The time range in which the database/table will be returned. + /// choose only tables/databases dropped before this boundary time. 
+ /// It can include non-dropped tables/databases with `None..Some()` + pub drop_time_range: Range>>, -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct ListDroppedTableReq { - pub inner: DatabaseNameIdent, - pub filter: TableInfoFilter, pub limit: Option, } +impl ListDroppedTableReq { + pub fn new(tenant: &Tenant) -> ListDroppedTableReq { + let rng_start = Some(DateTime::::MIN_UTC); + let rng_end = Some(DateTime::::MAX_UTC); + ListDroppedTableReq { + tenant: tenant.clone(), + database_name: None, + drop_time_range: rng_start..rng_end, + limit: None, + } + } + + pub fn with_db(self, db_name: impl ToString) -> Self { + Self { + database_name: Some(db_name.to_string()), + ..self + } + } + + pub fn with_retention_boundary(self, d: DateTime) -> Self { + let rng_start = Some(DateTime::::MIN_UTC); + let rng_end = Some(d); + Self { + drop_time_range: rng_start..rng_end, + ..self + } + } + + pub fn with_limit(self, limit: usize) -> Self { + Self { + limit: Some(limit), + ..self + } + } + + pub fn new4( + tenant: &Tenant, + database_name: Option, + retention_boundary: Option>, + limit: Option, + ) -> ListDroppedTableReq { + let rng_start = Some(DateTime::::MIN_UTC); + let rng_end = if let Some(b) = retention_boundary { + Some(b) + } else { + Some(DateTime::::MAX_UTC) + }; + ListDroppedTableReq { + tenant: tenant.clone(), + database_name: database_name.map(|s| s.to_string()), + drop_time_range: rng_start..rng_end, + limit, + } + } + + pub fn database_name(&self) -> Option<&str> { + self.database_name.as_deref() + } +} + #[derive(Clone, Debug, PartialEq, Eq)] pub enum DroppedId { Db { diff --git a/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs b/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs index cb7c4d768662..d1b87942fb07 100644 --- a/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs +++ b/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs @@ -25,11 +25,9 @@ use databend_common_expression::DataBlock; use databend_common_expression::FromData; use databend_common_license::license::Feature::Vacuum; use databend_common_license::license_manager::LicenseManagerSwitch; -use databend_common_meta_app::schema::database_name_ident::DatabaseNameIdent; use databend_common_meta_app::schema::DroppedId; use databend_common_meta_app::schema::GcDroppedTableReq; use databend_common_meta_app::schema::ListDroppedTableReq; -use databend_common_meta_app::schema::TableInfoFilter; use databend_common_sql::plans::VacuumDropTablePlan; use databend_enterprise_vacuum_handler::get_vacuum_handler; use log::info; @@ -125,19 +123,20 @@ impl Interpreter for VacuumDropTablesInterpreter { self.plan.database, retention_time ); // if database if empty, vacuum all tables - let filter = if self.plan.database.is_empty() { - TableInfoFilter::DroppedTableOrDroppedDatabase(Some(retention_time)) + let database_name = if self.plan.database.is_empty() { + None } else { - TableInfoFilter::DroppedTables(Some(retention_time)) + Some(self.plan.database.clone()) }; let tenant = self.ctx.get_tenant(); let (tables, drop_ids) = catalog - .get_drop_table_infos(ListDroppedTableReq { - inner: DatabaseNameIdent::new(&tenant, &self.plan.database), - filter, - limit: self.plan.option.limit, - }) + .get_drop_table_infos(ListDroppedTableReq::new4( + &tenant, + database_name, + Some(retention_time), + self.plan.option.limit, + )) .await?; info!( From 0189b016a28cb8a30d933160ab16167596077a48 Mon Sep 17 00:00:00 2001 From: Jk Xu <54522439+Dousir9@users.noreply.github.com> Date: Sat, 
28 Sep 2024 12:19:24 +0800 Subject: [PATCH 03/14] fix(query): fix column leaf_index (#16537) * chore(query): fix column leaf_index * chore(test): update sqllogictest * chore(binder): refine code * chore(binder): fix tuple inner_field * chore(binder): fix tuple inner column_id * chore(test): update test * chore(binder): fix TUPLE inner_column_id * chore(test): update sqllogictest * chore(test): update sqllogictest --- .../sql/src/planner/expression_parser.rs | 2 +- src/query/sql/src/planner/metadata.rs | 34 +++++----- .../statistics/collect_statistics.rs | 11 ++-- .../base/05_ddl/05_0003_ddl_alter_table.test | 24 +++++++ .../mode/standalone/explain/bloom_filter.test | 8 +-- .../mode/standalone/explain/explain.test | 64 +++++++++++++++++++ .../explain/prewhere_optimization.test | 6 +- .../explain_native/bloom_filter.test | 2 +- .../standalone/explain_native/explain.test | 52 +++++++++++++++ .../explain_native/prewhere_optimization.test | 2 +- 10 files changed, 172 insertions(+), 33 deletions(-) diff --git a/src/query/sql/src/planner/expression_parser.rs b/src/query/sql/src/planner/expression_parser.rs index eec13e6968b8..e6d7b20b87ac 100644 --- a/src/query/sql/src/planner/expression_parser.rs +++ b/src/query/sql/src/planner/expression_parser.rs @@ -297,7 +297,7 @@ pub fn parse_computed_expr_to_string( field.data_type().clone(), 0, None, - None, + Some(field.column_id), None, None, ); diff --git a/src/query/sql/src/planner/metadata.rs b/src/query/sql/src/planner/metadata.rs index 6bd121374f96..f42e6d5477f8 100644 --- a/src/query/sql/src/planner/metadata.rs +++ b/src/query/sql/src/planner/metadata.rs @@ -229,7 +229,7 @@ impl Metadata { data_type: TableDataType, table_index: IndexType, path_indices: Option>, - leaf_index: Option, + column_id: Option, column_position: Option, virtual_computed_expr: Option, ) -> IndexType { @@ -241,7 +241,7 @@ impl Metadata { column_index, table_index, path_indices, - leaf_index, + column_id, virtual_computed_expr, }); self.columns.push(column_entry); @@ -370,8 +370,6 @@ impl Metadata { } } - // build leaf index in DFS order for primitive columns. 
- let mut leaf_index = 0; while let Some((indices, field)) = fields.pop_front() { if indices.is_empty() { self.add_base_table_column( @@ -379,7 +377,7 @@ impl Metadata { field.data_type().clone(), table_index, None, - None, + Some(field.column_id), None, Some(field.computed_expr().unwrap().expr().clone()), ); @@ -402,25 +400,29 @@ impl Metadata { field.data_type().clone(), table_index, path_indices, - None, + Some(field.column_id), None, None, ); - let mut i = fields_type.len(); - for (inner_field_name, inner_field_type) in - fields_name.iter().zip(fields_type.iter()).rev() + let mut inner_column_id = field.column_id; + for (index, (inner_field_name, inner_field_type)) in + fields_name.iter().zip(fields_type.iter()).enumerate() { - i -= 1; let mut inner_indices = indices.clone(); - inner_indices.push(i); + inner_indices.push(index); // create tuple inner field let inner_name = format!( "{}:{}", field.name(), display_tuple_field_name(inner_field_name) ); - let inner_field = TableField::new(&inner_name, inner_field_type.clone()); + let inner_field = TableField::new_from_column_id( + &inner_name, + inner_field_type.clone(), + inner_column_id, + ); + inner_column_id += inner_field_type.num_leaf_columns() as u32; fields.push_front((inner_indices, inner_field)); } } else { @@ -429,11 +431,10 @@ impl Metadata { field.data_type().clone(), table_index, path_indices, - Some(leaf_index), + Some(field.column_id), Some(indices[0] + 1), None, ); - leaf_index += 1; } } @@ -573,9 +574,8 @@ pub struct BaseTableColumn { /// Path indices for inner column of struct data type. pub path_indices: Option>, - /// Leaf index is the primitive column index in Parquet, constructed in DFS order. - /// None if the data type of column is struct. - pub leaf_index: Option, + /// The column id in table schema. + pub column_id: Option, /// Virtual computed expression, generated in query. pub virtual_computed_expr: Option, } diff --git a/src/query/sql/src/planner/optimizer/statistics/collect_statistics.rs b/src/query/sql/src/planner/optimizer/statistics/collect_statistics.rs index 9f04d669d98e..836cf19aa405 100644 --- a/src/query/sql/src/planner/optimizer/statistics/collect_statistics.rs +++ b/src/query/sql/src/planner/optimizer/statistics/collect_statistics.rs @@ -80,19 +80,18 @@ impl CollectStatisticsOptimizer { for column in columns.iter() { if let ColumnEntry::BaseTableColumn(BaseTableColumn { column_index, - path_indices, - leaf_index, + column_id, virtual_computed_expr, .. 
}) = column { - if path_indices.is_none() && virtual_computed_expr.is_none() { - if let Some(col_id) = *leaf_index { + if virtual_computed_expr.is_none() { + if let Some(column_id) = *column_id { let col_stat = column_statistics_provider - .column_statistics(col_id as ColumnId); + .column_statistics(column_id as ColumnId); column_stats.insert(*column_index, col_stat.cloned()); let histogram = - column_statistics_provider.histogram(col_id as ColumnId); + column_statistics_provider.histogram(column_id as ColumnId); histograms.insert(*column_index, histogram); } } diff --git a/tests/sqllogictests/suites/base/05_ddl/05_0003_ddl_alter_table.test b/tests/sqllogictests/suites/base/05_ddl/05_0003_ddl_alter_table.test index 7d1003527337..1287b596f930 100644 --- a/tests/sqllogictests/suites/base/05_ddl/05_0003_ddl_alter_table.test +++ b/tests/sqllogictests/suites/base/05_ddl/05_0003_ddl_alter_table.test @@ -255,3 +255,27 @@ DROP TABLE IF EXISTS t; statement ok DROP TABLE IF EXISTS t1; + +statement ok +CREATE OR REPLACE TABLE t1 (id VARCHAR NULL, size VARCHAR NULL, create_time int NULL, path VARCHAR NULL); + +statement ok +CREATE OR REPLACE TABLE t1_random like t1 Engine = Random; + +statement ok +INSERT INTO t1 SELECT * FROM t1_random LIMIT 10; + +statement ok +ALTER TABLE t1 DROP COLUMN size; + +statement ok +ALTER TABLE t1 DROP COLUMN path; + +statement ok +SELECT * FROM t1 WHERE create_time > 111113; + +statement ok +DROP TABLE IF EXISTS t1; + +statement ok +DROP TABLE IF EXISTS t1_random; diff --git a/tests/sqllogictests/suites/mode/standalone/explain/bloom_filter.test b/tests/sqllogictests/suites/mode/standalone/explain/bloom_filter.test index 9582dc645922..18f12f481efa 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/bloom_filter.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/bloom_filter.test @@ -128,11 +128,11 @@ explain select 1 from bloom_test_t where c2=3; EvalScalar ├── output columns: [1 (#3)] ├── expressions: [1] -├── estimated rows: 1.00 +├── estimated rows: 2.67 └── Filter ├── output columns: [] ├── filters: [is_true(bloom_test_t.c2 (#1) = 3)] - ├── estimated rows: 1.00 + ├── estimated rows: 2.67 └── TableScan ├── table: default.default.bloom_test_t ├── output columns: [c2 (#1)] @@ -150,11 +150,11 @@ explain select 1 from bloom_test_t where c3=12345; EvalScalar ├── output columns: [1 (#3)] ├── expressions: [1] -├── estimated rows: 0.00 +├── estimated rows: 1.00 └── Filter ├── output columns: [] ├── filters: [is_true(bloom_test_t.c3 (#2) = 12345)] - ├── estimated rows: 0.00 + ├── estimated rows: 1.00 └── TableScan ├── table: default.default.bloom_test_t ├── output columns: [c3 (#2)] diff --git a/tests/sqllogictests/suites/mode/standalone/explain/explain.test b/tests/sqllogictests/suites/mode/standalone/explain/explain.test index fdc502f01325..9f19dc078761 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/explain.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/explain.test @@ -1763,3 +1763,67 @@ INSERT OVERWRITE ALL INTO cat.db1.t1 VALUES (order_id, 'PriorityHandling') INTO statement ok drop table orders_placed + +# Test Tuple Statistics +statement ok +CREATE OR REPLACE TABLE t(a TUPLE(INT, INT)); + +statement ok +INSERT INTO t VALUES((1, 2)), ((3, 4)); + +query T +EXPLAIN SELECT * FROM t WHERE a.1 > 0; +---- +Filter +├── output columns: [t.a (#0)] +├── filters: [is_true(t.a:"1" (#2) > 0)] +├── estimated rows: 2.00 +└── TableScan + ├── table: default.default.t + ├── output columns: [a (#0), a:"1" (#2)] + ├── read rows: 2 + ├── 
read size: < 1 KiB + ├── partitions total: 1 + ├── partitions scanned: 1 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [is_true(t.a:"1" (#2) > 0)], limit: NONE] + └── estimated rows: 2.00 + +query T +EXPLAIN SELECT * FROM t WHERE a.1 > 1; +---- +Filter +├── output columns: [t.a (#0)] +├── filters: [is_true(t.a:"1" (#2) > 1)] +├── estimated rows: 1.00 +└── TableScan + ├── table: default.default.t + ├── output columns: [a (#0), a:"1" (#2)] + ├── read rows: 2 + ├── read size: < 1 KiB + ├── partitions total: 1 + ├── partitions scanned: 1 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [is_true(t.a:"1" (#2) > 1)], limit: NONE] + └── estimated rows: 2.00 + +query T +EXPLAIN SELECT * FROM t WHERE a.2 > 1; +---- +Filter +├── output columns: [t.a (#0)] +├── filters: [is_true(t.a:"2" (#1) > 1)] +├── estimated rows: 2.00 +└── TableScan + ├── table: default.default.t + ├── output columns: [a (#0), a:"2" (#1)] + ├── read rows: 2 + ├── read size: < 1 KiB + ├── partitions total: 1 + ├── partitions scanned: 1 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [is_true(t.a:"2" (#1) > 1)], limit: NONE] + └── estimated rows: 2.00 + +statement ok +DROP TABLE IF EXISTS t; diff --git a/tests/sqllogictests/suites/mode/standalone/explain/prewhere_optimization.test b/tests/sqllogictests/suites/mode/standalone/explain/prewhere_optimization.test index 619814786a02..7540375271ee 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/prewhere_optimization.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/prewhere_optimization.test @@ -117,16 +117,16 @@ explain select * from t_where_optimizer where s:a > 0 ---- Filter ├── output columns: [t_where_optimizer.id (#0), t_where_optimizer.s (#1)] -├── filters: [t_where_optimizer.s:a (#2) > 0] +├── filters: [t_where_optimizer.s:a (#3) > 0] ├── estimated rows: 0.00 └── TableScan ├── table: default.default.t_where_optimizer - ├── output columns: [id (#0), s (#1), s:a (#2)] + ├── output columns: [id (#0), s (#1), s:a (#3)] ├── read rows: 0 ├── read size: 0 ├── partitions total: 0 ├── partitions scanned: 0 - ├── push downs: [filters: [t_where_optimizer.s:a (#2) > 0], limit: NONE] + ├── push downs: [filters: [t_where_optimizer.s:a (#3) > 0], limit: NONE] └── estimated rows: 0.00 statement ok diff --git a/tests/sqllogictests/suites/mode/standalone/explain_native/bloom_filter.test b/tests/sqllogictests/suites/mode/standalone/explain_native/bloom_filter.test index 7b0af1b11a56..6e4db9430542 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain_native/bloom_filter.test +++ b/tests/sqllogictests/suites/mode/standalone/explain_native/bloom_filter.test @@ -115,7 +115,7 @@ TableScan ├── partitions scanned: 0 ├── pruning stats: [segments: , blocks: ] ├── push downs: [filters: [is_true(bloom_test_t.c2 (#1) = 3)], limit: NONE] -└── estimated rows: 1.00 +└── estimated rows: 2.67 statement ok drop table bloom_test_t diff --git a/tests/sqllogictests/suites/mode/standalone/explain_native/explain.test b/tests/sqllogictests/suites/mode/standalone/explain_native/explain.test index e2cbebda7b86..8f93ac474229 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain_native/explain.test +++ b/tests/sqllogictests/suites/mode/standalone/explain_native/explain.test @@ -996,3 +996,55 @@ drop table t2; statement ok drop table t3; + +# Test Tuple Statistics +statement ok +CREATE OR REPLACE TABLE t(a TUPLE(INT, INT)); + +statement ok +INSERT INTO t VALUES((1, 2)), ((3, 4)); + +query T +EXPLAIN SELECT * 
FROM t WHERE a.1 > 0; +---- +TableScan +├── table: default.default.t +├── output columns: [a (#0)] +├── read rows: 2 +├── read size: < 1 KiB +├── partitions total: 1 +├── partitions scanned: 1 +├── pruning stats: [segments: , blocks: ] +├── push downs: [filters: [is_true(t.a:"1" (#2) > 0)], limit: NONE] +└── estimated rows: 2.00 + +query T +EXPLAIN SELECT * FROM t WHERE a.1 > 1; +---- +TableScan +├── table: default.default.t +├── output columns: [a (#0)] +├── read rows: 2 +├── read size: < 1 KiB +├── partitions total: 1 +├── partitions scanned: 1 +├── pruning stats: [segments: , blocks: ] +├── push downs: [filters: [is_true(t.a:"1" (#2) > 1)], limit: NONE] +└── estimated rows: 1.00 + +query T +EXPLAIN SELECT * FROM t WHERE a.2 > 1; +---- +TableScan +├── table: default.default.t +├── output columns: [a (#0)] +├── read rows: 2 +├── read size: < 1 KiB +├── partitions total: 1 +├── partitions scanned: 1 +├── pruning stats: [segments: , blocks: ] +├── push downs: [filters: [is_true(t.a:"2" (#1) > 1)], limit: NONE] +└── estimated rows: 2.00 + +statement ok +DROP TABLE IF EXISTS t; diff --git a/tests/sqllogictests/suites/mode/standalone/explain_native/prewhere_optimization.test b/tests/sqllogictests/suites/mode/standalone/explain_native/prewhere_optimization.test index c4b626237f79..4eff69639641 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain_native/prewhere_optimization.test +++ b/tests/sqllogictests/suites/mode/standalone/explain_native/prewhere_optimization.test @@ -118,7 +118,7 @@ TableScan ├── read size: 0 ├── partitions total: 0 ├── partitions scanned: 0 -├── push downs: [filters: [t_where_optimizer.s:a (#2) > 0], limit: NONE] +├── push downs: [filters: [t_where_optimizer.s:a (#3) > 0], limit: NONE] └── estimated rows: 0.00 statement ok From 022c5a7b0846b59b2ddd35dec32ab3c12d52b4db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=82=8E=E6=B3=BC?= Date: Sat, 28 Sep 2024 16:27:18 +0800 Subject: [PATCH 04/14] refactor: gc_dropped_db_by_id() (#16538) --- src/common/base/src/lib.rs | 1 + src/common/base/src/vec_ext.rs | 27 ++++++ src/meta/api/src/schema_api_impl.rs | 143 +++++++++++++--------------- src/meta/api/src/util.rs | 14 +++ 4 files changed, 106 insertions(+), 79 deletions(-) create mode 100644 src/common/base/src/vec_ext.rs diff --git a/src/common/base/src/lib.rs b/src/common/base/src/lib.rs index 2fa7b8dd03dc..04ae30bd8bb0 100644 --- a/src/common/base/src/lib.rs +++ b/src/common/base/src/lib.rs @@ -33,6 +33,7 @@ pub mod headers; pub mod mem_allocator; pub mod rangemap; pub mod runtime; +pub mod vec_ext; pub mod version; pub use runtime::dump_backtrace; diff --git a/src/common/base/src/vec_ext.rs b/src/common/base/src/vec_ext.rs new file mode 100644 index 000000000000..f78dcb3625ce --- /dev/null +++ b/src/common/base/src/vec_ext.rs @@ -0,0 +1,27 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub trait VecExt { + /// Remove the first element that is equal to the given item. 
+ fn remove_first(&mut self, item: &T) -> Option + where T: PartialEq; +} + +impl VecExt for Vec { + fn remove_first(&mut self, item: &T) -> Option + where T: PartialEq { + let pos = self.iter().position(|x| x == item)?; + Some(self.remove(pos)) + } +} diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index e3b482986866..85bca0d09995 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -27,6 +27,7 @@ use chrono::DateTime; use chrono::Utc; use databend_common_base::base::uuid::Uuid; use databend_common_base::display::display_slice::DisplaySliceExt; +use databend_common_base::vec_ext::VecExt; use databend_common_meta_app::app_error::AppError; use databend_common_meta_app::app_error::CommitTableMetaError; use databend_common_meta_app::app_error::CreateAsDropTableWithoutDropTime; @@ -175,7 +176,6 @@ use databend_common_meta_types::ConditionResult; use databend_common_meta_types::MatchSeqExt; use databend_common_meta_types::MetaError; use databend_common_meta_types::MetaId; -use databend_common_meta_types::TxnCondition; use databend_common_meta_types::TxnGetRequest; use databend_common_meta_types::TxnGetResponse; use databend_common_meta_types::TxnOp; @@ -218,6 +218,7 @@ use crate::util::deserialize_struct_get_response; use crate::util::mget_pb_values; use crate::util::txn_delete_exact; use crate::util::txn_op_put_pb; +use crate::util::txn_put_pb; use crate::util::txn_replace_exact; use crate::util::unknown_database_error; use crate::SchemaApi; @@ -3707,100 +3708,84 @@ async fn gc_dropped_db_by_id( db_name: String, ) -> Result<(), KVAppError> { // List tables by tenant, db_id, table_name. - let dbid_idlist = DatabaseIdHistoryIdent::new(tenant, db_name); - let (db_id_list_seq, db_id_list_opt) = kv_api.get_pb_seq_and_value(&dbid_idlist).await?; + let db_id_history_ident = DatabaseIdHistoryIdent::new(tenant, db_name); + let Some(seq_dbid_list) = kv_api.get_pb(&db_id_history_ident).await? else { + return Ok(()); + }; + + let mut db_id_list = seq_dbid_list.data; + + // If the db_id is not in the list, return. + if db_id_list.id_list.remove_first(&db_id).is_none() { + return Ok(()); + } - let mut db_id_list = match db_id_list_opt { - Some(list) => list, - None => return Ok(()), + let dbid = DatabaseId { db_id }; + let Some(seq_db_meta) = kv_api.get_pb(&dbid).await? else { + return Ok(()); }; - for (i, dbid) in db_id_list.id_list.iter().enumerate() { - if *dbid != db_id { - continue; - } - let dbid = DatabaseId { db_id }; - let (db_meta_seq, _db_meta) = kv_api.get_pb_seq_and_value(&dbid).await?; - if db_meta_seq == 0 { - return Ok(()); - } - let id_to_name = DatabaseIdToName { db_id }; - let (name_ident_seq, _name_ident) = kv_api.get_pb_seq_and_value(&id_to_name).await?; - if name_ident_seq == 0 { - return Ok(()); - } - let dbid_tbname_idlist = TableIdHistoryIdent { - database_id: db_id, - table_name: "".to_string(), - }; + // TODO: enable this when gc_in_progress is set. + // if !seq_db_meta.gc_in_progress { + // let err = UnknownDatabaseId::new( + // db_id, + // "database is not in gc_in_progress state, \ + // can not gc. \ + // First mark the database as gc_in_progress, \ + // then gc is allowed.", + // ); + // return Err(AppError::from(err).into()); + // } - let dir_name = DirName::new(dbid_tbname_idlist); + let id_to_name = DatabaseIdToName { db_id }; + let Some(seq_name) = kv_api.get_pb(&id_to_name).await? 
else { + return Ok(()); + }; - let table_id_list_keys = list_keys(kv_api, &dir_name).await?; - let keys: Vec = table_id_list_keys - .iter() - .map(|table_id_list_key| { - TableIdHistoryIdent { - database_id: db_id, - table_name: table_id_list_key.table_name.clone(), - } - .to_string_key() - }) - .collect(); + let table_history_ident = TableIdHistoryIdent { + database_id: db_id, + table_name: "dummy".to_string(), + }; + let dir_name = DirName::new(table_history_ident); - let mut txn = TxnRequest::default(); + let table_history_items = kv_api.list_pb_vec(&dir_name).await?; - for c in keys.chunks(DEFAULT_MGET_SIZE) { - let tb_id_list_seq_vec: Vec<(u64, Option)> = - mget_pb_values(kv_api, c).await?; - let mut iter = c.iter(); - for (tb_id_list_seq, tb_id_list_opt) in tb_id_list_seq_vec { - let tb_id_list = match tb_id_list_opt { - Some(list) => list, - None => { - continue; - } - }; + let mut txn = TxnRequest::default(); - for tb_id in tb_id_list.id_list { - let table_id_ident = TableId { table_id: tb_id }; - remove_copied_files_for_dropped_table(kv_api, &table_id_ident).await?; - remove_data_for_dropped_table(kv_api, &table_id_ident, &mut txn).await?; - remove_index_for_dropped_table(kv_api, tenant, &table_id_ident, &mut txn) - .await?; - } + for (ident, table_history) in table_history_items { + for tb_id in table_history.id_list.iter() { + let table_id_ident = TableId { table_id: *tb_id }; - let id_key = iter.next().unwrap(); - txn.if_then.push(TxnOp::delete(id_key)); - txn.condition - .push(TxnCondition::eq_seq(id_key, tb_id_list_seq)); - } + // TODO: mark table as gc_in_progress - // for id_key in c { - // if_then.push(txn_op_del(id_key)); - // } - } - db_id_list.id_list.remove(i); - txn.condition - .push(txn_cond_seq(&dbid_idlist, Eq, db_id_list_seq)); - if db_id_list.id_list.is_empty() { - txn.if_then.push(txn_op_del(&dbid_idlist)); - } else { - // save new db id list - txn.if_then - .push(txn_op_put(&dbid_idlist, serialize_struct(&db_id_list)?)); + remove_copied_files_for_dropped_table(kv_api, &table_id_ident).await?; + remove_data_for_dropped_table(kv_api, &table_id_ident, &mut txn).await?; + remove_index_for_dropped_table(kv_api, tenant, &table_id_ident, &mut txn).await?; } - txn.condition.push(txn_cond_seq(&dbid, Eq, db_meta_seq)); - txn.if_then.push(txn_op_del(&dbid)); txn.condition - .push(txn_cond_seq(&id_to_name, Eq, name_ident_seq)); - txn.if_then.push(txn_op_del(&id_to_name)); + .push(txn_cond_eq_seq(&ident, table_history.seq)); + txn.if_then.push(txn_op_del(&ident)); + } - let _resp = kv_api.transaction(txn).await?; - break; + txn.condition + .push(txn_cond_eq_seq(&db_id_history_ident, seq_dbid_list.seq)); + if db_id_list.id_list.is_empty() { + txn.if_then.push(txn_op_del(&db_id_history_ident)); + } else { + // save new db id list + txn.if_then + .push(txn_put_pb(&db_id_history_ident, &db_id_list)?); } + txn.condition.push(txn_cond_eq_seq(&dbid, seq_db_meta.seq)); + txn.if_then.push(txn_op_del(&dbid)); + txn.condition + .push(txn_cond_eq_seq(&id_to_name, seq_name.seq)); + txn.if_then.push(txn_op_del(&id_to_name)); + + let _resp = kv_api.transaction(txn).await?; + Ok(()) } diff --git a/src/meta/api/src/util.rs b/src/meta/api/src/util.rs index 72526372d0b7..df60eb580457 100644 --- a/src/meta/api/src/util.rs +++ b/src/meta/api/src/util.rs @@ -305,6 +305,20 @@ pub fn txn_cond_seq(key: &impl kvapi::Key, op: ConditionResult, seq: u64) -> Txn } } +pub fn txn_put_pb(key: &K, value: &K::ValueType) -> Result +where + K: kvapi::Key, + K::ValueType: FromToProto + 'static, +{ + let p = 
value.to_pb().map_err(|e| InvalidArgument::new(e, ""))?; + + let mut buf = vec![]; + prost::Message::encode(&p, &mut buf).map_err(|e| InvalidArgument::new(e, ""))?; + + Ok(TxnOp::put(key.to_string_key(), buf)) +} + +/// Deprecate this. Replace it with `txn_put_pb().with_ttl()` pub fn txn_op_put_pb( key: &K, value: &K::ValueType, From d8d6b47e70d659e1e5e038684ba2e5a8ad3545b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=82=8E=E6=B3=BC?= Date: Sat, 28 Sep 2024 21:38:34 +0800 Subject: [PATCH 05/14] fix: Only when all tables are included, the db can be dropped (#16539) --- src/meta/api/src/schema_api_impl.rs | 113 +++++----------------------- 1 file changed, 18 insertions(+), 95 deletions(-) diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index 85bca0d09995..404e6b7f1cf2 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -2684,7 +2684,7 @@ impl + ?Sized> SchemaApi for KV { let table_infos = do_get_table_history(self, db_filter, capacity).await?; // A DB can be removed only when all its tables are removed. - if vacuum_db && capacity >= table_infos.len() { + if vacuum_db && capacity > table_infos.len() { vacuum_ids.push(DroppedId::Db { db_id: db_info.database_id.db_id, db_name: db_info.name_ident.database_name().to_string(), @@ -3578,31 +3578,6 @@ async fn batch_filter_table_info( Ok(()) } -type TableFilterInfoList<'a> = Vec<( - Range>>, - &'a Arc, - u64, - String, -)>; - -#[logcall::logcall(input = "")] -#[fastrace::trace] -async fn get_gc_table_info( - kv_api: &(impl kvapi::KVApi + ?Sized), - limit: usize, - table_id_list: &TableFilterInfoList<'_>, -) -> Result, u64)>, KVAppError> { - let table_id_list = &table_id_list[..std::cmp::min(limit, table_id_list.len())]; - - let mut filter_tb_infos = vec![]; - - for chunk in table_id_list.chunks(DEFAULT_MGET_SIZE) { - batch_filter_table_info(kv_api, chunk, &mut filter_tb_infos).await?; - } - - Ok(filter_tb_infos) -} - #[logcall::logcall(input = "")] #[fastrace::trace] async fn do_get_table_history( @@ -3610,88 +3585,35 @@ async fn do_get_table_history( db_filter: (Range>>, Arc), limit: usize, ) -> Result, u64)>, KVAppError> { - let mut filter_tb_infos = vec![]; - - // step 1: list db table name with db id - let mut filter_db_info_with_table_id_key_list: Vec<_> = vec![]; - let (drop_time_range, db_info) = db_filter; let db_id = db_info.database_id.db_id; - // List tables by tenant, db_id, table_name. 
let dbid_tbname_idlist = TableIdHistoryIdent { database_id: db_id, table_name: "dummy".to_string(), }; let dir_name = DirName::new(dbid_tbname_idlist); - let strm = kv_api.list_pb_keys(&dir_name).await?; - let table_id_list_keys = strm.try_collect::>().await?; - - let keys = table_id_list_keys - .iter() - .map(|table_id_list_key| (drop_time_range.clone(), &db_info, table_id_list_key.clone())) - .collect::>(); - - filter_db_info_with_table_id_key_list.extend(keys); - - // step 2: list all table id of table by table name - let keys = filter_db_info_with_table_id_key_list - .iter() - .map(|(_, db_info, table_id_list_key)| TableIdHistoryIdent { - database_id: db_info.database_id.db_id, - table_name: table_id_list_key.table_name.clone(), - }) - .collect::>(); - - let mut filter_db_info_with_table_id_list: TableFilterInfoList<'_> = vec![]; - let mut table_id_list_keys_iter = filter_db_info_with_table_id_key_list.into_iter(); - for c in keys.chunks(DEFAULT_MGET_SIZE) { - let strm = kv_api.get_pb_values(c.to_vec()).await?; - let table_id_list_vec = strm - .try_filter_map(|x| async move { Ok(x) }) - .try_collect::>() - .await?; + let table_history_kvs = kv_api.list_pb_vec(&dir_name).await?; - for seq_table_id_list in table_id_list_vec { - let (filter, db_info, table_id_list_key) = table_id_list_keys_iter.next().unwrap(); - let tb_id_list = seq_table_id_list.data; - - let id_list: Vec<_> = tb_id_list - .id_list - .iter() - .map(|id| { - ( - filter.clone(), - db_info, - *id, - table_id_list_key.table_name.clone(), - ) - }) - .collect(); + let mut the_list = vec![]; - filter_db_info_with_table_id_list.extend(id_list); - if filter_db_info_with_table_id_list.len() < DEFAULT_MGET_SIZE { - continue; - } - - let ret = get_gc_table_info(kv_api, limit, &filter_db_info_with_table_id_list).await?; - filter_tb_infos.extend(ret); - filter_db_info_with_table_id_list.clear(); - - if filter_tb_infos.len() >= limit { - return Ok(filter_tb_infos); - } + for (ident, table_history) in table_history_kvs { + for table_id in table_history.id_list.iter() { + the_list.push(( + drop_time_range.clone(), + &db_info, + *table_id, + ident.table_name.clone(), + )); } + } - if !filter_db_info_with_table_id_list.is_empty() { - let ret = get_gc_table_info(kv_api, limit, &filter_db_info_with_table_id_list).await?; - filter_tb_infos.extend(ret); - filter_db_info_with_table_id_list.clear(); + let mut filter_tb_infos = vec![]; - if filter_tb_infos.len() >= limit { - return Ok(filter_tb_infos); - } - } + for c in the_list[..std::cmp::min(limit, the_list.len())].chunks(DEFAULT_MGET_SIZE) { + let mut infos = vec![]; + batch_filter_table_info(kv_api, c, &mut infos).await?; + filter_tb_infos.extend(infos); } Ok(filter_tb_infos) @@ -3880,6 +3802,7 @@ async fn update_txn_to_remove_table_history( txn.if_then .push(txn_op_put_pb(table_id_history_ident, &history, None)?); } + Ok(()) } From c6bd10b9563e937a1bd2694822a0e825d9d2079e Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Sat, 28 Sep 2024 23:44:00 +0800 Subject: [PATCH 06/14] chore(query): add more asserts (#16536) * add tests * add tests --- src/query/catalog/src/plan/stream_column.rs | 5 +---- src/query/expression/src/block.rs | 14 ++++++++++++-- src/query/expression/src/kernels/filter.rs | 13 ++++++++++++- src/query/expression/src/kernels/sort_compare.rs | 2 ++ 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/src/query/catalog/src/plan/stream_column.rs b/src/query/catalog/src/plan/stream_column.rs index fd10a1c80bd2..40cdc1d0db72 100644 --- 
a/src/query/catalog/src/plan/stream_column.rs +++ b/src/query/catalog/src/plan/stream_column.rs @@ -126,10 +126,7 @@ impl StreamColumnMeta { } pub fn build_origin_block_row_num(num_rows: usize) -> BlockEntry { - let mut row_ids = Vec::with_capacity(num_rows); - for i in 0..num_rows { - row_ids.push(i as u64); - } + let row_ids = (0..num_rows as u64).collect(); let column = Value::Column(UInt64Type::from_data(row_ids)); BlockEntry::new( diff --git a/src/query/expression/src/block.rs b/src/query/expression/src/block.rs index 7911dd33d14c..820d5841d4a3 100644 --- a/src/query/expression/src/block.rs +++ b/src/query/expression/src/block.rs @@ -117,7 +117,6 @@ impl DataBlock { num_rows: usize, meta: Option, ) -> Self { - #[cfg(debug_assertions)] Self::check_columns_valid(&columns, num_rows).unwrap(); Self { @@ -130,6 +129,7 @@ impl DataBlock { fn check_columns_valid(columns: &[BlockEntry], num_rows: usize) -> Result<()> { for entry in columns.iter() { if let Value::Column(c) = &entry.value { + #[cfg(debug_assertions)] c.check_valid()?; if c.len() != num_rows { return Err(ErrorCode::Internal(format!( @@ -264,6 +264,12 @@ impl DataBlock { } pub fn slice(&self, range: Range) -> Self { + assert!( + range.end <= self.num_rows(), + "range {:?} out of len {}", + range, + self.num_rows() + ); let columns = self .columns() .iter() @@ -279,7 +285,11 @@ impl DataBlock { .collect(); Self { columns, - num_rows: range.end - range.start, + num_rows: if range.is_empty() { + 0 + } else { + range.end - range.start + }, meta: self.meta.clone(), } } diff --git a/src/query/expression/src/kernels/filter.rs b/src/query/expression/src/kernels/filter.rs index 053d7842b8cf..1b730323a24f 100644 --- a/src/query/expression/src/kernels/filter.rs +++ b/src/query/expression/src/kernels/filter.rs @@ -112,6 +112,7 @@ struct FilterVisitor<'a> { filter: &'a Bitmap, result: Option>, num_rows: usize, + original_rows: usize, } impl<'a> FilterVisitor<'a> { @@ -121,6 +122,7 @@ impl<'a> FilterVisitor<'a> { filter, result: None, num_rows, + original_rows: filter.len(), } } } @@ -130,6 +132,8 @@ impl<'a> ValueVisitor for FilterVisitor<'a> { match value { Value::Scalar(c) => self.visit_scalar(c), Value::Column(c) => { + assert!(c.len() == self.original_rows); + if c.len() == self.num_rows || c.len() == 0 { self.result = Some(Value::Column(c)); } else if self.num_rows == 0 { @@ -255,7 +259,14 @@ impl<'a> ValueVisitor for FilterVisitor<'a> { Ok(()) } - fn visit_boolean(&mut self, bitmap: Bitmap) -> Result<()> { + fn visit_boolean(&mut self, mut bitmap: Bitmap) -> Result<()> { + // faster path for all bits set + if bitmap.unset_bits() == 0 { + bitmap.slice(0, self.num_rows); + self.result = Some(Value::Column(BooleanType::upcast_column(bitmap))); + return Ok(()); + } + let capacity = self.num_rows.saturating_add(7) / 8; let mut builder: Vec = Vec::with_capacity(capacity); let mut builder_ptr = builder.as_mut_ptr(); diff --git a/src/query/expression/src/kernels/sort_compare.rs b/src/query/expression/src/kernels/sort_compare.rs index f492ba7178c9..45d91f0098d0 100644 --- a/src/query/expression/src/kernels/sort_compare.rs +++ b/src/query/expression/src/kernels/sort_compare.rs @@ -263,6 +263,7 @@ impl ValueVisitor for SortCompare { // faster path for numeric fn visit_number(&mut self, column: Buffer) -> Result<()> { let values = column.as_slice(); + assert!(values.len() == self.rows); self.generic_sort(values, |c, idx| c[idx as usize], |a: T, b: T| a.cmp(&b)); Ok(()) } @@ -276,6 +277,7 @@ impl ValueVisitor for SortCompare { } fn 
visit_typed_column(&mut self, col: T::Column) -> Result<()> { + assert!(T::column_len(&col) == self.rows); self.generic_sort( &col, |c, idx| -> T::ScalarRef<'_> { unsafe { T::index_column_unchecked(c, idx as _) } }, From 4129816b5e4fc72ce688879599dc2252e70058ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=82=8E=E6=B3=BC?= Date: Sun, 29 Sep 2024 08:47:22 +0800 Subject: [PATCH 07/14] refactor: SchemaAPI::do_get_table_history (#16540) --- src/meta/api/src/schema_api_impl.rs | 116 +++++++++++----------------- 1 file changed, 44 insertions(+), 72 deletions(-) diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index 404e6b7f1cf2..88304f4432a1 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -3528,56 +3528,6 @@ fn build_upsert_table_deduplicated_label(deduplicated_label: String) -> TxnOp { ) } -#[allow(clippy::type_complexity)] -#[logcall::logcall(input = "")] -#[fastrace::trace] -async fn batch_filter_table_info( - kv_api: &(impl kvapi::KVApi + ?Sized), - args: &[( - Range>>, - &Arc, - u64, - String, - )], - filter_tb_infos: &mut Vec<(Arc, u64)>, -) -> Result<(), KVAppError> { - let table_id_idents = args - .iter() - .map(|(_f, _db, table_id, _table_name)| TableId::new(*table_id)); - - let seq_metas = kv_api.get_pb_values_vec(table_id_idents).await?; - - for (seq_meta, (rng, db_info, table_id, table_name)) in seq_metas.into_iter().zip(args.iter()) { - let Some(seq_meta) = seq_meta else { - error!( - "batch_filter_table_info cannot find {:?} table_meta", - table_id - ); - continue; - }; - - if !rng.contains(&seq_meta.data.drop_on) { - continue; - } - - let tb_info = TableInfo { - ident: TableIdent { - table_id: *table_id, - seq: seq_meta.seq, - }, - desc: format!("'{}'.'{}'", db_info.name_ident.database_name(), table_name,), - name: (*table_name).clone(), - meta: seq_meta.data, - db_type: DatabaseType::NormalDB, - catalog_info: Default::default(), - }; - - filter_tb_infos.push((Arc::new(tb_info), db_info.database_id.db_id)); - } - - Ok(()) -} - #[logcall::logcall(input = "")] #[fastrace::trace] async fn do_get_table_history( @@ -3595,25 +3545,48 @@ async fn do_get_table_history( let dir_name = DirName::new(dbid_tbname_idlist); let table_history_kvs = kv_api.list_pb_vec(&dir_name).await?; - let mut the_list = vec![]; + let mut args = vec![]; for (ident, table_history) in table_history_kvs { for table_id in table_history.id_list.iter() { - the_list.push(( - drop_time_range.clone(), - &db_info, - *table_id, - ident.table_name.clone(), - )); + args.push((TableId::new(*table_id), ident.table_name.clone())); } } let mut filter_tb_infos = vec![]; - for c in the_list[..std::cmp::min(limit, the_list.len())].chunks(DEFAULT_MGET_SIZE) { - let mut infos = vec![]; - batch_filter_table_info(kv_api, c, &mut infos).await?; - filter_tb_infos.extend(infos); + for chunk in args[..std::cmp::min(limit, args.len())].chunks(DEFAULT_MGET_SIZE) { + let table_id_idents = chunk.iter().map(|(table_id, _)| table_id.clone()); + + let seq_metas = kv_api.get_pb_values_vec(table_id_idents).await?; + + for (seq_meta, (table_id, table_name)) in seq_metas.into_iter().zip(chunk.iter()) { + let Some(seq_meta) = seq_meta else { + error!( + "batch_filter_table_info cannot find {:?} table_meta", + table_id + ); + continue; + }; + + if !drop_time_range.contains(&seq_meta.data.drop_on) { + continue; + } + + let tb_info = TableInfo { + ident: TableIdent { + table_id: table_id.table_id, + seq: seq_meta.seq, + }, + desc: format!("'{}'.'{}'", 
db_info.name_ident.database_name(), table_name,), + name: (*table_name).clone(), + meta: seq_meta.data, + db_type: DatabaseType::NormalDB, + catalog_info: Default::default(), + }; + + filter_tb_infos.push((Arc::new(tb_info), db_info.database_id.db_id)); + } } Ok(filter_tb_infos) @@ -3681,7 +3654,7 @@ async fn gc_dropped_db_by_id( // TODO: mark table as gc_in_progress remove_copied_files_for_dropped_table(kv_api, &table_id_ident).await?; - remove_data_for_dropped_table(kv_api, &table_id_ident, &mut txn).await?; + let _ = remove_data_for_dropped_table(kv_api, &table_id_ident, &mut txn).await?; remove_index_for_dropped_table(kv_api, tenant, &table_id_ident, &mut txn).await?; } @@ -3731,7 +3704,7 @@ async fn gc_dropped_table_by_id( let mut txn = TxnRequest::default(); // 1) - remove_data_for_dropped_table(kv_api, table_id_ident, &mut txn).await?; + let _ = remove_data_for_dropped_table(kv_api, table_id_ident, &mut txn).await?; // 2) let table_id_history_ident = TableIdHistoryIdent { @@ -3814,21 +3787,20 @@ async fn remove_data_for_dropped_table( kv_api: &(impl kvapi::KVApi + ?Sized), table_id: &TableId, txn: &mut TxnRequest, -) -> Result<(), KVAppError> { +) -> Result, MetaError> { let seq_meta = kv_api.get_pb(table_id).await?; let Some(seq_meta) = seq_meta else { - error!( - "gc_dropped_table_by_id cannot find {:?} table_meta", - table_id - ); - return Ok(()); + let err = format!("cannot find TableMeta by id: {:?}, ", table_id); + error!("{}", err); + return Ok(Err(err)); }; // TODO: enable this check. Currently when gc db, the table may not be dropped. // if seq_meta.data.drop_on.is_none() { - // warn!("gc_dropped_table_by_id {:?} is not dropped", table_id); - // return Ok(()); + // let err = format!("Table {:?} is not dropped, can not remove", table_id); + // warn!("{}", err); + // return Ok(Err(err)); // } txn_delete_exact(txn, table_id, seq_meta.seq); @@ -3844,7 +3816,7 @@ async fn remove_data_for_dropped_table( txn_delete_exact(txn, &id_to_name, seq_name.seq); } - Ok(()) + Ok(Ok(())) } async fn remove_index_for_dropped_table( From 5bfafbeb055dc19c8a565e44366b14595ba6717a Mon Sep 17 00:00:00 2001 From: arkzuse <88483186+arkzuse@users.noreply.github.com> Date: Sun, 29 Sep 2024 07:56:41 +0530 Subject: [PATCH 08/14] feat: short sql length setting (#16502) feature: short sql length setting --- src/common/base/src/base/string.rs | 9 +++---- src/common/base/tests/it/string.rs | 25 ++++++++++++++----- .../service/src/interpreters/interpreter.rs | 5 +++- .../src/servers/http/clickhouse_handler.rs | 11 +++++++- .../src/servers/http/v1/query/http_query.rs | 2 +- .../service/src/sessions/query_ctx_shared.rs | 7 +++++- src/query/settings/src/settings_default.rs | 6 +++++ .../settings/src/settings_getter_setter.rs | 8 ++++++ src/query/settings/tests/it/setting.rs | 19 ++++++++++++++ 9 files changed, 77 insertions(+), 15 deletions(-) diff --git a/src/common/base/src/base/string.rs b/src/common/base/src/base/string.rs index e240e2630d13..0b462d0d4c93 100644 --- a/src/common/base/src/base/string.rs +++ b/src/common/base/src/base/string.rs @@ -193,8 +193,7 @@ pub fn mask_connection_info(sql: &str) -> String { /// Maximum length of the SQL query to be displayed or log. /// If the query exceeds this length and starts with keywords, /// it will be truncated and appended with the remaining length. 
-pub fn short_sql(sql: String) -> String { - const MAX_LENGTH: usize = 128; +pub fn short_sql(sql: String, max_length: u64) -> String { let keywords = ["INSERT"]; fn starts_with_any(query: &str, keywords: &[&str]) -> bool { @@ -209,10 +208,10 @@ pub fn short_sql(sql: String) -> String { // of multiple Unicode code points. // This ensures that we handle complex characters like emojis or // accented characters properly. - if query.graphemes(true).count() > MAX_LENGTH && starts_with_any(query, &keywords) { - let truncated: String = query.graphemes(true).take(MAX_LENGTH).collect(); + if query.graphemes(true).count() > max_length as usize && starts_with_any(query, &keywords) { + let truncated: String = query.graphemes(true).take(max_length as usize).collect(); let original_length = query.graphemes(true).count(); - let remaining_length = original_length.saturating_sub(MAX_LENGTH); + let remaining_length = original_length.saturating_sub(max_length as usize); // Append the remaining length indicator truncated + &format!("...[{} more characters]", remaining_length) } else { diff --git a/src/common/base/tests/it/string.rs b/src/common/base/tests/it/string.rs index 4b49c57f2673..5cdf951f7b4d 100644 --- a/src/common/base/tests/it/string.rs +++ b/src/common/base/tests/it/string.rs @@ -111,9 +111,10 @@ fn test_mask_connection_info() { #[test] fn test_short_sql() { + let max_length: u64 = 128; // Test case 1: SQL query shorter than 128 characters let sql1 = "SELECT * FROM users WHERE id = 1;".to_string(); - assert_eq!(short_sql(sql1.clone()), sql1); + assert_eq!(short_sql(sql1.clone(), max_length), sql1); // Test case 2: SQL query longer than 128 characters and starts with "INSERT" let long_sql_insert = "INSERT INTO users (id, name, email) VALUES ".to_string() @@ -123,23 +124,32 @@ fn test_short_sql() { let truncated: String = long_sql_insert.graphemes(true).take(128).collect(); truncated + &format!("...[{} more characters]", expected_length_insert) }; - assert_eq!(short_sql(long_sql_insert.clone()), expected_result_insert); + assert_eq!( + short_sql(long_sql_insert.clone(), max_length), + expected_result_insert + ); // Test case 3: SQL query longer than 128 characters but does not start with "INSERT" let long_sql_update = "UPDATE users SET name = 'John' WHERE id = 1;".to_string() + &"id = 1 OR ".repeat(20); // Make sure this creates a string longer than 128 characters - assert_eq!(short_sql(long_sql_update.clone()), long_sql_update); + assert_eq!( + short_sql(long_sql_update.clone(), max_length), + long_sql_update + ); // Test case 4: Empty SQL query let empty_sql = "".to_string(); - assert_eq!(short_sql(empty_sql.clone()), empty_sql); + assert_eq!(short_sql(empty_sql.clone(), max_length), empty_sql); // Test case 5: SQL query with leading whitespace let sql_with_whitespace = " INSERT INTO users (id, name, email) VALUES (1, 'John Doe', 'john@example.com');" .to_string(); let trimmed_sql = sql_with_whitespace.trim_start().to_string(); - assert_eq!(short_sql(sql_with_whitespace.clone()), trimmed_sql); + assert_eq!( + short_sql(sql_with_whitespace.clone(), max_length), + trimmed_sql + ); // Test case 6: SQL query with multiple emojis to test truncation at an emoji point let emoji_sql = "INSERT INTO users (id, name) VALUES (1, 'John Doe 😊😊😊😊😊😊😊😊😊😊');" @@ -150,5 +160,8 @@ fn test_short_sql() { let remaining_length = emoji_sql.graphemes(true).count().saturating_sub(128); truncated + &format!("...[{} more characters]", remaining_length) }; - assert_eq!(short_sql(emoji_sql.clone()), expected_emoji_result); + 
assert_eq!( + short_sql(emoji_sql.clone(), max_length), + expected_emoji_result + ); } diff --git a/src/query/service/src/interpreters/interpreter.rs b/src/query/service/src/interpreters/interpreter.rs index fa1c9e311377..c74892dc32a0 100644 --- a/src/query/service/src/interpreters/interpreter.rs +++ b/src/query/service/src/interpreters/interpreter.rs @@ -207,7 +207,10 @@ pub async fn interpreter_plan_sql(ctx: Arc, sql: &str) -> Result<( Arc::new(ServiceQueryExecutor::new(ctx.clone())), ); let result = planner.plan_sql(sql).await; - let short_sql = short_sql(sql.to_string()); + let short_sql = short_sql( + sql.to_string(), + ctx.get_settings().get_short_sql_max_length()?, + ); let mut stmt = if let Ok((_, extras)) = &result { Some(extras.statement.clone()) } else { diff --git a/src/query/service/src/servers/http/clickhouse_handler.rs b/src/query/service/src/servers/http/clickhouse_handler.rs index 8020159f1d14..22f41136144f 100644 --- a/src/query/service/src/servers/http/clickhouse_handler.rs +++ b/src/query/service/src/servers/http/clickhouse_handler.rs @@ -330,7 +330,16 @@ pub async fn clickhouse_handler_post( // other parts of the request already logged in middleware let len = sql.len(); let msg = if len > n { - format!("{}...(omit {} bytes)", short_sql(sql.clone()), len - n) + format!( + "{}...(omit {} bytes)", + short_sql( + sql.clone(), + ctx.get_settings() + .get_short_sql_max_length() + .unwrap_or(1000) + ), + len - n + ) } else { sql.to_string() }; diff --git a/src/query/service/src/servers/http/v1/query/http_query.rs b/src/query/service/src/servers/http/v1/query/http_query.rs index 8641e665392f..f714a72b9ba0 100644 --- a/src/query/service/src/servers/http/v1/query/http_query.rs +++ b/src/query/service/src/servers/http/v1/query/http_query.rs @@ -130,7 +130,7 @@ impl Debug for HttpQueryRequest { f.debug_struct("HttpQueryRequest") .field("session_id", &self.session_id) .field("session", &self.session) - .field("sql", &short_sql(self.sql.clone())) + .field("sql", &short_sql(self.sql.clone(), 1000)) .field("pagination", &self.pagination) .field("string_fields", &self.string_fields) .field("stage_attachment", &self.stage_attachment) diff --git a/src/query/service/src/sessions/query_ctx_shared.rs b/src/query/service/src/sessions/query_ctx_shared.rs index 55c1f44bdd7a..cb073e69e772 100644 --- a/src/query/service/src/sessions/query_ctx_shared.rs +++ b/src/query/service/src/sessions/query_ctx_shared.rs @@ -487,7 +487,12 @@ impl QueryContextShared { pub fn attach_query_str(&self, kind: QueryKind, query: String) { { let mut running_query = self.running_query.write(); - *running_query = Some(short_sql(query)); + *running_query = Some(short_sql( + query, + self.get_settings() + .get_short_sql_max_length() + .unwrap_or(1000), + )); } { diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index bef9c5ae9dbc..a1a6ddb005f1 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -909,6 +909,12 @@ impl DefaultSettings { mode: SettingMode::Both, range: Some(SettingRange::Numeric(0..=u64::MAX)), }), + ("short_sql_max_length", DefaultSettingValue { + value: UserSettingValue::UInt64(128), + desc: "Sets the maximum length for truncating SQL queries in short_sql function.", + mode: SettingMode::Both, + range: Some(SettingRange::Numeric(1..=1024*1024)), + }), ]); Ok(Arc::new(DefaultSettings { diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs 
index 521fa92c11d4..e54806e1550f 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -748,4 +748,12 @@ impl Settings { pub fn get_max_spill_io_requests(&self) -> Result { self.try_get_u64("max_spill_io_requests") } + + pub fn get_short_sql_max_length(&self) -> Result { + self.try_get_u64("short_sql_max_length") + } + + pub fn set_short_sql_max_length(&self, val: u64) -> Result<()> { + self.try_set_u64("short_sql_max_length", val) + } } diff --git a/src/query/settings/tests/it/setting.rs b/src/query/settings/tests/it/setting.rs index b318034a0d79..f99634773add 100644 --- a/src/query/settings/tests/it/setting.rs +++ b/src/query/settings/tests/it/setting.rs @@ -98,6 +98,25 @@ async fn test_set_settings() { let expect = "WrongValueForVariable. Code: 2803, Text = Value xx is not within the allowed values [\"None\", \"LZ4\", \"ZSTD\"]."; assert_eq!(expect, format!("{}", result.unwrap_err())); } + + // Number Range + { + // Ok + settings + .set_setting("short_sql_max_length".to_string(), "1000".to_string()) + .unwrap(); + + // Range 1024*1024 + let result = + settings.set_setting("short_sql_max_length".to_string(), "1048577".to_string()); + let expect = "WrongValueForVariable. Code: 2803, Text = Value 1048577 is not within the range [1, 1048576]."; + assert_eq!(expect, format!("{}", result.unwrap_err())); + + // Range 1 + let result = settings.set_setting("short_sql_max_length".to_string(), "0".to_string()); + let expect = "WrongValueForVariable. Code: 2803, Text = Value 0 is not within the range [1, 1048576]."; + assert_eq!(expect, format!("{}", result.unwrap_err())); + } } #[tokio::test(flavor = "multi_thread", worker_threads = 1)] From de5a3d6adb4b4cfd20dec1cd401e3bdbaa5192b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=82=8E=E6=B3=BC?= Date: Sun, 29 Sep 2024 15:50:53 +0800 Subject: [PATCH 09/14] refactor: `DroppedId` for listing db/tables for gc (#16542) There is no need to store tables inside `DroppedId::Db`: the tables belonging to a DB for gc can still be stored in `DroppedId::Table`. --- src/meta/api/src/schema_api_impl.rs | 44 +++--- src/meta/api/src/schema_api_test_suite.rs | 145 +++++++----------- src/meta/app/src/schema/table.rs | 11 +- .../interpreter_vacuum_drop_tables.rs | 16 +- 4 files changed, 77 insertions(+), 139 deletions(-) diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index 88304f4432a1..877ffdcb35f2 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -2678,31 +2678,25 @@ impl + ?Sized> SchemaApi for KV { drop_time_range.clone() }; - let db_filter = (table_drop_time_range, db_info.clone()); - let capacity = the_limit - vacuum_table_infos.len(); - let table_infos = do_get_table_history(self, db_filter, capacity).await?; + let table_infos = + do_get_table_history(self, table_drop_time_range, db_info.clone(), capacity) + .await?; + + for (table_info, db_id) in table_infos.iter() { + vacuum_ids.push(DroppedId::new_table( + *db_id, + table_info.ident.table_id, + table_info.name.clone(), + )); + } // A DB can be removed only when all its tables are removed. 
if vacuum_db && capacity > table_infos.len() { vacuum_ids.push(DroppedId::Db { db_id: db_info.database_id.db_id, db_name: db_info.name_ident.database_name().to_string(), - tables: table_infos - .iter() - .map(|(table_info, _)| { - (table_info.ident.table_id, table_info.name.clone()) - }) - .collect(), }); - } else { - for (table_info, db_id) in table_infos.iter().take(capacity) { - vacuum_ids.push(DroppedId::new_table( - *db_id, - table_info.ident.table_id, - table_info.name.clone(), - )); - } } vacuum_table_infos.extend( @@ -2742,8 +2736,8 @@ impl + ?Sized> SchemaApi for KV { name_ident: tenant_dbname.clone(), meta: db_meta, }); - let db_filter = (drop_time_range.clone(), db_info); - let table_infos = do_get_table_history(self, db_filter, the_limit).await?; + let table_infos = + do_get_table_history(self, drop_time_range.clone(), db_info, the_limit).await?; let mut drop_ids = vec![]; let mut drop_table_infos = vec![]; @@ -2766,11 +2760,9 @@ impl + ?Sized> SchemaApi for KV { async fn gc_drop_tables(&self, req: GcDroppedTableReq) -> Result<(), KVAppError> { for drop_id in req.drop_ids { match drop_id { - DroppedId::Db { - db_id, - db_name, - tables: _, - } => gc_dropped_db_by_id(self, db_id, &req.tenant, db_name).await?, + DroppedId::Db { db_id, db_name } => { + gc_dropped_db_by_id(self, db_id, &req.tenant, db_name).await? + } DroppedId::Table { name, id } => { gc_dropped_table_by_id(self, &req.tenant, &name, &id).await? } @@ -3532,10 +3524,10 @@ fn build_upsert_table_deduplicated_label(deduplicated_label: String) -> TxnOp { #[fastrace::trace] async fn do_get_table_history( kv_api: &(impl kvapi::KVApi + ?Sized), - db_filter: (Range>>, Arc), + drop_time_range: Range>>, + db_info: Arc, limit: usize, ) -> Result, u64)>, KVAppError> { - let (drop_time_range, db_info) = db_filter; let db_id = db_info.database_id.db_id; let dbid_tbname_idlist = TableIdHistoryIdent { diff --git a/src/meta/api/src/schema_api_test_suite.rs b/src/meta/api/src/schema_api_test_suite.rs index e17b6b752288..9f6e253cd309 100644 --- a/src/meta/api/src/schema_api_test_suite.rs +++ b/src/meta/api/src/schema_api_test_suite.rs @@ -4071,8 +4071,9 @@ impl SchemaApiTestSuite { }; let created_on = Utc::now(); - let mut drop_ids_1 = vec![]; - let mut drop_ids_2 = vec![]; + // The expected drop_ids built with and without retention time boundary + let mut drop_ids_boundary = vec![]; + let mut drop_ids_no_boundary = vec![]; // first create a database drop within filter time info!("--- create db1"); @@ -4081,42 +4082,38 @@ impl SchemaApiTestSuite { let req = CreateDatabaseReq { create_option: CreateOption::Create, name_ident: db_name.clone(), - meta: DatabaseMeta { - engine: "".to_string(), - ..DatabaseMeta::default() - }, + meta: DatabaseMeta::default(), }; let res = mt.create_database(req).await?; - drop_ids_1.push(DroppedId::Db { - db_id: *res.db_id, - db_name: db_name.database_name().to_string(), - tables: vec![], - }); - drop_ids_2.push(DroppedId::Db { - db_id: *res.db_id, - db_name: db_name.database_name().to_string(), - tables: vec![], - }); + let db1_id = res.db_id.db_id; let req = CreateTableReq { create_option: CreateOption::Create, - name_ident: TableNameIdent { - tenant: Tenant::new_or_err(tenant_name, func_name!())?, - db_name: "db1".to_string(), - table_name: "tb1".to_string(), - }, - + name_ident: TableNameIdent::new(&tenant, "db1", "tb1"), table_meta: table_meta(created_on), as_dropped: false, }; - let _resp = mt.create_table(req.clone()).await?; + let resp = mt.create_table(req.clone()).await?; + let db1_tb1_id = 
resp.table_id; mt.drop_database(DropDatabaseReq { if_exists: false, name_ident: DatabaseNameIdent::new(&tenant, "db1"), }) .await?; + + drop_ids_boundary.push(DroppedId::new_table(db1_id, db1_tb1_id, "tb1")); + drop_ids_boundary.push(DroppedId::Db { + db_id: db1_id, + db_name: db_name.database_name().to_string(), + }); + + drop_ids_no_boundary.push(DroppedId::new_table(db1_id, db1_tb1_id, "tb1")); + drop_ids_no_boundary.push(DroppedId::Db { + db_id: db1_id, + db_name: db_name.database_name().to_string(), + }); } // second create a database drop outof filter time, but has a table drop within filter time @@ -4125,18 +4122,14 @@ impl SchemaApiTestSuite { let create_db_req = CreateDatabaseReq { create_option: CreateOption::Create, name_ident: DatabaseNameIdent::new(&tenant, "db2"), - meta: DatabaseMeta { - engine: "".to_string(), - ..DatabaseMeta::default() - }, + meta: Default::default(), }; let res = mt.create_database(create_db_req.clone()).await?; - let db_id = res.db_id; - drop_ids_2.push(DroppedId::Db { - db_id: *db_id, + let db2_id = res.db_id; + drop_ids_no_boundary.push(DroppedId::Db { + db_id: *db2_id, db_name: "db2".to_string(), - tables: vec![], }); info!("--- create and drop db2.tb1"); @@ -4153,16 +4146,21 @@ impl SchemaApiTestSuite { as_dropped: false, }; let resp = mt.create_table(req.clone()).await?; - drop_ids_1.push(DroppedId::new_table( + drop_ids_boundary.push(DroppedId::new_table( *res.db_id, resp.table_id, table_name.table_name.clone(), )); + drop_ids_no_boundary.push(DroppedId::new_table( + *db2_id, + resp.table_id, + table_name.table_name.clone(), + )); mt.drop_table_by_id(DropTableByIdReq { if_exists: false, tenant: req.name_ident.tenant.clone(), - db_id: *db_id, + db_id: *db2_id, table_name: req.name_ident.table_name.clone(), tb_id: resp.table_id, engine: "FUSE".to_string(), @@ -4176,12 +4174,7 @@ impl SchemaApiTestSuite { let mut table_meta = table_meta(created_on); let req = CreateTableReq { create_option: CreateOption::Create, - name_ident: TableNameIdent { - tenant: Tenant::new_or_err(tenant_name, func_name!())?, - db_name: "db2".to_string(), - table_name: "tb2".to_string(), - }, - + name_ident: TableNameIdent::new(&tenant, "db2", "tb2"), table_meta: table_meta.clone(), as_dropped: false, }; @@ -4189,7 +4182,7 @@ impl SchemaApiTestSuite { mt.drop_table_by_id(DropTableByIdReq { if_exists: false, tenant: req.name_ident.tenant.clone(), - db_id: *db_id, + db_id: *db2_id, table_name: req.name_ident.table_name.clone(), tb_id: resp.table_id, engine: "FUSE".to_string(), @@ -4201,22 +4194,25 @@ impl SchemaApiTestSuite { table_meta.drop_on = Some(created_on + Duration::seconds(100)); let data = serialize_struct(&table_meta)?; upsert_test_data(mt.as_kv_api(), &id_key, data).await?; + + drop_ids_no_boundary.push(DroppedId::new_table( + *db2_id, + resp.table_id, + "tb2".to_string(), + )); } info!("--- create db2.tb3"); + let db2_tb3_id; { let req = CreateTableReq { create_option: CreateOption::Create, - name_ident: TableNameIdent { - tenant: Tenant::new_or_err(tenant_name, func_name!())?, - db_name: "db2".to_string(), - table_name: "tb3".to_string(), - }, - + name_ident: TableNameIdent::new(&tenant, "db2", "tb3"), table_meta: table_meta(created_on), as_dropped: false, }; - let _resp = mt.create_table(req.clone()).await?; + let resp = mt.create_table(req.clone()).await?; + db2_tb3_id = resp.table_id; } mt.drop_database(DropDatabaseReq { @@ -4227,9 +4223,11 @@ impl SchemaApiTestSuite { // change db meta to make this db drop time outof filter time let mut drop_db_meta = 
create_db_req.meta.clone(); drop_db_meta.drop_on = Some(created_on + Duration::seconds(100)); - let id_key = db_id; + let id_key = db2_id; let data = serialize_struct(&drop_db_meta)?; upsert_test_data(mt.as_kv_api(), &id_key, data).await?; + + drop_ids_no_boundary.push(DroppedId::new_table(*db2_id, db2_tb3_id, "tb3".to_string())); } // third create a database not dropped, but has a table drop within filter time @@ -4237,10 +4235,7 @@ impl SchemaApiTestSuite { let create_db_req = CreateDatabaseReq { create_option: CreateOption::Create, name_ident: DatabaseNameIdent::new(&tenant, "db3"), - meta: DatabaseMeta { - engine: "".to_string(), - ..DatabaseMeta::default() - }, + meta: Default::default(), }; let res = mt.create_database(create_db_req.clone()).await?; @@ -4250,26 +4245,13 @@ impl SchemaApiTestSuite { { let req = CreateTableReq { create_option: CreateOption::Create, - name_ident: TableNameIdent { - tenant: Tenant::new_or_err(tenant_name, func_name!())?, - db_name: "db3".to_string(), - table_name: "tb1".to_string(), - }, - + name_ident: TableNameIdent::new(&tenant, "db3", "tb1"), table_meta: table_meta(created_on), as_dropped: false, }; let resp = mt.create_table(req.clone()).await?; - drop_ids_1.push(DroppedId::new_table( - *db_id, - resp.table_id, - "tb1".to_string(), - )); - drop_ids_2.push(DroppedId::new_table( - *db_id, - resp.table_id, - "tb1".to_string(), - )); + drop_ids_boundary.push(DroppedId::new_table(*db_id, resp.table_id, "tb1")); + drop_ids_no_boundary.push(DroppedId::new_table(*db_id, resp.table_id, "tb1")); mt.drop_table_by_id(DropTableByIdReq { if_exists: false, tenant: req.name_ident.tenant.clone(), @@ -4287,21 +4269,12 @@ impl SchemaApiTestSuite { let mut table_meta = table_meta(created_on); let req = CreateTableReq { create_option: CreateOption::Create, - name_ident: TableNameIdent { - tenant: Tenant::new_or_err(tenant_name, func_name!())?, - db_name: "db3".to_string(), - table_name: "tb2".to_string(), - }, - + name_ident: TableNameIdent::new(&tenant, "db3", "tb2"), table_meta: table_meta.clone(), as_dropped: false, }; let resp = mt.create_table(req.clone()).await?; - drop_ids_2.push(DroppedId::new_table( - *db_id, - resp.table_id, - "tb2".to_string(), - )); + drop_ids_no_boundary.push(DroppedId::new_table(*db_id, resp.table_id, "tb2")); mt.drop_table_by_id(DropTableByIdReq { if_exists: false, tenant: req.name_ident.tenant.clone(), @@ -4323,12 +4296,7 @@ impl SchemaApiTestSuite { { let req = CreateTableReq { create_option: CreateOption::Create, - name_ident: TableNameIdent { - tenant: Tenant::new_or_err(tenant_name, func_name!())?, - db_name: "db3".to_string(), - table_name: "tb3".to_string(), - }, - + name_ident: TableNameIdent::new(&tenant, "db3", "tb3"), table_meta: table_meta(created_on), as_dropped: false, }; @@ -4348,7 +4316,7 @@ impl SchemaApiTestSuite { .map(|x| x.cmp_key()) .collect::>(); - let want = drop_ids_1 + let want = drop_ids_boundary .iter() .map(|x| x.cmp_key()) .collect::>(); @@ -4360,8 +4328,7 @@ impl SchemaApiTestSuite { "'db2'.'tb1'".to_string(), "'db3'.'tb1'".to_string(), ] - .iter() - .cloned() + .into_iter() .collect(); let actual: BTreeSet = resp .drop_table_infos @@ -4382,7 +4349,7 @@ impl SchemaApiTestSuite { .map(|x| x.cmp_key()) .collect::>(); - let want = drop_ids_2 + let want = drop_ids_no_boundary .iter() .map(|x| x.cmp_key()) .collect::>(); diff --git a/src/meta/app/src/schema/table.rs b/src/meta/app/src/schema/table.rs index 7542b8177ddd..0b4df049bbe5 100644 --- a/src/meta/app/src/schema/table.rs +++ 
b/src/meta/app/src/schema/table.rs @@ -984,15 +984,8 @@ impl ListDroppedTableReq { #[derive(Clone, Debug, PartialEq, Eq)] pub enum DroppedId { - Db { - db_id: u64, - db_name: String, - tables: Vec<(u64, String)>, - }, - Table { - name: DBIdTableName, - id: TableId, - }, + Db { db_id: u64, db_name: String }, + Table { name: DBIdTableName, id: TableId }, } impl DroppedId { diff --git a/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs b/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs index d1b87942fb07..bf6704d5fed4 100644 --- a/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs +++ b/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs @@ -172,23 +172,9 @@ impl Interpreter for VacuumDropTablesInterpreter { let mut success_dropped_ids = vec![]; for drop_id in drop_ids { match &drop_id { - DroppedId::Db { - db_id, - db_name, - tables, - } => { + DroppedId::Db { db_id: _, db_name } => { if !failed_dbs.contains(db_name) { success_dropped_ids.push(drop_id); - } else { - for (table_id, table_name) in tables.iter() { - if !failed_tables.contains(table_id) { - success_dropped_ids.push(DroppedId::new_table( - *db_id, - *table_id, - table_name.clone(), - )); - } - } } } DroppedId::Table { name: _, id } => { From dd45c198baebe945b5bfe3fb0b8a09965daff1b2 Mon Sep 17 00:00:00 2001 From: everpcpc Date: Sun, 29 Sep 2024 18:41:41 +0800 Subject: [PATCH 10/14] chore: update repo for workflows (#16546) * z * chore: update repo for workflows * z --- .github/ISSUE_TEMPLATE/20_bug_report.yml | 6 +++--- .github/ISSUE_TEMPLATE/config.yml | 2 +- .github/actions/publish_binary/action.yml | 4 ++-- .github/actions/publish_deb/action.yml | 10 +++++----- .github/actions/setup_bendsql/action.yml | 4 ++-- .github/scripts/notify_release.js | 2 +- .github/workflows/release.yml | 8 ++++---- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/20_bug_report.yml b/.github/ISSUE_TEMPLATE/20_bug_report.yml index e9c1e2ec7db7..9e7a44d69e56 100644 --- a/.github/ISSUE_TEMPLATE/20_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/20_bug_report.yml @@ -8,17 +8,17 @@ body: value: | Thank you very much for submitting feedback to Databend to help Databend develop better. - If it is an idea or help wanted, please go to: [Discussion](https://github.com/datafuselabs/databend/discussions) + If it is an idea or help wanted, please go to: [Discussion](https://github.com/databendlabs/databend/discussions) - type: checkboxes attributes: label: Search before asking description: > - Please make sure to search in the [issues](https://github.com/datafuselabs/databend/issues) first to see + Please make sure to search in the [issues](https://github.com/databendlabs/databend/issues) first to see whether the same issue was reported already. options: - label: > - I had searched in the [issues](https://github.com/datafuselabs/databend/issues) and found no similar + I had searched in the [issues](https://github.com/databendlabs/databend/issues) and found no similar issues. 
required: true diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 497a278cdfd5..c7fae142eb61 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,5 +1,5 @@ blank_issues_enabled: true contact_links: - name: Question - url: https://github.com/datafuselabs/databend/discussions/categories/q-a + url: https://github.com/databendlabs/databend/discussions/categories/q-a about: Please ask and answer questions on the discussions Q&A category. diff --git a/.github/actions/publish_binary/action.yml b/.github/actions/publish_binary/action.yml index 1220c7e7df15..f73fd02f419d 100644 --- a/.github/actions/publish_binary/action.yml +++ b/.github/actions/publish_binary/action.yml @@ -44,7 +44,7 @@ runs: if: inputs.category == 'default' run: | aws s3 cp ${{ steps.name.outputs.name }}.tar.gz s3://repo/databend/${{ inputs.version }}/${{ steps.name.outputs.name }}.tar.gz --no-progress - gh api /repos/datafuselabs/databend/tags > tags.json + gh api /repos/databendlabs/databend/tags > tags.json aws s3 cp ./tags.json s3://repo/databend/tags.json - gh api /repos/datafuselabs/databend/releases > releases.json + gh api /repos/databendlabs/databend/releases > releases.json aws s3 cp ./releases.json s3://repo/databend/releases.json diff --git a/.github/actions/publish_deb/action.yml b/.github/actions/publish_deb/action.yml index 19703fd07720..c64911b0a3e2 100644 --- a/.github/actions/publish_deb/action.yml +++ b/.github/actions/publish_deb/action.yml @@ -32,8 +32,8 @@ runs: version=${{ inputs.version }} deb_version=${version/-/.} deb_version=${deb_version/v/} - wget -q https://github.com/datafuselabs/databend/releases/download/${version}/databend_${deb_version}_amd64.deb - wget -q https://github.com/datafuselabs/databend/releases/download/${version}/databend_${deb_version}_arm64.deb + wget -q https://github.com/databendlabs/databend/releases/download/${version}/databend_${deb_version}_amd64.deb + wget -q https://github.com/databendlabs/databend/releases/download/${version}/databend_${deb_version}_arm64.deb reprepro includedeb stable databend_${deb_version}_amd64.deb reprepro includedeb stable databend_${deb_version}_arm64.deb @@ -41,10 +41,10 @@ runs: shell: bash working-directory: scripts/distribution/deb run: | - version=$(gh release view --repo datafuselabs/bendsql --json name | jq -r '.name') + version=$(gh release view --repo databendlabs/bendsql --json name | jq -r '.name') deb_version=${version/v/} - wget -q https://github.com/datafuselabs/bendsql/releases/download/${version}/bendsql_${deb_version}_amd64.deb - wget -q https://github.com/datafuselabs/bendsql/releases/download/${version}/bendsql_${deb_version}_arm64.deb + wget -q https://github.com/databendlabs/bendsql/releases/download/${version}/bendsql_${deb_version}_amd64.deb + wget -q https://github.com/databendlabs/bendsql/releases/download/${version}/bendsql_${deb_version}_arm64.deb reprepro includedeb stable bendsql_${deb_version}_amd64.deb reprepro includedeb stable bendsql_${deb_version}_arm64.deb diff --git a/.github/actions/setup_bendsql/action.yml b/.github/actions/setup_bendsql/action.yml index 6fc2daca1352..79718f8744d9 100644 --- a/.github/actions/setup_bendsql/action.yml +++ b/.github/actions/setup_bendsql/action.yml @@ -10,7 +10,7 @@ runs: if bendsql --version; then exit 0 fi - curl --retry 5 -Lo /tmp/bendsql.tar.gz https://github.com/datafuselabs/bendsql/releases/download/v0.18.3/bendsql-x86_64-unknown-linux-gnu.tar.gz + curl --retry 5 -Lo /tmp/bendsql.tar.gz 
https://github.com/databendlabs/bendsql/releases/download/v0.18.3/bendsql-x86_64-unknown-linux-gnu.tar.gz tar -xzf /tmp/bendsql.tar.gz -C /tmp mv /tmp/bendsql /usr/local/bin/bendsql bendsql --version @@ -21,7 +21,7 @@ runs: if bendsql --version; then exit 0 fi - curl --retry 5 -Lo /tmp/bendsql.tar.gz https://github.com/datafuselabs/bendsql/releases/download/v0.18.3/bendsql-x86_64-apple-darwin.tar.gz + curl --retry 5 -Lo /tmp/bendsql.tar.gz https://github.com/databendlabs/bendsql/releases/download/v0.18.3/bendsql-x86_64-apple-darwin.tar.gz tar -xzf /tmp/bendsql.tar.gz -C /tmp mv /tmp/bendsql /usr/local/bin/bendsql bendsql --version diff --git a/.github/scripts/notify_release.js b/.github/scripts/notify_release.js index f6646208d4a0..6463c1b986c9 100644 --- a/.github/scripts/notify_release.js +++ b/.github/scripts/notify_release.js @@ -43,7 +43,7 @@ module.exports = async ({ context, core }) => { { tag: "a", text: "Release Notes", - href: `https://github.com/datafuselabs/databend/releases/tag/${VERSION}`, + href: `https://github.com/databendlabs/databend/releases/tag/${VERSION}`, }, ], ], diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7f892ac3dd28..a08a02c1b58c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -71,7 +71,7 @@ jobs: - name: Checkout Docs uses: actions/checkout@v4 with: - repository: datafuselabs/databend-docs + repository: databendlabs/databend-docs ref: main - name: Get date shell: bash @@ -83,10 +83,10 @@ jobs: mkdir -p docs/release-stable df="docs/release-stable/${{ env.DATE }}_${{ needs.create_release.outputs.version }}.md" echo "---" > $df - gh release view --repo datafuselabs/databend ${{ needs.create_release.outputs.version }} >> $df + gh release view --repo databendlabs/databend ${{ needs.create_release.outputs.version }} >> $df sed -i -E 's/^--$/---/g' $df sed -i -E '/^asset:/d' $df - sed -i -E 's_https://github.com/datafuselabs/databend/pull/([0-9]+)_[#\1](https://github.com/datafuselabs/databend/pull/\1)_g' $df + sed -i -E 's_https://github.com/databendlabs/databend/pull/([0-9]+)_[#\1](https://github.com/databendlabs/databend/pull/\1)_g' $df git add docs/release-stable git status - uses: peter-evans/create-pull-request@v4 @@ -649,7 +649,7 @@ jobs: # - name: checkout share endpoint # uses: actions/checkout@v4 # with: - # repository: datafuselabs/share-endpoint + # repository: databendlabs/share-endpoint # token: ${{ secrets.DATABEND_BOT_TOKEN }} # path: share-endpoint # - name: Download artifacts From 0ed932bd060203568ebed40acb4acf11b02cdc67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=82=8E=E6=B3=BC?= Date: Mon, 30 Sep 2024 11:32:41 +0800 Subject: [PATCH 11/14] refactor: `get_history_tables_for_gc()` should not return `TableInfo`, but just table name, id and values (#16545) `SchemaApi` can not provide enough information to build a valid `TableInfo`, such as, it does not know about catalog type and database type. Therefore `get_history_tables_for_gc()` should just return `table name, table id and the table meta` to its caller to let the build a `TableInfo` if needed. 
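
A caller-side sketch of the intended usage (illustrative only: the `TableNIV` accessors and the `TableInfo::new` call follow the hunks below, but the wrapper function itself is hypothetical):

    // Sketch: rebuilding a TableInfo from the (name, id, value) triple
    // returned by get_history_tables_for_gc(). The db_name argument is
    // assumed to be supplied by the caller (e.g. the catalog layer),
    // which knows the database/catalog types that SchemaApi itself does not.
    fn table_info_from_niv(db_name: &str, niv: &TableNIV) -> TableInfo {
        TableInfo::new(
            db_name,
            &niv.name().table_name,
            TableIdent::new(niv.id().table_id, niv.value().seq),
            niv.value().data.clone(),
        )
    }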
--- src/meta/api/src/schema_api_impl.rs | 96 +++++++++++++++------------- src/meta/app/src/schema/mod.rs | 1 + src/meta/app/src/schema/table.rs | 12 ++++ src/meta/app/src/schema/table_niv.rs | 49 ++++++++++++++ 4 files changed, 112 insertions(+), 46 deletions(-) create mode 100644 src/meta/app/src/schema/table_niv.rs diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index 877ffdcb35f2..ffdf39541597 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -67,6 +67,7 @@ use databend_common_meta_app::schema::index_id_ident::IndexIdIdent; use databend_common_meta_app::schema::index_id_to_name_ident::IndexIdToNameIdent; use databend_common_meta_app::schema::index_name_ident::IndexName; use databend_common_meta_app::schema::least_visible_time_ident::LeastVisibleTimeIdent; +use databend_common_meta_app::schema::table_niv::TableNIV; use databend_common_meta_app::schema::CatalogIdToNameIdent; use databend_common_meta_app::schema::CatalogInfo; use databend_common_meta_app::schema::CatalogMeta; @@ -2679,32 +2680,34 @@ impl + ?Sized> SchemaApi for KV { }; let capacity = the_limit - vacuum_table_infos.len(); - let table_infos = - do_get_table_history(self, table_drop_time_range, db_info.clone(), capacity) - .await?; - - for (table_info, db_id) in table_infos.iter() { - vacuum_ids.push(DroppedId::new_table( - *db_id, - table_info.ident.table_id, - table_info.name.clone(), - )); + let table_nivs = get_history_tables_for_gc( + self, + table_drop_time_range, + db_info.database_id.db_id, + capacity, + ) + .await?; + + for table_niv in table_nivs.iter() { + vacuum_ids.push(DroppedId::from(table_niv.clone())); } // A DB can be removed only when all its tables are removed. - if vacuum_db && capacity > table_infos.len() { + if vacuum_db && capacity > table_nivs.len() { vacuum_ids.push(DroppedId::Db { db_id: db_info.database_id.db_id, db_name: db_info.name_ident.database_name().to_string(), }); } - vacuum_table_infos.extend( - table_infos - .iter() - .take(capacity) - .map(|(table_info, _)| table_info.clone()), - ); + vacuum_table_infos.extend(table_nivs.iter().take(capacity).map(|niv| { + Arc::new(TableInfo::new( + db_info.name_ident.database_name(), + &niv.name().table_name, + TableIdent::new(niv.id().table_id, niv.value().seq), + niv.value().data.clone(), + )) + })); } return Ok(ListDroppedTableResp { @@ -2736,18 +2739,26 @@ impl + ?Sized> SchemaApi for KV { name_ident: tenant_dbname.clone(), meta: db_meta, }); - let table_infos = - do_get_table_history(self, drop_time_range.clone(), db_info, the_limit).await?; + let table_nivs = get_history_tables_for_gc( + self, + drop_time_range.clone(), + db_info.database_id.db_id, + the_limit, + ) + .await?; + let mut drop_ids = vec![]; let mut drop_table_infos = vec![]; - for (table_info, db_id) in table_infos.iter().take(the_limit) { - drop_ids.push(DroppedId::new_table( - *db_id, - table_info.ident.table_id, - table_info.name.clone(), - )); - drop_table_infos.push(table_info.clone()); + for niv in table_nivs.iter() { + drop_ids.push(DroppedId::from(niv.clone())); + + drop_table_infos.push(Arc::new(TableInfo::new( + db_info.name_ident.database_name(), + &niv.name().table_name, + TableIdent::new(niv.id().table_id, niv.value().seq), + niv.value().data.clone(), + ))); } Ok(ListDroppedTableResp { @@ -3520,21 +3531,22 @@ fn build_upsert_table_deduplicated_label(deduplicated_label: String) -> TxnOp { ) } +/// Lists all dropped and non-dropped tables belonging to a Database, +/// returns those tables 
that are eligible for garbage collection, +/// i.e., whose dropped time is in the specified range. #[logcall::logcall(input = "")] #[fastrace::trace] -async fn do_get_table_history( +async fn get_history_tables_for_gc( kv_api: &(impl kvapi::KVApi + ?Sized), drop_time_range: Range>>, - db_info: Arc, + db_id: u64, limit: usize, -) -> Result, u64)>, KVAppError> { - let db_id = db_info.database_id.db_id; - - let dbid_tbname_idlist = TableIdHistoryIdent { +) -> Result, KVAppError> { + let ident = TableIdHistoryIdent { database_id: db_id, table_name: "dummy".to_string(), }; - let dir_name = DirName::new(dbid_tbname_idlist); + let dir_name = DirName::new(ident); let table_history_kvs = kv_api.list_pb_vec(&dir_name).await?; let mut args = vec![]; @@ -3565,19 +3577,11 @@ async fn do_get_table_history( continue; } - let tb_info = TableInfo { - ident: TableIdent { - table_id: table_id.table_id, - seq: seq_meta.seq, - }, - desc: format!("'{}'.'{}'", db_info.name_ident.database_name(), table_name,), - name: (*table_name).clone(), - meta: seq_meta.data, - db_type: DatabaseType::NormalDB, - catalog_info: Default::default(), - }; - - filter_tb_infos.push((Arc::new(tb_info), db_info.database_id.db_id)); + filter_tb_infos.push(TableNIV::new( + DBIdTableName::new(db_id, table_name.clone()), + table_id.clone(), + seq_meta, + )); } } diff --git a/src/meta/app/src/schema/mod.rs b/src/meta/app/src/schema/mod.rs index e7e12d9e7bce..81a6f5d72f09 100644 --- a/src/meta/app/src/schema/mod.rs +++ b/src/meta/app/src/schema/mod.rs @@ -28,6 +28,7 @@ pub mod index_id_to_name_ident; pub mod index_name_ident; pub mod least_visible_time_ident; pub mod table_lock_ident; +pub mod table_niv; pub mod virtual_column_ident; mod create_option; diff --git a/src/meta/app/src/schema/table.rs b/src/meta/app/src/schema/table.rs index 0b4df049bbe5..a7a6e48fc8d1 100644 --- a/src/meta/app/src/schema/table.rs +++ b/src/meta/app/src/schema/table.rs @@ -39,6 +39,7 @@ use super::CatalogInfo; use super::CreateOption; use super::DatabaseId; use crate::schema::database_name_ident::DatabaseNameIdent; +use crate::schema::table_niv::TableNIV; use crate::storage::StorageParams; use crate::tenant::Tenant; use crate::tenant::ToTenant; @@ -988,7 +989,18 @@ pub enum DroppedId { Table { name: DBIdTableName, id: TableId }, } +impl From for DroppedId { + fn from(value: TableNIV) -> Self { + let (name, id, _) = value.unpack(); + Self::Table { name, id } + } +} + impl DroppedId { + pub fn new_table_name_id(name: DBIdTableName, id: TableId) -> DroppedId { + DroppedId::Table { name, id } + } + pub fn new_table(db_id: u64, table_id: u64, table_name: impl ToString) -> DroppedId { DroppedId::Table { name: DBIdTableName::new(db_id, table_name), diff --git a/src/meta/app/src/schema/table_niv.rs b/src/meta/app/src/schema/table_niv.rs new file mode 100644 index 000000000000..744d869ce165 --- /dev/null +++ b/src/meta/app/src/schema/table_niv.rs @@ -0,0 +1,49 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use databend_common_meta_types::SeqV; + +use crate::schema::DBIdTableName; +use crate::schema::TableId; +use crate::schema::TableMeta; + +/// The **Name, ID, Value** for a table metadata stored in meta-service. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TableNIV { + name: DBIdTableName, + id: TableId, + value: SeqV, +} + +impl TableNIV { + pub fn new(name: DBIdTableName, id: TableId, value: SeqV) -> Self { + TableNIV { name, id, value } + } + + pub fn name(&self) -> &DBIdTableName { + &self.name + } + + pub fn id(&self) -> &TableId { + &self.id + } + + pub fn value(&self) -> &SeqV { + &self.value + } + + pub fn unpack(self) -> (DBIdTableName, TableId, SeqV) { + (self.name, self.id, self.value) + } +} From abd8266714d7ffc49b7d930d342f459a1bc0d2dd Mon Sep 17 00:00:00 2001 From: baishen Date: Mon, 30 Sep 2024 12:04:34 +0800 Subject: [PATCH 12/14] refactor(storage): improve inverted index match phrase query (#16547) * refactor(storage): improve inverted index match phrase query * fix typos * fix machete * fix machete --- Cargo.lock | 23 +- Cargo.toml | 6 +- .../arrow/src/arrow/array/list/mutable.rs | 4 +- src/common/arrow/src/arrow/offset.rs | 22 +- .../tests/it/inverted_index/index_refresh.rs | 4 +- .../transform_add_internal_columns.rs | 4 +- .../flight_sql/flight_sql_service/session.rs | 2 +- .../http/v1/session/refresh_handler.rs | 10 +- src/query/sql/src/planner/planner_cache.rs | 2 +- src/query/storages/common/index/Cargo.toml | 3 +- .../common/index/src/inverted_index.rs | 1076 ++++++++++------- src/query/storages/common/index/src/lib.rs | 5 +- .../inverted_index/inverted_index_loader.rs | 3 +- .../inverted_index/inverted_index_reader.rs | 235 ++-- .../src/io/write/inverted_index_writer.rs | 178 --- .../fuse/src/pruning/inverted_index_pruner.rs | 10 +- src/query/storages/stream/src/stream_table.rs | 9 +- 17 files changed, 825 insertions(+), 771 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 81df0e0cbfb4..f8cdc9ca06e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5396,7 +5396,6 @@ version = "0.1.0" dependencies = [ "anyerror", "cbordata", - "crc32fast", "criterion", "databend-common-arrow", "databend-common-ast", @@ -5411,8 +5410,8 @@ dependencies = [ "match-template", "parquet", "rand 0.8.5", + "roaring", "serde", - "serde_json", "tantivy", "tantivy-common", "tantivy-fst", @@ -11086,7 +11085,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "stable_deref_trait", ] @@ -14770,7 +14769,7 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" version = "0.22.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "aho-corasick", "arc-swap", @@ -14820,7 +14819,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "bitpacking 0.9.2", ] @@ -14828,7 +14827,7 
@@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "downcast-rs", "fastdivide", @@ -14843,7 +14842,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "async-trait", "byteorder", @@ -14866,7 +14865,7 @@ dependencies = [ [[package]] name = "tantivy-jieba" version = "0.11.0" -source = "git+https://github.com/datafuse-extras/tantivy-jieba?rev=124a8fc#124a8fc8c8a9f1389af5a9bfa497fb358ecc556e" +source = "git+https://github.com/datafuse-extras/tantivy-jieba?rev=0e300e9#0e300e9085651b7e6659dfcc7b0ea0fa9cab09c2" dependencies = [ "jieba-rs", "lazy_static", @@ -14876,7 +14875,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "nom", ] @@ -14884,7 +14883,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "tantivy-bitpacker", "tantivy-common", @@ -14895,7 +14894,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "murmurhash32", "rand_distr", @@ -14905,7 +14904,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "serde", ] diff --git a/Cargo.toml b/Cargo.toml index d723c441a9d6..2e207de5095e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -408,7 +408,7 @@ openai_api_rust = { git = "https://github.com/datafuse-extras/openai-api", rev = orc-rust = { git = "https://github.com/datafuse-extras/datafusion-orc", rev = "03372b97" } recursive = { git = "https://github.com/datafuse-extras/recursive.git", rev = "6af35a1" } sled = { git = "https://github.com/datafuse-extras/sled", tag = "v0.34.7-datafuse.1" } -tantivy = { git = "https://github.com/datafuse-extras/tantivy", rev = "37aeac0" } -tantivy-common = { git = "https://github.com/datafuse-extras/tantivy", rev = "37aeac0", package = "tantivy-common" } -tantivy-jieba = { git = "https://github.com/datafuse-extras/tantivy-jieba", rev = "124a8fc" } +tantivy = { git = "https://github.com/datafuse-extras/tantivy", rev = "7502370" } +tantivy-common = { git = "https://github.com/datafuse-extras/tantivy", rev = "7502370", package = "tantivy-common" } +tantivy-jieba = { git = 
"https://github.com/datafuse-extras/tantivy-jieba", rev = "0e300e9" } xorfilter-rs = { git = "https://github.com/datafuse-extras/xorfilter", tag = "databend-alpha.4" } diff --git a/src/common/arrow/src/arrow/array/list/mutable.rs b/src/common/arrow/src/arrow/array/list/mutable.rs index 1a63727a45ad..4834b84c4bff 100644 --- a/src/common/arrow/src/arrow/array/list/mutable.rs +++ b/src/common/arrow/src/arrow/array/list/mutable.rs @@ -182,7 +182,9 @@ impl MutableListArray { pub fn try_push_valid(&mut self) -> Result<()> { let total_length = self.values.len(); let offset = self.offsets.last().to_usize(); - let length = total_length.checked_sub(offset).ok_or(Error::Overflow)?; + let length = total_length + .checked_sub(offset) + .ok_or_else(|| Error::Overflow)?; self.offsets.try_push_usize(length)?; if let Some(validity) = &mut self.validity { diff --git a/src/common/arrow/src/arrow/offset.rs b/src/common/arrow/src/arrow/offset.rs index 7d56d883ed8c..dd24cd0bce21 100644 --- a/src/common/arrow/src/arrow/offset.rs +++ b/src/common/arrow/src/arrow/offset.rs @@ -127,7 +127,9 @@ impl Offsets { pub fn try_push(&mut self, length: O) -> Result<(), Error> { let old_length = self.last(); assert!(length >= O::zero()); - let new_length = old_length.checked_add(&length).ok_or(Error::Overflow)?; + let new_length = old_length + .checked_add(&length) + .ok_or_else(|| Error::Overflow)?; self.0.push(new_length); Ok(()) } @@ -140,10 +142,12 @@ impl Offsets { /// * checks that this length does not overflow #[inline] pub fn try_push_usize(&mut self, length: usize) -> Result<(), Error> { - let length = O::from_usize(length).ok_or(Error::Overflow)?; + let length = O::from_usize(length).ok_or_else(|| Error::Overflow)?; let old_length = self.last(); - let new_length = old_length.checked_add(&length).ok_or(Error::Overflow)?; + let new_length = old_length + .checked_add(&length) + .ok_or_else(|| Error::Overflow)?; self.0.push(new_length); Ok(()) } @@ -267,8 +271,8 @@ impl Offsets { let last_offset = original_offset .checked_add(total_length) - .ok_or(Error::Overflow)?; - O::from_usize(last_offset).ok_or(Error::Overflow)?; + .ok_or_else(|| Error::Overflow)?; + O::from_usize(last_offset).ok_or_else(|| Error::Overflow)?; Ok(()) } @@ -279,7 +283,9 @@ impl Offsets { let mut length = *self.last(); let other_length = *other.last(); // check if the operation would overflow - length.checked_add(&other_length).ok_or(Error::Overflow)?; + length + .checked_add(&other_length) + .ok_or_else(|| Error::Overflow)?; let lengths = other.as_slice().windows(2).map(|w| w[1] - w[0]); let offsets = lengths.map(|new_length| { @@ -306,7 +312,9 @@ impl Offsets { let other_length = other.last().expect("Length to be non-zero"); let mut length = *self.last(); // check if the operation would overflow - length.checked_add(other_length).ok_or(Error::Overflow)?; + length + .checked_add(other_length) + .ok_or_else(|| Error::Overflow)?; let lengths = other.windows(2).map(|w| w[1] - w[0]); let offsets = lengths.map(|new_length| { diff --git a/src/query/ee/tests/it/inverted_index/index_refresh.rs b/src/query/ee/tests/it/inverted_index/index_refresh.rs index 611e1ff11181..9d0d45879293 100644 --- a/src/query/ee/tests/it/inverted_index/index_refresh.rs +++ b/src/query/ee/tests/it/inverted_index/index_refresh.rs @@ -143,7 +143,6 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> { &index_version, ); - let field_nums = query_fields.len(); let has_score = true; let need_position = false; let mut field_ids = HashSet::new(); @@ -177,7 +176,6 @@ async fn 
test_fuse_do_refresh_inverted_index() -> Result<()> { let matched_rows = index_reader .clone() .do_filter( - field_nums, need_position, has_score, query.box_clone(), @@ -185,7 +183,7 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> { &index_record, &fuzziness, tokenizer_manager, - block_meta.row_count as u32, + block_meta.row_count, &index_loc, ) .await?; diff --git a/src/query/service/src/pipelines/processors/transforms/transform_add_internal_columns.rs b/src/query/service/src/pipelines/processors/transforms/transform_add_internal_columns.rs index f62db09eee1e..7b35277c25e2 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_add_internal_columns.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_add_internal_columns.rs @@ -40,8 +40,8 @@ impl Transform for TransformAddInternalColumns { fn transform(&mut self, mut block: DataBlock) -> Result { if let Some(meta) = block.take_meta() { - let internal_column_meta = - InternalColumnMeta::downcast_from(meta).ok_or(ErrorCode::Internal("It's a bug"))?; + let internal_column_meta = InternalColumnMeta::downcast_from(meta) + .ok_or_else(|| ErrorCode::Internal("It's a bug"))?; let num_rows = block.num_rows(); for internal_column in self.internal_columns.values() { let column = diff --git a/src/query/service/src/servers/flight_sql/flight_sql_service/session.rs b/src/query/service/src/servers/flight_sql/flight_sql_service/session.rs index 82214bc67e4f..1cc7a6de4731 100644 --- a/src/query/service/src/servers/flight_sql/flight_sql_service/session.rs +++ b/src/query/service/src/servers/flight_sql/flight_sql_service/session.rs @@ -64,7 +64,7 @@ impl FlightSqlServiceImpl { pub(super) fn get_user_password(metadata: &MetadataMap) -> Result<(String, String), String> { let basic = "Basic "; let authorization = Self::get_header_value(metadata, "authorization") - .ok_or("authorization not parsable".to_string())?; + .ok_or_else(|| "authorization not parsable".to_string())?; if !authorization.starts_with(basic) { return Err(format!("Auth type not implemented: {authorization}")); diff --git a/src/query/service/src/servers/http/v1/session/refresh_handler.rs b/src/query/service/src/servers/http/v1/session/refresh_handler.rs index 84b0bac34462..830330b7c140 100644 --- a/src/query/service/src/servers/http/v1/session/refresh_handler.rs +++ b/src/query/service/src/servers/http/v1/session/refresh_handler.rs @@ -48,11 +48,11 @@ pub async fn refresh_handler( let mgr = ClientSessionManager::instance(); match &ctx.credential { Credential::Jwt { .. 
} => { - let session_id = - req.session_id - .ok_or(HttpErrorCode::bad_request(ErrorCode::BadArguments( - "JWT session should provide session_id when refresh session", - )))?; + let session_id = req.session_id.ok_or_else(|| { + HttpErrorCode::bad_request(ErrorCode::BadArguments( + "JWT session should provide session_id when refresh session", + )) + })?; mgr.refresh_in_memory_states(&session_id); let tenant = ctx.session.get_current_tenant(); diff --git a/src/query/sql/src/planner/planner_cache.rs b/src/query/sql/src/planner/planner_cache.rs index 8da310f3bb55..2a8f0e51b8aa 100644 --- a/src/query/sql/src/planner/planner_cache.rs +++ b/src/query/sql/src/planner/planner_cache.rs @@ -165,7 +165,7 @@ impl TableRefVisitor { let func_name = func.name.name.to_lowercase(); // If the function is not suitable for caching, we should not cache the plan - if !is_cacheable_function(&func_name) || func_name == "score" { + if !is_cacheable_function(&func_name) { self.cache_miss = true; } } diff --git a/src/query/storages/common/index/Cargo.toml b/src/query/storages/common/index/Cargo.toml index 1d1372be7302..2ee4d664f7df 100644 --- a/src/query/storages/common/index/Cargo.toml +++ b/src/query/storages/common/index/Cargo.toml @@ -16,7 +16,6 @@ ignored = ["xorfilter-rs", "match-template"] [dependencies] anyerror = { workspace = true } cbordata = { version = "0.6.0" } -crc32fast = "1.3.2" databend-common-arrow = { workspace = true } databend-common-ast = { workspace = true } databend-common-exception = { workspace = true } @@ -29,8 +28,8 @@ levenshtein_automata = "0.2.1" log = { workspace = true } match-template = { workspace = true } parquet = { workspace = true } +roaring = "0.10.1" serde = { workspace = true } -serde_json = { workspace = true } tantivy = { workspace = true } tantivy-common = { workspace = true } tantivy-fst = "0.5" diff --git a/src/query/storages/common/index/src/inverted_index.rs b/src/query/storages/common/index/src/inverted_index.rs index 6d8db4c5b424..09580866f2de 100644 --- a/src/query/storages/common/index/src/inverted_index.rs +++ b/src/query/storages/common/index/src/inverted_index.rs @@ -36,19 +36,20 @@ use std::collections::BTreeMap; use std::collections::HashMap; use std::collections::HashSet; -use std::collections::VecDeque; use std::io; use std::io::BufWriter; use std::io::Cursor; use std::io::Read; use std::io::Write; use std::marker::PhantomData; +use std::ops::BitAndAssign; +use std::ops::BitOrAssign; +use std::ops::SubAssign; use std::path::Path; use std::path::PathBuf; use std::result; use std::sync::Arc; -use crc32fast::Hasher; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::Scalar; @@ -61,6 +62,7 @@ use levenshtein_automata::Distance; use levenshtein_automata::LevenshteinAutomatonBuilder; use levenshtein_automata::DFA; use log::warn; +use roaring::RoaringTreemap; use tantivy::directory::error::DeleteError; use tantivy::directory::error::OpenReadError; use tantivy::directory::error::OpenWriteError; @@ -73,14 +75,21 @@ use tantivy::directory::WatchCallback; use tantivy::directory::WatchHandle; use tantivy::directory::WritePtr; use tantivy::positions::PositionReader; +use tantivy::postings::BlockSegmentPostings; use tantivy::postings::TermInfo; +use tantivy::query::AllQuery; use tantivy::query::BooleanQuery; +use tantivy::query::BoostQuery; +use tantivy::query::ConstScoreQuery; +use tantivy::query::EmptyQuery; use tantivy::query::FuzzyTermQuery; use tantivy::query::Occur; +use 
tantivy::query::PhrasePrefixQuery; use tantivy::query::PhraseQuery; use tantivy::query::Query; use tantivy::query::QueryClone; use tantivy::query::TermQuery; +use tantivy::schema::Field; use tantivy::Directory; use tantivy::Term; use tantivy_common::BinarySerializable; @@ -88,60 +97,9 @@ use tantivy_common::HasLen; use tantivy_common::VInt; use tantivy_fst::Automaton; use tantivy_fst::IntoStreamer; +use tantivy_fst::Regex; use tantivy_fst::Streamer; -// tantivy version is used to generate the footer data - -// Index major version. -const INDEX_MAJOR_VERSION: u32 = 0; -// Index minor version. -const INDEX_MINOR_VERSION: u32 = 22; -// Index patch version. -const INDEX_PATCH_VERSION: u32 = 0; -// Index format version. -const INDEX_FORMAT_VERSION: u32 = 6; - -// The magic byte of the footer to identify corruption -// or an old version of the footer. -const FOOTER_MAGIC_NUMBER: u32 = 1337; - -type CrcHashU32 = u32; - -/// Structure version for the index. -#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -pub struct Version { - major: u32, - minor: u32, - patch: u32, - index_format_version: u32, -} - -/// A Footer is appended every part of data, like tantivy file. -#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)] -struct Footer { - version: Version, - crc: CrcHashU32, -} - -impl Footer { - fn new(crc: CrcHashU32) -> Self { - let version = Version { - major: INDEX_MAJOR_VERSION, - minor: INDEX_MINOR_VERSION, - patch: INDEX_PATCH_VERSION, - index_format_version: INDEX_FORMAT_VERSION, - }; - Footer { version, crc } - } - - fn append_footer(&self, write: &mut W) -> Result<()> { - let footer_payload_len = write.write(serde_json::to_string(&self)?.as_ref())?; - BinarySerializable::serialize(&(footer_payload_len as u32), write)?; - BinarySerializable::serialize(&FOOTER_MAGIC_NUMBER, write)?; - Ok(()) - } -} - fn extract_footer(data: FileSlice) -> Result<(Vec, Vec)> { // The following code is copied from tantivy `CompositeFile::open` function. // extract field number and offsets of each fields. @@ -240,59 +198,12 @@ pub fn extract_component_fields( Ok(()) } -// Build footer for tantivy files. -// Footer is used to check whether the data is valid when open a file. -pub fn build_tantivy_footer(bytes: &[u8]) -> Result> { - let mut hasher = Hasher::new(); - hasher.update(bytes); - let crc = hasher.finalize(); - - let footer = Footer::new(crc); - let mut buf = Vec::new(); - footer.append_footer(&mut buf)?; - Ok(buf) -} - -#[derive( - Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, serde::Serialize, serde::Deserialize, -)] -struct Field(u32); - -impl Field { - /// Create a new field object for the given FieldId. - const fn from_field_id(field_id: u32) -> Field { - Field(field_id) - } - - /// Returns a u32 identifying uniquely a field within a schema. 
- #[allow(dead_code)] - const fn field_id(self) -> u32 { - self.0 - } -} - -impl BinarySerializable for Field { - fn serialize(&self, writer: &mut W) -> io::Result<()> { - self.0.serialize(writer) - } - - fn deserialize(reader: &mut R) -> io::Result { - u32::deserialize(reader).map(Field) - } -} - #[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)] struct FileAddr { field: Field, idx: usize, } -impl FileAddr { - fn new(field: Field, idx: usize) -> FileAddr { - FileAddr { field, idx } - } -} - impl BinarySerializable for FileAddr { fn serialize(&self, writer: &mut W) -> io::Result<()> { self.field.serialize(writer)?; @@ -307,36 +218,6 @@ impl BinarySerializable for FileAddr { } } -// Build empty position data to be used when there are no phrase terms in the query. -// This can reduce data reading and speed up the query -fn build_empty_position_data(field_nums: usize) -> Result { - let offsets: Vec<_> = (0..field_nums) - .map(|i| { - let field = Field::from_field_id(i as u32); - let file_addr = FileAddr::new(field, 0); - (file_addr, 0) - }) - .collect(); - - let mut buf = Vec::new(); - VInt(offsets.len() as u64).serialize(&mut buf)?; - - let mut prev_offset = 0; - for (file_addr, offset) in offsets { - VInt(offset - prev_offset).serialize(&mut buf)?; - file_addr.serialize(&mut buf)?; - prev_offset = offset; - } - - let footer_len = buf.len() as u32; - footer_len.serialize(&mut buf)?; - - let mut footer = build_tantivy_footer(&buf)?; - buf.append(&mut footer); - - Ok(OwnedBytes::new(buf)) -} - struct DfaWrapper(pub DFA); impl Automaton for DfaWrapper { @@ -362,354 +243,704 @@ impl Automaton for DfaWrapper { } } -// Term value contains values associated with a Term -// used to match query and collect matched doc ids. +// Collect matched `doc_ids` for a Query. #[derive(Clone)] -pub struct TermValue { - // term info - pub term_info: TermInfo, - // term matched doc ids - pub doc_ids: Vec, - // term frequencies for each doc - pub term_freqs: Vec, - // position reader is used to read positions in doc for phrase query - pub position_reader: Option, +pub struct DocIdsCollector { + row_count: u64, + need_position: bool, + // key is `term`, value is `term_id`, + // These terms are in `fst`, means that those terms exist in the block, + // we need to use related term infos to determine the matched `doc_ids`. + // Use `term_id` as key to get related information and avoid copy `term`. + term_map: HashMap, + // key is `term_id`, value is `field_id` and `term_info`. + term_infos: HashMap, + // key is `term_id`, value is related `BlockSegmentPostings`, + // used to read `doc_ids` and `term_freqs`. + block_postings_map: HashMap, + // key is `term_id`, value is related `PositionReader`, + // used to read `positions` in each docs. + position_reader_map: HashMap, + // key is `term_id`, value is related `doc_ids`. + // `doc_ids` is lazy loaded when used. + doc_ids: HashMap, + // key is `term_id`, value is related `term_freqs`. + // `term_freqs` is lazy loaded when used. + term_freqs: HashMap>, } -// Check if fst contains terms in query. -// If not, we can skip read other parts of inverted index. 
-pub fn check_term_fsts_match( - query: Box, - fst_maps: &HashMap>, - fuzziness: &Option, - matched_terms: &mut HashMap, - fuzziness_terms: &mut HashMap>, -) -> bool { - if let Some(term_query) = query.downcast_ref::() { - let term = term_query.term(); - let field = term.field(); - let field_id = field.field_id() as usize; - if let Some(fst_map) = fst_maps.get(&field_id) { - if let Some(idx) = fst_map.get(term.serialized_value_bytes()) { - matched_terms.insert(term.clone(), idx); - return true; - } +impl DocIdsCollector { + pub fn create( + row_count: u64, + need_position: bool, + terms: HashMap, + term_infos: HashMap, + block_postings_map: HashMap, + position_reader_map: HashMap, + ) -> Self { + let term_len = terms.len(); + let term_map = terms + .into_iter() + .map(|(term, (_, term_id))| (term, term_id)) + .collect::>(); + + Self { + row_count, + need_position, + term_map, + term_infos, + block_postings_map, + position_reader_map, + doc_ids: HashMap::with_capacity(term_len), + term_freqs: HashMap::with_capacity(term_len), } - false - } else if let Some(bool_query) = query.downcast_ref::() { - let mut matched_num = 0; - for (occur, sub_query) in bool_query.clauses() { - let matched = check_term_fsts_match( - sub_query.box_clone(), - fst_maps, - fuzziness, - matched_terms, - fuzziness_terms, - ); - if matched { - matched_num += 1; - } - match occur { - Occur::Should => {} - Occur::Must => { - if !matched { - return false; + } + + fn check_term_match( + fst_map: &tantivy_fst::Map, + field_id: u32, + term: &Term, + matched_terms: &mut HashMap, + ) -> bool { + if let Some(term_id) = fst_map.get(term.serialized_value_bytes()) { + matched_terms.insert(term.clone(), (field_id, term_id)); + true + } else { + false + } + } + + fn get_fst_map( + field_id: u32, + fst_maps: &HashMap>, + ) -> Result<&tantivy_fst::Map> { + let fst_map = fst_maps.get(&field_id).ok_or_else(|| { + ErrorCode::TantivyError(format!( + "inverted index fst field `{}` does not exist", + field_id + )) + })?; + + Ok(fst_map) + } + + // Check if fst contains terms in query. + // If not, we can skip read other parts of inverted index. + pub fn check_term_fsts_match( + query: Box, + fst_maps: &HashMap>, + fuzziness: &Option, + matched_terms: &mut HashMap, + prefix_terms: &mut HashMap>, + fuzziness_terms: &mut HashMap>, + ) -> Result { + if let Some(term_query) = query.downcast_ref::() { + let term = term_query.term(); + let field = term.field(); + let field_id = field.field_id(); + let fst_map = Self::get_fst_map(field_id, fst_maps)?; + let matched = Self::check_term_match(fst_map, field_id, term, matched_terms); + Ok(matched) + } else if let Some(bool_query) = query.downcast_ref::() { + let mut matched_any = false; + for (occur, sub_query) in bool_query.clauses() { + let matched = Self::check_term_fsts_match( + sub_query.box_clone(), + fst_maps, + fuzziness, + matched_terms, + prefix_terms, + fuzziness_terms, + )?; + match occur { + Occur::Should => { + if matched { + matched_any = true; + } + } + Occur::Must => { + if matched { + matched_any = true; + } else { + return Ok(false); + } + } + Occur::MustNot => { + // Matched means that the block contains the term, + // but we still need to filter out the `doc_ids`. + matched_any = true; } } - Occur::MustNot => {} } - } - matched_num > 0 - } else if let Some(phrase_query) = query.downcast_ref::() { - // PhraseQuery must match all terms. 
- let field = phrase_query.field(); - let field_id = field.field_id() as usize; - if let Some(fst_map) = fst_maps.get(&field_id) { + Ok(matched_any) + } else if let Some(phrase_query) = query.downcast_ref::() { + // PhraseQuery must match all terms. + let field = phrase_query.field(); + let field_id = field.field_id(); + let fst_map = Self::get_fst_map(field_id, fst_maps)?; + let mut matched_all = true; for term in phrase_query.phrase_terms() { - let matched = if let Some(idx) = fst_map.get(term.serialized_value_bytes()) { - matched_terms.insert(term.clone(), idx); - true - } else { - false - }; + let matched = Self::check_term_match(fst_map, field_id, &term, matched_terms); if !matched { matched_all = false; break; } } - matched_all - } else { - false - } - } else if let Some(fuzzy_term_query) = query.downcast_ref::() { - // FuzzyTermQuery match terms by levenshtein distance. - let fuzziness = fuzziness.unwrap(); - - let term = fuzzy_term_query.term(); - let field = term.field(); - let field_id = field.field_id() as usize; - if let Some(fst_map) = fst_maps.get(&field_id) { - // build levenshtein automaton + Ok(matched_all) + } else if let Some(phrase_prefix_query) = query.downcast_ref::() { + // PhrasePrefixQuery must match all terms. + let field = phrase_prefix_query.field(); + let field_id = field.field_id(); + let fst_map = Self::get_fst_map(field_id, fst_maps)?; + + let mut matched_all = true; + for term in phrase_prefix_query.phrase_terms() { + let matched = Self::check_term_match(fst_map, field_id, &term, matched_terms); + if !matched { + matched_all = false; + break; + } + } + if !matched_all { + return Ok(false); + } + + // using regex to check prefix term, get related term ids. + let (_, prefix_term) = phrase_prefix_query.prefix_term_with_offset(); + let term_str = String::from_utf8_lossy(prefix_term.serialized_value_bytes()); + let key = format!("{}.*", term_str); + let re = Regex::new(&key).map_err(|_| { + ErrorCode::TantivyError(format!("inverted index create regex `{}` failed", key)) + })?; + + let mut prefix_term_ids = vec![]; + let mut stream = fst_map.search(&re).into_stream(); + while let Some((key, term_id)) = stream.next() { + let key_str = unsafe { std::str::from_utf8_unchecked(key) }; + let prefix_term = Term::from_field_text(field, key_str); + matched_terms.insert(prefix_term.clone(), (field_id, term_id)); + prefix_term_ids.push(term_id); + } + let matched = !prefix_term_ids.is_empty(); + if matched { + prefix_terms.insert(prefix_term.clone(), prefix_term_ids); + } + Ok(matched) + } else if let Some(fuzzy_term_query) = query.downcast_ref::() { + // FuzzyTermQuery match terms by levenshtein distance. + let fuzziness = fuzziness.unwrap(); + let term = fuzzy_term_query.term(); + let field = term.field(); + let field_id = field.field_id(); + let fst_map = Self::get_fst_map(field_id, fst_maps)?; + + // build levenshtein automaton to check fuzziness term, get related term ids. 
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(fuzziness, true); let term_str = String::from_utf8_lossy(term.serialized_value_bytes()); let automaton = DfaWrapper(lev_automaton_builder.build_dfa(&term_str)); - let mut fuzz_term_values = vec![]; + let mut fuzz_term_ids = vec![]; let mut stream = fst_map.search(automaton).into_stream(); - while let Some((key, idx)) = stream.next() { + while let Some((key, term_id)) = stream.next() { let key_str = unsafe { std::str::from_utf8_unchecked(key) }; let fuzz_term = Term::from_field_text(field, key_str); - matched_terms.insert(fuzz_term.clone(), idx); - fuzz_term_values.push(fuzz_term); + matched_terms.insert(fuzz_term.clone(), (field_id, term_id)); + fuzz_term_ids.push(term_id); + } + let matched = !fuzz_term_ids.is_empty(); + if matched { + fuzziness_terms.insert(term.clone(), fuzz_term_ids); } - let matched = !fuzz_term_values.is_empty(); - fuzziness_terms.insert(term.clone(), fuzz_term_values); - matched + Ok(matched) + } else if let Some(boost_query) = query.downcast_ref::() { + Self::check_term_fsts_match( + boost_query.query(), + fst_maps, + fuzziness, + matched_terms, + prefix_terms, + fuzziness_terms, + ) + } else if let Some(const_query) = query.downcast_ref::() { + Self::check_term_fsts_match( + const_query.query(), + fst_maps, + fuzziness, + matched_terms, + prefix_terms, + fuzziness_terms, + ) + } else if let Some(_empty_query) = query.downcast_ref::() { + Ok(false) + } else if let Some(_all_query) = query.downcast_ref::() { + Ok(true) } else { - false + Err(ErrorCode::TantivyError(format!( + "inverted index unsupported query `{:?}`", + query + ))) } - } else { - // TODO: handle other Query types - let mut matched = false; - query.query_terms(&mut |term, _| { - let field = term.field(); - let field_id = field.field_id() as usize; - if let Some(fst_map) = fst_maps.get(&field_id) { - if let Some(idx) = fst_map.get(term.serialized_value_bytes()) { - matched_terms.insert(term.clone(), idx); - matched = true; - } - } - }); - - matched } -} -// collect matched rows by term value -pub fn collect_matched_rows( - query: Box, - row_count: u32, - fuzziness_terms: &HashMap>, - term_values: &mut HashMap, -) -> Vec { - if let Some(term_query) = query.downcast_ref::() { - let term = term_query.term(); - if let Some(term_value) = term_values.get(term) { - term_value.doc_ids.clone() - } else { - vec![] - } - } else if let Some(bool_query) = query.downcast_ref::() { - let mut should_doc_ids_opt = None; - let mut must_doc_ids_opt = None; - let mut must_not_doc_ids_opt = None; - for (occur, sub_query) in bool_query.clauses() { - let doc_ids = collect_matched_rows( - sub_query.box_clone(), - row_count, - fuzziness_terms, - term_values, - ); - let doc_id_set = HashSet::from_iter(doc_ids.into_iter()); - match occur { - Occur::Should => { - if should_doc_ids_opt.is_none() { - should_doc_ids_opt = Some(doc_id_set); - } else { - let should_doc_ids = should_doc_ids_opt.unwrap(); - should_doc_ids_opt = - Some(should_doc_ids.union(&doc_id_set).copied().collect()) - } - } - Occur::Must => { - if must_doc_ids_opt.is_none() { - must_doc_ids_opt = Some(doc_id_set); - } else { - let must_doc_ids = must_doc_ids_opt.unwrap(); - must_doc_ids_opt = - Some(must_doc_ids.intersection(&doc_id_set).copied().collect()) - } + // get `doc_ids` of a `term_id`, + fn get_doc_ids(&mut self, term_id: u64) -> Result<&RoaringTreemap> { + if let std::collections::hash_map::Entry::Vacant(doc_ids_entry) = + self.doc_ids.entry(term_id) + { + // `doc_ids` are lazy loaded when 
used. + let block_postings = self.block_postings_map.get_mut(&term_id).ok_or_else(|| { + ErrorCode::TantivyError(format!( + "inverted index block postings `{}` does not exist", + term_id + )) + })?; + + let term_freqs_len = if self.need_position { + let (_, term_info) = self.term_infos.get(&term_id).ok_or_else(|| { + ErrorCode::TantivyError(format!( + "inverted index term info `{}` does not exist", + term_id + )) + })?; + term_info.doc_freq as usize + } else { + 0 + }; + let mut doc_ids = RoaringTreemap::new(); + let mut term_freqs = Vec::with_capacity(term_freqs_len); + // `doc_ids` are stored in multiple blocks and need to be decode sequentially. + // TODO: We can skip some blocks by checking related `doc_ids`. + loop { + let block_doc_ids = block_postings.docs(); + if block_doc_ids.is_empty() { + break; } - Occur::MustNot => { - if must_not_doc_ids_opt.is_none() { - must_not_doc_ids_opt = Some(doc_id_set); - } else { - let must_not_doc_ids = must_not_doc_ids_opt.unwrap(); - must_not_doc_ids_opt = - Some(must_not_doc_ids.union(&doc_id_set).copied().collect()) - } + doc_ids + .append(block_doc_ids.iter().map(|id| *id as u64)) + .unwrap(); + + // `term_freqs` is only used if the query need position. + if self.need_position { + let block_term_freqs = block_postings.freqs(); + term_freqs.extend_from_slice(block_term_freqs); } + block_postings.advance(); } + doc_ids_entry.insert(doc_ids); + self.term_freqs.insert(term_id, term_freqs); } - let doc_ids = if let Some(mut should_doc_ids) = should_doc_ids_opt { - if let Some(must_doc_ids) = must_doc_ids_opt { - should_doc_ids = should_doc_ids - .intersection(&must_doc_ids) - .copied() - .collect() + let doc_ids = self.doc_ids.get(&term_id).unwrap(); + Ok(doc_ids) + } + + // get the position `offsets` and `term_freqs` of a `term_id` in each `docs`, + // which is used to read `positions`. + fn get_position_offsets( + &mut self, + term_id: u64, + all_doc_ids: &RoaringTreemap, + ) -> Result> { + let doc_ids = self.doc_ids.get(&term_id).unwrap(); + let term_freqs = self.term_freqs.get(&term_id).unwrap(); + + let mut doc_offset = 0; + let mut offset_and_term_freqs = HashMap::with_capacity(all_doc_ids.len() as usize); + for (doc_id, term_freq) in doc_ids.iter().zip(term_freqs.iter()) { + if all_doc_ids.len() as usize == offset_and_term_freqs.len() { + break; } - if let Some(must_not_doc_ids) = must_not_doc_ids_opt { - should_doc_ids = should_doc_ids - .difference(&must_not_doc_ids) - .copied() - .collect() + if !all_doc_ids.contains(doc_id) { + doc_offset += *term_freq as u64; + continue; } - should_doc_ids - } else if let Some(mut must_doc_ids) = must_doc_ids_opt { - if let Some(must_not_doc_ids) = must_not_doc_ids_opt { - must_doc_ids = must_doc_ids - .difference(&must_not_doc_ids) - .copied() - .collect() + + offset_and_term_freqs.insert(doc_id, (doc_offset, *term_freq)); + doc_offset += *term_freq as u64; + } + + Ok(offset_and_term_freqs) + } + + // get `positions` of a `term_id` in a `doc`. 
+    fn get_positions(
+        &mut self,
+        term_id: u64,
+        doc_offset: u64,
+        term_freq: u32,
+    ) -> Result<RoaringTreemap> {
+        let position_reader = self.position_reader_map.get_mut(&term_id).ok_or_else(|| {
+            ErrorCode::TantivyError(format!(
+                "inverted index position reader `{}` does not exist",
+                term_id
+            ))
+        })?;
+
+        let mut positions = vec![0; term_freq as usize];
+        position_reader.read(doc_offset, &mut positions[..]);
+        for i in 1..positions.len() {
+            positions[i] += positions[i - 1];
+        }
+        let term_poses =
+            RoaringTreemap::from_sorted_iter(positions.into_iter().map(|i| i as u64)).unwrap();
+        Ok(term_poses)
+    }
+
+    // The phrase query matches `doc_ids` as follows:
+    //
+    // 1. Collect the position for each term in the query.
+    // 2. Collect the `doc_ids` of each term and take
+    //    the intersection to get the candidate `doc_ids`.
+    // 3. Iterate over the candidate `doc_ids` to check whether
+    //    the positions of the terms match the positions of the terms in the query.
+    // 4. Each position in the first term is a possible query phrase beginning.
+    //    Verify that the beginning is valid by checking whether corresponding
+    //    positions in the other terms exist. If not, delete the possible position
+    //    in the first term. After traversing all terms, determine if there are
+    //    any positions left in the first term. If there are, then the `doc_id`
+    //    is matched.
+    //
+    // If the query is a prefix phrase query, also check if any prefix terms
+    // match the positions.
+    pub fn collect_phrase_matched_rows(
+        &mut self,
+        phrase_terms: Vec<(usize, Term)>,
+        prefix_term: Option<(usize, &Vec<u64>)>,
+    ) -> Result<Option<RoaringTreemap>> {
+        let mut query_term_poses = Vec::with_capacity(phrase_terms.len());
+        for (term_pos, term) in &phrase_terms {
+            // A term that does not exist means this phrase is not matched.
+            let Some(term_id) = self.term_map.get(term) else {
+                return Ok(None);
+            };
+            query_term_poses.push((*term_pos, *term_id));
+        }
+        if query_term_poses.is_empty() {
+            return Ok(None);
+        }
+
+        let first_term_pos = &query_term_poses[0].0;
+        let first_term_id = &query_term_poses[0].1;
+
+        let mut term_ids = HashSet::with_capacity(phrase_terms.len() + 1);
+        term_ids.insert(*first_term_id);
+
+        let first_doc_ids = self.get_doc_ids(*first_term_id)?;
+        let mut candidate_doc_ids = RoaringTreemap::new();
+        candidate_doc_ids.bitor_assign(first_doc_ids);
+
+        // Collect the `doc_ids` of the other terms in the query and take the
+        // intersection to obtain the candidate `doc_ids` containing all terms.
+ let mut query_term_offsets = Vec::with_capacity(query_term_poses.len() - 1); + for (term_pos, term_id) in query_term_poses.iter().skip(1) { + if !term_ids.contains(term_id) { + let doc_ids = self.get_doc_ids(*term_id)?; + + candidate_doc_ids.bitand_assign(doc_ids); + if candidate_doc_ids.is_empty() { + break; + } + term_ids.insert(*term_id); } - must_doc_ids - } else if let Some(must_not_doc_ids) = must_not_doc_ids_opt { - let all_doc_ids = HashSet::from_iter(0..row_count); - let doc_ids = all_doc_ids.difference(&must_not_doc_ids).copied().collect(); - doc_ids - } else { - HashSet::new() - }; - - let mut doc_ids = Vec::from_iter(doc_ids); - doc_ids.sort(); - doc_ids - } else if let Some(phrase_query) = query.downcast_ref::() { - let mut union_doc_ids = HashSet::new(); - let mut intersection_doc_ids_opt = None; - - for term in phrase_query.phrase_terms() { - if let Some(term_value) = term_values.get(&term) { - let doc_id_set = HashSet::from_iter(term_value.doc_ids.clone()); - union_doc_ids = union_doc_ids.union(&doc_id_set).copied().collect(); - if intersection_doc_ids_opt.is_none() { - intersection_doc_ids_opt = Some(doc_id_set); - } else { - let intersection_doc_ids = intersection_doc_ids_opt.unwrap(); - intersection_doc_ids_opt = Some( - intersection_doc_ids - .intersection(&doc_id_set) - .copied() - .collect(), - ); + let term_pos_offset = (term_pos - first_term_pos) as u64; + query_term_offsets.push((*term_id, term_pos_offset)); + } + // If the candidate `doc_ids` is empty, all docs are not matched. + if candidate_doc_ids.is_empty() { + return Ok(None); + } + + // If the query is a prefix phrase query, + // also need to collect `doc_ids` of prefix terms. + if let Some((_, prefix_term_ids)) = prefix_term { + let mut all_prefix_doc_ids = RoaringTreemap::new(); + for prefix_term_id in prefix_term_ids { + let prefix_doc_ids = self.get_doc_ids(*prefix_term_id)?; + // If the `doc_ids` does not intersect at all, this prefix can be ignored. + if candidate_doc_ids.is_disjoint(prefix_doc_ids) { + continue; } + + all_prefix_doc_ids.bitor_assign(prefix_doc_ids); + term_ids.insert(*prefix_term_id); } + // If there is no matched prefix `doc_ids`, the prefix term does not matched. + if all_prefix_doc_ids.is_empty() { + return Ok(None); + } + + // Get the intersection of phrase `doc_ids` and prefix `doc_ids` + candidate_doc_ids.bitand_assign(all_prefix_doc_ids); } - let intersection_doc_ids = intersection_doc_ids_opt.unwrap_or_default(); - if intersection_doc_ids.is_empty() { - return vec![]; + // Collect the position `offset` and `term_freqs` for each terms, + // which can be used to read positions. + let mut offset_and_term_freqs_map = HashMap::new(); + for term_id in term_ids.into_iter() { + let offset_and_term_freqs = self.get_position_offsets(term_id, &candidate_doc_ids)?; + offset_and_term_freqs_map.insert(term_id, offset_and_term_freqs); } - let mut union_doc_ids = Vec::from_iter(union_doc_ids); - union_doc_ids.sort(); - // check each docs - let mut matched_doc_ids = vec![]; - for doc_id in union_doc_ids { - if !intersection_doc_ids.contains(&doc_id) { - continue; + let first_offset_and_term_freqs = offset_and_term_freqs_map.get(first_term_id).unwrap(); + + // Check every candidate `doc_ids` if the position of each terms match the query. 
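+        // For example, for the phrase "quick brown" (the offset of "brown" is 1):
+        // if a doc has "quick" at positions {3, 9} and "brown" at {4, 20}, the
+        // expected "brown" positions are {4, 10}; 10 is missing, so beginning 9
+        // is removed and only beginning 3 survives, hence the doc is matched.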
+ let mut all_doc_ids = RoaringTreemap::new(); + let mut offset_poses = RoaringTreemap::new(); + let mut term_poses_map = HashMap::new(); + for (doc_id, (first_doc_offset, first_term_freq)) in first_offset_and_term_freqs.iter() { + let mut first_term_poses = + self.get_positions(*first_term_id, *first_doc_offset, *first_term_freq)?; + + term_poses_map.clear(); + term_poses_map.insert(first_term_id, first_term_poses.clone()); + + for (term_id, term_pos_offset) in &query_term_offsets { + if !term_poses_map.contains_key(term_id) { + let offset_and_term_freqs = offset_and_term_freqs_map.get(term_id).unwrap(); + let (doc_offset, term_freq) = offset_and_term_freqs.get(doc_id).unwrap(); + + let term_poses = self.get_positions(*term_id, *doc_offset, *term_freq)?; + term_poses_map.insert(term_id, term_poses); + } + let term_poses = term_poses_map.get(term_id).unwrap(); + + // Using the position of the first term and the offset of this term with the first term, + // calculate all possible positions for this term. + offset_poses.clear(); + offset_poses + .append(first_term_poses.iter().map(|pos| pos + term_pos_offset)) + .unwrap(); + + // Term possible positions subtract term actual positions, + // remaining positions are not matched and need to be removed in the first term. + offset_poses.sub_assign(term_poses); + for offset_pos in &offset_poses { + first_term_poses.remove(offset_pos - term_pos_offset); + } + if first_term_poses.is_empty() { + break; + } } - let mut term_pos_map = HashMap::new(); - for term in phrase_query.phrase_terms() { - let mut offset = 0; - let mut term_freq = 0; - if let Some(term_value) = term_values.get_mut(&term) { - for i in 0..term_value.doc_ids.len() { - if term_value.doc_ids[i] < doc_id { - offset += term_value.term_freqs[i] as u64; - } else { - term_freq = term_value.term_freqs[i] as usize; - break; + // If the query is a prefix phrase query, + // also need to check if any prefix term match. + if let Some((prefix_term_pos, prefix_term_ids)) = prefix_term { + if !first_term_poses.is_empty() { + let mut prefix_matched = false; + let prefix_term_pos_offset = (prefix_term_pos - first_term_pos) as u64; + for prefix_term_id in prefix_term_ids { + if !term_poses_map.contains_key(prefix_term_id) { + if let Some(offset_and_term_freqs) = + offset_and_term_freqs_map.get(prefix_term_id) + { + if let Some((doc_offset, term_freq)) = + offset_and_term_freqs.get(doc_id) + { + let term_poses = self.get_positions( + *prefix_term_id, + *doc_offset, + *term_freq, + )?; + term_poses_map.insert(prefix_term_id, term_poses); + } + } } - } - // collect positions in the docs - if let Some(position_reader) = term_value.position_reader.as_mut() { - let mut pos_output = vec![0; term_freq]; - position_reader.read(offset, &mut pos_output[..]); - for i in 1..pos_output.len() { - pos_output[i] += pos_output[i - 1]; + if let Some(term_poses) = term_poses_map.get(prefix_term_id) { + offset_poses.clear(); + offset_poses + .append( + first_term_poses + .iter() + .map(|pos| pos + prefix_term_pos_offset), + ) + .unwrap(); + + offset_poses.bitand_assign(term_poses); + // If any of the possible prefix term positions exist, + // the prefix phrase query is matched. 
+ if !offset_poses.is_empty() { + prefix_matched = true; + break; + } } - let positions = VecDeque::from_iter(pos_output); - term_pos_map.insert(term.clone(), positions); + } + if prefix_matched { + all_doc_ids.insert(*doc_id); } } + } else { + let matched = !first_term_poses.is_empty(); + if matched { + all_doc_ids.insert(*doc_id); + } } + } + + if !all_doc_ids.is_empty() { + Ok(Some(all_doc_ids)) + } else { + Ok(None) + } + } - let mut is_first = true; - let mut distance = 0; - let mut matched = true; - let mut last_position = 0; - for (query_position, term) in phrase_query.phrase_terms_with_offsets() { - if let Some(positions) = term_pos_map.get_mut(&term) { - let mut find_position = false; - while let Some(doc_position) = positions.pop_front() { - // skip previous positions. - if doc_position < last_position { + // collect matched rows by term value + pub fn collect_matched_rows( + &mut self, + query: Box, + prefix_terms: &HashMap>, + fuzziness_terms: &HashMap>, + ) -> Result> { + if let Some(term_query) = query.downcast_ref::() { + let term = term_query.term(); + if let Some(term_id) = self.term_map.get(term) { + let doc_ids = self.get_doc_ids(*term_id)?; + Ok(Some(doc_ids.clone())) + } else { + Ok(None) + } + } else if let Some(bool_query) = query.downcast_ref::() { + let mut should_doc_ids: Option = None; + let mut must_doc_ids: Option = None; + let mut must_not_doc_ids: Option = None; + for (occur, sub_query) in bool_query.clauses() { + let doc_ids = self.collect_matched_rows( + sub_query.box_clone(), + prefix_terms, + fuzziness_terms, + )?; + if doc_ids.is_none() { + match occur { + Occur::Should => { continue; } - last_position = doc_position; - let doc_distance = doc_position - (query_position as u32); - if is_first { - is_first = false; - distance = doc_distance; - } else { - // distance must same as first term. 
- if doc_distance != distance { - matched = false; + Occur::Must => { + if must_doc_ids.is_none() { + must_doc_ids = Some(RoaringTreemap::new()); + } + continue; + } + Occur::MustNot => { + if must_not_doc_ids.is_none() { + must_not_doc_ids = Some(RoaringTreemap::new()); } + continue; } - find_position = true; - break; } - if !find_position { - matched = false; + } + let doc_ids = doc_ids.unwrap(); + match occur { + Occur::Should => { + if let Some(ref mut should_doc_ids) = should_doc_ids { + should_doc_ids.bitor_assign(&doc_ids); + } else { + should_doc_ids = Some(doc_ids); + } + } + Occur::Must => { + if let Some(ref mut must_doc_ids) = must_doc_ids { + must_doc_ids.bitand_assign(&doc_ids); + } else { + must_doc_ids = Some(doc_ids); + } + } + Occur::MustNot => { + if let Some(ref mut must_not_doc_ids) = must_not_doc_ids { + must_not_doc_ids.bitor_assign(&doc_ids); + } else { + must_not_doc_ids = Some(doc_ids); + } } - } else { - matched = false; } - if !matched { - break; + } + + let all_doc_ids = match (should_doc_ids, must_doc_ids, must_not_doc_ids) { + // only should + (Some(should_doc_ids), None, None) => should_doc_ids, + // only must + (None, Some(must_doc_ids), None) => must_doc_ids, + // only must not + (None, None, Some(must_not_doc_ids)) => { + let mut all_doc_ids = RoaringTreemap::from_iter(0..self.row_count); + all_doc_ids.sub_assign(must_not_doc_ids); + all_doc_ids + } + // should and must + (Some(mut should_doc_ids), Some(must_doc_ids), None) => { + should_doc_ids.bitor_assign(must_doc_ids); + should_doc_ids + } + // should and must not + (Some(mut should_doc_ids), None, Some(must_not_doc_ids)) => { + should_doc_ids.sub_assign(must_not_doc_ids); + should_doc_ids + } + // must and must not + (None, Some(mut must_doc_ids), Some(must_not_doc_ids)) => { + must_doc_ids.sub_assign(must_not_doc_ids); + must_doc_ids + } + // should, must and must not + (Some(mut should_doc_ids), Some(must_doc_ids), Some(must_not_doc_ids)) => { + should_doc_ids.bitor_assign(must_doc_ids); + should_doc_ids.sub_assign(must_not_doc_ids); + should_doc_ids + } + (None, None, None) => { + return Ok(None); } + }; + + if !all_doc_ids.is_empty() { + Ok(Some(all_doc_ids)) + } else { + Ok(None) } - if matched { - matched_doc_ids.push(doc_id); + } else if let Some(phrase_query) = query.downcast_ref::() { + let phrase_terms = phrase_query.phrase_terms_with_offsets(); + self.collect_phrase_matched_rows(phrase_terms, None) + } else if let Some(phrase_prefix_query) = query.downcast_ref::() { + let phrase_terms = phrase_prefix_query.phrase_terms_with_offsets(); + let (prefix_term_pos, prefix_term) = phrase_prefix_query.prefix_term_with_offset(); + + let Some(prefix_term_ids) = prefix_terms.get(&prefix_term) else { + return Ok(None); + }; + let prefix_term = Some((prefix_term_pos, prefix_term_ids)); + + self.collect_phrase_matched_rows(phrase_terms, prefix_term) + } else if let Some(fuzzy_term_query) = query.downcast_ref::() { + let mut all_doc_ids = RoaringTreemap::new(); + let term = fuzzy_term_query.term(); + + let Some(fuzz_term_ids) = fuzziness_terms.get(term) else { + return Ok(None); + }; + // collect related terms of the original term. + for term_id in fuzz_term_ids { + let doc_ids = self.get_doc_ids(*term_id)?; + all_doc_ids.bitor_assign(doc_ids); } - } - matched_doc_ids - } else if let Some(fuzzy_term_query) = query.downcast_ref::() { - let mut fuzz_doc_ids = HashSet::new(); - let term = fuzzy_term_query.term(); - - // collect related terms of the original term. 
- if let Some(related_terms) = fuzziness_terms.get(term) { - for term in related_terms { - if let Some(term_value) = term_values.get(term) { - let doc_id_set: HashSet = HashSet::from_iter(term_value.doc_ids.clone()); - fuzz_doc_ids = fuzz_doc_ids.union(&doc_id_set).copied().collect(); - } + if !all_doc_ids.is_empty() { + Ok(Some(all_doc_ids)) + } else { + Ok(None) } - let mut doc_ids = Vec::from_iter(fuzz_doc_ids); - doc_ids.sort(); - doc_ids + } else if let Some(boost_query) = query.downcast_ref::() { + self.collect_matched_rows(boost_query.query(), prefix_terms, fuzziness_terms) + } else if let Some(const_query) = query.downcast_ref::() { + self.collect_matched_rows(const_query.query(), prefix_terms, fuzziness_terms) + } else if let Some(_empty_query) = query.downcast_ref::() { + Ok(None) + } else if let Some(_all_query) = query.downcast_ref::() { + let all_doc_ids = RoaringTreemap::from_iter(0..self.row_count); + Ok(Some(all_doc_ids)) } else { - vec![] + Err(ErrorCode::TantivyError(format!( + "inverted index unsupported query `{:?}`", + query + ))) } - } else { - let mut union_doc_ids = HashSet::new(); - query.query_terms(&mut |term, _| { - if let Some(term_value) = term_values.get(term) { - let doc_id_set: HashSet = HashSet::from_iter(term_value.doc_ids.clone()); - union_doc_ids = union_doc_ids.union(&doc_id_set).copied().collect(); - } - }); - - let mut doc_ids = Vec::from_iter(union_doc_ids); - doc_ids.sort(); - doc_ids } } @@ -798,7 +1029,7 @@ pub struct InvertedIndexDirectory { } impl InvertedIndexDirectory { - pub fn try_create(field_nums: usize, files: Vec>) -> Result { + pub fn try_create(files: Vec>) -> Result { let mut file_map = BTreeMap::::new(); for file in files.into_iter() { @@ -812,12 +1043,7 @@ impl InvertedIndexDirectory { let fast_data = file_map.remove("fast").unwrap(); let store_data = file_map.remove("store").unwrap(); let fieldnorm_data = file_map.remove("fieldnorm").unwrap(); - // If there are no phrase terms in the query, - // we can use empty position data instead. 
- let pos_data = match file_map.remove("pos") { - Some(pos_data) => pos_data, - None => build_empty_position_data(field_nums)?, - }; + let pos_data = file_map.remove("pos").unwrap(); let idx_data = file_map.remove("idx").unwrap(); let term_data = file_map.remove("term").unwrap(); let meta_data = file_map.remove("meta.json").unwrap(); diff --git a/src/query/storages/common/index/src/lib.rs b/src/query/storages/common/index/src/lib.rs index f60a1b16982e..48a30df64cae 100644 --- a/src/query/storages/common/index/src/lib.rs +++ b/src/query/storages/common/index/src/lib.rs @@ -26,15 +26,12 @@ pub use bloom_index::BloomIndex; pub use bloom_index::BloomIndexMeta; pub use bloom_index::FilterEvalResult; pub use index::Index; -pub use inverted_index::build_tantivy_footer; -pub use inverted_index::check_term_fsts_match; -pub use inverted_index::collect_matched_rows; pub use inverted_index::extract_component_fields; pub use inverted_index::extract_fsts; +pub use inverted_index::DocIdsCollector; pub use inverted_index::InvertedIndexDirectory; pub use inverted_index::InvertedIndexFile; pub use inverted_index::InvertedIndexMeta; -pub use inverted_index::TermValue; pub use page_index::PageIndex; pub use range_index::statistics_to_domain; pub use range_index::RangeIndex; diff --git a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs index 411dc316b98b..658a8bc38c95 100644 --- a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs +++ b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs @@ -131,7 +131,6 @@ pub(crate) async fn load_inverted_index_file<'a>( #[fastrace::trace] pub(crate) async fn load_inverted_index_directory<'a>( dal: Operator, - field_nums: usize, index_path: &'a str, ) -> Result { // load inverted index meta, contains the offsets of each files. 
@@ -154,7 +153,7 @@ pub(crate) async fn load_inverted_index_directory<'a>( let files: Vec<_> = try_join_all(futs).await?.into_iter().collect(); // use those files to create inverted index directory - let directory = InvertedIndexDirectory::try_create(field_nums, files)?; + let directory = InvertedIndexDirectory::try_create(files)?; Ok(directory) } diff --git a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs index e3baee6c0c0f..6441a0fccf97 100644 --- a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs +++ b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs @@ -17,12 +17,10 @@ use std::collections::HashSet; use std::sync::Arc; use std::time::Instant; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::F32; use databend_common_metrics::storage::metrics_inc_block_inverted_index_search_milliseconds; -use databend_storages_common_index::check_term_fsts_match; -use databend_storages_common_index::collect_matched_rows; -use databend_storages_common_index::TermValue; use databend_storages_common_table_meta::meta::SingleColumnMeta; use futures_util::future::try_join_all; use opendal::Operator; @@ -40,6 +38,7 @@ use tantivy::tokenizer::TokenizerManager; use tantivy::Index; use tantivy_fst::raw::Fst; +use crate::index::DocIdsCollector; use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_directory; use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_file; use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_meta; @@ -61,15 +60,14 @@ impl InvertedIndexReader { #[allow(clippy::too_many_arguments)] pub async fn do_filter( self, - field_nums: usize, need_position: bool, has_score: bool, query: Box, - field_ids: &HashSet, + field_ids: &HashSet, index_record: &IndexRecordOption, fuzziness: &Option, tokenizer_manager: TokenizerManager, - row_count: u32, + row_count: u64, index_loc: &str, ) -> Result)>>> { let start = Instant::now(); @@ -79,7 +77,6 @@ impl InvertedIndexReader { index_loc, query, field_ids, - field_nums, need_position, has_score, index_record, @@ -101,15 +98,19 @@ impl InvertedIndexReader { &self, index_path: &'a str, name: &str, - field_ids: &HashSet, + field_ids: &HashSet, inverted_index_meta_map: &HashMap, - ) -> Result> { - let mut col_metas = vec![]; - let mut col_field_map = HashMap::new(); + ) -> Result> { + let mut col_metas = Vec::with_capacity(field_ids.len()); + let mut col_field_map = HashMap::with_capacity(field_ids.len()); for field_id in field_ids { let col_name = format!("{}-{}", name, field_id); - let col_meta = inverted_index_meta_map.get(&col_name).unwrap(); - + let col_meta = inverted_index_meta_map.get(&col_name).ok_or_else(|| { + ErrorCode::TantivyError(format!( + "inverted index column `{}` does not exist", + col_name + )) + })?; col_metas.push((col_name.clone(), col_meta)); col_field_map.insert(col_name, *field_id); } @@ -131,18 +132,16 @@ impl InvertedIndexReader { Ok(col_files) } - // first version search function, using tantivy searcher. - async fn search_v0<'a>( + // legacy query search function, using tantivy searcher. 
+ async fn legacy_search<'a>( &self, index_path: &'a str, query: Box, - field_nums: usize, has_score: bool, tokenizer_manager: TokenizerManager, - row_count: u32, + row_count: u64, ) -> Result)>>> { - let directory = - load_inverted_index_directory(self.dal.clone(), field_nums, index_path).await?; + let directory = load_inverted_index_directory(self.dal.clone(), index_path).await?; let mut index = Index::open(directory)?; index.set_tokenizers(tokenizer_manager); @@ -179,34 +178,48 @@ impl InvertedIndexReader { } // Follow the process below to perform the query search: - // 1. Read the `fst` first, check if the term in the query matches, - // return if it doesn't matched. - // 2. Read the `term dict` to get the `postings_range` in `idx` - // and the `positions_range` in `pos` for each terms. - // 3. Read the `doc_ids` and `term_freqs` in `idx` for each terms - // using `postings_range`. - // 4. If it's a phrase query, read the `position` of each terms in - // `pos` using `positions_range`. - // 5. Collect matched doc ids using term-related information. + // + // 1. Read the `fst` first, check if the term in the query matches. + // If it matches, collect the terms that need to be checked + // for subsequent processing, otherwise, return directly + // and ignore the block. + // 2. Read the `term_info` for each terms from the `term_dict`, + // which contains three parts: + // `doc_freq` is the number of docs containing the term. + // `postings_range` is used to read posting list in the postings (`.idx`) file. + // `positions_range` is used to read positions in the positions (`.pos`) file. + // 3. Read `postings` data from postings (`.idx`) file by `postings_range` + // of each terms, and `positions` data from positions (`.pos`) file + // by `positions_range` of each terms. + // 4. Open `BlockSegmentPostings` using `postings` data for each terms, + // which can be used to read `doc_ids` and `term_freqs`. + // 5. If the query is a phrase query, Open `PositionReader` using + // `positions` data for each terms, which can be use to read + // term `positions` in each docs. + // 6. Collect matched `doc_ids` of the query. + // If the query is a term query, the `doc_ids` can read from `BlockSegmentPostings`. + // If the query is a phrase query, in addition to `doc_ids`, also need to + // use `PositionReader` to read the positions for each terms and check whether + // the position of terms in doc is the same as the position of terms in query. // // If the term does not match, only the `fst` file needs to be read. - // If the term matches, only the `idx` and `pos` data of the related terms - // need to be read instead of all the `idx` and `pos` data. + // If the term matches, the `term_dict` and `postings`, `positions` + // data of the related terms need to be read instead of all + // the `postings` and `positions` data. #[allow(clippy::too_many_arguments)] async fn search<'a>( &self, index_path: &'a str, query: Box, - field_ids: &HashSet, - field_nums: usize, + field_ids: &HashSet, need_position: bool, has_score: bool, index_record: &IndexRecordOption, fuzziness: &Option, tokenizer_manager: TokenizerManager, - row_count: u32, + row_count: u64, ) -> Result)>>> { - // 1. read index meta + // 1. read index meta. let inverted_index_meta = load_inverted_index_meta(self.dal.clone(), index_path).await?; let inverted_index_meta_map = inverted_index_meta @@ -220,18 +233,11 @@ impl InvertedIndexReader { // use compatible search function to read. 
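+        // The legacy format merged the complete tantivy directory (including the
+        // `meta.json` file) into a single index file, so the presence of a
+        // `meta.json` entry identifies it; the per-column format has no such entry.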
if inverted_index_meta_map.contains_key("meta.json") { return self - .search_v0( - index_path, - query, - field_nums, - has_score, - tokenizer_manager, - row_count, - ) + .legacy_search(index_path, query, has_score, tokenizer_manager, row_count) .await; } - // 2. read fst files + // 2. read fst files. let fst_files = self .read_column_data(index_path, "fst", field_ids, &inverted_index_meta_map) .await?; @@ -250,15 +256,18 @@ impl InvertedIndexReader { // 3. check whether query is matched in the fsts. let mut matched_terms = HashMap::new(); + let mut prefix_terms = HashMap::new(); let mut fuzziness_terms = HashMap::new(); - let matched = check_term_fsts_match( + let matched = DocIdsCollector::check_term_fsts_match( query.box_clone(), &fst_maps, fuzziness, &mut matched_terms, + &mut prefix_terms, &mut fuzziness_terms, - ); + )?; + // if not matched, return without further check if !matched { return Ok(None); } @@ -268,71 +277,66 @@ impl InvertedIndexReader { .read_column_data(index_path, "term", field_ids, &inverted_index_meta_map) .await?; - let mut term_dict_maps = HashMap::new(); + let mut term_infos = HashMap::with_capacity(matched_terms.len()); for (field_id, term_dict_data) in term_dict_files.into_iter() { let term_dict_file = FileSlice::new(Arc::new(term_dict_data)); let term_info_store = TermInfoStore::open(term_dict_file)?; - term_dict_maps.insert(field_id, term_info_store); - } - let mut term_values = HashMap::new(); - for (term, term_ord) in matched_terms.iter() { - let field = term.field(); - let field_id = field.field_id() as usize; - - let term_dict = term_dict_maps.get(&field_id).unwrap(); - let term_info = term_dict.get(*term_ord); - - let term_value = TermValue { - term_info, - doc_ids: vec![], - term_freqs: vec![], - position_reader: None, - }; - term_values.insert(term.clone(), term_value); + for (_, (term_field_id, term_id)) in matched_terms.iter() { + if field_id == *term_field_id { + let term_info = term_info_store.get(*term_id); + term_infos.insert(*term_id, (field_id, term_info)); + } + } } // 5. read postings and optional positions. - // collect doc ids, term frequencies and optional position readers. 
- let mut slice_metas = Vec::with_capacity(term_values.len()); - let mut name_map = HashMap::new(); - for (term, term_value) in term_values.iter() { - let field = term.field(); - let field_id = field.field_id() as usize; - + let term_slice_len = if need_position { + term_infos.len() * 2 + } else { + term_infos.len() + }; + let mut slice_metas = Vec::with_capacity(term_slice_len); + let mut slice_name_map = HashMap::with_capacity(term_slice_len); + for (term_id, (field_id, term_info)) in term_infos.iter() { let idx_name = format!("idx-{}", field_id); - let idx_meta = inverted_index_meta_map.get(&idx_name).unwrap(); - + let idx_meta = inverted_index_meta_map.get(&idx_name).ok_or_else(|| { + ErrorCode::TantivyError(format!( + "inverted index column `{}` does not exist", + idx_name + )) + })?; // ignore 8 bytes total_num_tokens_slice - let offset = idx_meta.offset + 8 + (term_value.term_info.postings_range.start as u64); - let len = term_value.term_info.postings_range.len() as u64; + let offset = idx_meta.offset + 8 + (term_info.postings_range.start as u64); + let len = term_info.postings_range.len() as u64; let idx_slice_meta = SingleColumnMeta { offset, len, num_values: 1, }; - let idx_slice_name = - format!("{}-{}", idx_name, term_value.term_info.postings_range.start); + let idx_slice_name = format!("{}-{}", idx_name, term_info.postings_range.start); slice_metas.push((idx_slice_name.clone(), idx_slice_meta)); - name_map.insert(idx_slice_name, term.clone()); + slice_name_map.insert(idx_slice_name, *term_id); if need_position { let pos_name = format!("pos-{}", field_id); - let pos_meta = inverted_index_meta_map.get(&pos_name).unwrap(); - let offset = pos_meta.offset + (term_value.term_info.positions_range.start as u64); - let len = term_value.term_info.positions_range.len() as u64; + let pos_meta = inverted_index_meta_map.get(&pos_name).ok_or_else(|| { + ErrorCode::TantivyError(format!( + "inverted index column `{}` does not exist", + pos_name + )) + })?; + let offset = pos_meta.offset + (term_info.positions_range.start as u64); + let len = term_info.positions_range.len() as u64; let pos_slice_meta = SingleColumnMeta { offset, len, num_values: 1, }; - let pos_slice_name = format!( - "{}-{}", - pos_name, term_value.term_info.positions_range.start - ); + let pos_slice_name = format!("{}-{}", pos_name, term_info.positions_range.start); slice_metas.push((pos_slice_name.clone(), pos_slice_meta)); - name_map.insert(pos_slice_name, term.clone()); + slice_name_map.insert(pos_slice_name, *term_id); } } @@ -347,53 +351,58 @@ impl InvertedIndexReader { .map(|f| (f.name.clone(), f.data.clone())) .collect::>(); + let mut block_postings_map = HashMap::with_capacity(term_infos.len()); + let mut position_reader_map = HashMap::with_capacity(term_infos.len()); for (slice_name, slice_data) in slice_files.into_iter() { - let term = name_map.get(&slice_name).unwrap(); - let term_value = term_values.get_mut(term).unwrap(); + let term_id = slice_name_map.remove(&slice_name).unwrap(); + let (_, term_info) = term_infos.get(&term_id).unwrap(); if slice_name.starts_with("idx") { let posting_file = FileSlice::new(Arc::new(slice_data)); - let postings = BlockSegmentPostings::open( - term_value.term_info.doc_freq, + let block_postings = BlockSegmentPostings::open( + term_info.doc_freq, posting_file, *index_record, *index_record, )?; - let doc_ids = postings.docs(); - let term_freqs = postings.freqs(); - term_value.doc_ids = doc_ids.to_vec(); - term_value.term_freqs = term_freqs.to_vec(); + 
block_postings_map.insert(term_id, block_postings); } else if slice_name.starts_with("pos") { let position_reader = PositionReader::open(slice_data)?; - term_value.position_reader = Some(position_reader); + position_reader_map.insert(term_id, position_reader); } } - // 6. collect matched rows by term values. - let matched_docs = collect_matched_rows( - query.box_clone(), + // 6. collect matched doc ids. + let mut collector = DocIdsCollector::create( row_count, - &fuzziness_terms, - &mut term_values, + need_position, + matched_terms, + term_infos, + block_postings_map, + position_reader_map, ); - - if !matched_docs.is_empty() { - let mut matched_rows = Vec::with_capacity(matched_docs.len()); - if has_score { - // TODO: add score - for doc_id in matched_docs { - matched_rows.push((doc_id as usize, Some(F32::from(1.0)))); - } - } else { - for doc_id in matched_docs { - matched_rows.push((doc_id as usize, None)) + let matched_doc_ids = + collector.collect_matched_rows(query.box_clone(), &prefix_terms, &fuzziness_terms)?; + + if let Some(matched_doc_ids) = matched_doc_ids { + if !matched_doc_ids.is_empty() { + let mut matched_rows = Vec::with_capacity(matched_doc_ids.len() as usize); + let doc_ids_iter = matched_doc_ids.iter(); + if has_score { + // TODO: add score + for doc_id in doc_ids_iter { + matched_rows.push((doc_id as usize, Some(F32::from(1.0)))); + } + } else { + for doc_id in doc_ids_iter { + matched_rows.push((doc_id as usize, None)); + } } + return Ok(Some(matched_rows)); } - Ok(Some(matched_rows)) - } else { - Ok(None) } + Ok(None) } // delegation of [InvertedIndexFileReader::cache_key_of_index_columns] diff --git a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs index 8935a3420e78..232ebe3ab773 100644 --- a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs +++ b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs @@ -14,8 +14,6 @@ use std::collections::BTreeMap; use std::collections::HashSet; -use std::io::Write; -use std::path::Path; use std::sync::Arc; use arrow_ipc::writer::write_message; @@ -52,16 +50,12 @@ use tantivy::tokenizer::Stemmer; use tantivy::tokenizer::StopWordFilter; use tantivy::tokenizer::TextAnalyzer; use tantivy::tokenizer::TokenizerManager; -use tantivy::Directory; -use tantivy::Index; use tantivy::IndexBuilder; use tantivy::IndexSettings; use tantivy::IndexWriter; use tantivy::SegmentComponent; use tantivy_jieba::JiebaTokenizer; -use crate::index::build_tantivy_footer; - pub struct InvertedIndexWriter { schema: DataSchemaRef, index_writer: IndexWriter, @@ -175,178 +169,6 @@ impl InvertedIndexWriter { Ok((inverted_index_schema, inverted_index_block)) } - - // The tantivy index data consists of eight files. 
- // `.managed.json` file stores the name of the file that the index contains, for example: - // [ - // "94bce521d5bc4eccbf3e7a0212093622.pos", - // "94bce521d5bc4eccbf3e7a0212093622.fieldnorm", - // "94bce521d5bc4eccbf3e7a0212093622.fast", - // "meta.json", - // "94bce521d5bc4eccbf3e7a0212093622.term", - // "94bce521d5bc4eccbf3e7a0212093622.store", - // "94bce521d5bc4eccbf3e7a0212093622.idx" - // ] - // - // `meta.json` file store the meta information associated with the index, for example: - // { - // "index_settings": { - // "docstore_compression": "lz4", - // "docstore_blocksize": 16384 - // }, - // "segments":[{ - // "segment_id": "94bce521-d5bc-4ecc-bf3e-7a0212093622", - // "max_doc": 6, - // "deletes": null - // }], - // "schema":[{ - // "name": "title", - // "type": "text", - // "options": { - // "indexing": { - // "record": "position", - // "fieldnorms": true, - // "tokenizer": "en" - // }, - // "stored": false, - // "fast": false - // } - // }, { - // "name": "content", - // "type": "text", - // "options": { - // "indexing": { - // "record": "position", - // "fieldnorms": true, - // "tokenizer": "en" - // }, - // "stored": false, - // "fast": false - // } - // }], - // "opstamp":0 - // } - // - // `terms` file stores the term dictionary, the value is - // an address into the `postings` file and the `positions` file. - // `postings` file stores the lists of document ids and freqs. - // `positions` file stores the positions of terms in each document. - // `field norms` file stores the sum of the length of the term. - // `fast fields` file stores column-oriented documents. - // `store` file stores row-oriented documents. - // - // More details can be seen here - // https://github.com/quickwit-oss/tantivy/blob/main/src/index/segment_component.rs#L8 - // - // We merge the data from these files into one file and - // record the offset to read each part of the data. 
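The layout described above (each tantivy component written back to back, followed by one little-endian u32 cumulative end offset per part and a final u32 holding the number of offsets) can be decoded on the read side roughly as follows. This is an illustrative, self-contained sketch with hypothetical names, not code from this patch:

fn decode_index_offsets(data: &[u8]) -> Option<Vec<u32>> {
    // The last 4 bytes hold the number of offsets, preceded by that many
    // little-endian u32 cumulative end offsets (one per merged part).
    if data.len() < 4 {
        return None;
    }
    let nums = u32::from_le_bytes(data[data.len() - 4..].try_into().ok()?) as usize;
    let table_len = nums.checked_mul(4)?.checked_add(4)?;
    if data.len() < table_len {
        return None;
    }
    let table = &data[data.len() - table_len..data.len() - 4];
    Some(
        table
            .chunks_exact(4)
            .map(|c| u32::from_le_bytes(c.try_into().unwrap()))
            .collect(),
    )
}

fn main() {
    // Two parts of 3 and 5 bytes: cumulative end offsets 3 and 8, then count 2.
    let mut buf = vec![0u8; 8];
    buf.extend_from_slice(&3u32.to_le_bytes());
    buf.extend_from_slice(&8u32.to_le_bytes());
    buf.extend_from_slice(&2u32.to_le_bytes());
    assert_eq!(decode_index_offsets(&buf), Some(vec![3, 8]));
}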
- #[async_backtrace::framed] - fn write_index(mut writer: &mut W, index: &Index) -> Result<()> { - let directory = index.directory(); - - let managed_filepath = Path::new(".managed.json"); - let managed_bytes = directory.atomic_read(managed_filepath)?; - - let meta_filepath = Path::new("meta.json"); - let meta_data = directory.atomic_read(meta_filepath)?; - - let meta_string = std::str::from_utf8(&meta_data)?; - let meta_val: serde_json::Value = serde_json::from_str(meta_string)?; - let meta_json: String = serde_json::to_string(&meta_val)?; - - let segments = index.searchable_segments()?; - let segment = &segments[0]; - - let fast_fields = segment.open_read(SegmentComponent::FastFields)?; - let fast_fields_bytes = fast_fields.read_bytes()?; - - let store = segment.open_read(SegmentComponent::Store)?; - let store_bytes = store.read_bytes()?; - - let field_norms = segment.open_read(SegmentComponent::FieldNorms)?; - let field_norms_bytes = field_norms.read_bytes()?; - - let positions = segment.open_read(SegmentComponent::Positions)?; - let positions_bytes = positions.read_bytes()?; - - let postings = segment.open_read(SegmentComponent::Postings)?; - let postings_bytes = postings.read_bytes()?; - - let terms = segment.open_read(SegmentComponent::Terms)?; - let terms_bytes = terms.read_bytes()?; - - // write each tantivy files as part of data - let mut fast_fields_length = writer.write(&fast_fields_bytes)?; - let footer_length = Self::build_footer(&mut writer, &fast_fields_bytes)?; - fast_fields_length += footer_length; - - let mut store_length = writer.write(&store_bytes)?; - let footer_length = Self::build_footer(&mut writer, &store_bytes)?; - store_length += footer_length; - - let mut field_norms_length = writer.write(&field_norms_bytes)?; - let footer_length = Self::build_footer(&mut writer, &field_norms_bytes)?; - field_norms_length += footer_length; - - let mut positions_length = writer.write(&positions_bytes)?; - let footer_length = Self::build_footer(&mut writer, &positions_bytes)?; - positions_length += footer_length; - - let mut postings_length = writer.write(&postings_bytes)?; - let footer_length = Self::build_footer(&mut writer, &postings_bytes)?; - postings_length += footer_length; - - let mut terms_length = writer.write(&terms_bytes)?; - let footer_length = Self::build_footer(&mut writer, &terms_bytes)?; - terms_length += footer_length; - - let meta_length = writer.write(meta_json.as_bytes())?; - let managed_length = writer.write(&managed_bytes)?; - - // write offsets of each parts - let mut offset: u32 = 0; - let mut offsets = Vec::with_capacity(8); - offset += fast_fields_length as u32; - offsets.push(offset); - - offset += store_length as u32; - offsets.push(offset); - - offset += field_norms_length as u32; - offsets.push(offset); - - offset += positions_length as u32; - offsets.push(offset); - - offset += postings_length as u32; - offsets.push(offset); - - offset += terms_length as u32; - offsets.push(offset); - - offset += meta_length as u32; - offsets.push(offset); - - offset += managed_length as u32; - offsets.push(offset); - - // the number of offsets, used for multi index segments in one file - let nums = offsets.len() as u32; - for offset in offsets { - writer.write_all(&offset.to_le_bytes())?; - } - writer.write_all(&nums.to_le_bytes())?; - - writer.flush()?; - - Ok(()) - } - - fn build_footer(writer: &mut W, bytes: &[u8]) -> Result { - let buf = build_tantivy_footer(bytes)?; - let len = writer.write(&buf)?; - Ok(len) - } } // inverted index block include 5 types of 
data, diff --git a/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs b/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs index 9f58e4233edf..a5f585e15dd5 100644 --- a/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs +++ b/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs @@ -53,13 +53,12 @@ use crate::io::TableMetaLocationGenerator; pub struct InvertedIndexPruner { dal: Operator, has_score: bool, - field_nums: usize, index_name: String, index_version: String, need_position: bool, tokenizer_manager: TokenizerManager, query: Box, - field_ids: HashSet, + field_ids: HashSet, index_record: IndexRecordOption, fuzziness: Option, } @@ -84,7 +83,7 @@ impl InvertedIndexPruner { let mut field_ids = HashSet::new(); query.query_terms(&mut |term, pos| { let field = term.field(); - let field_id = field.field_id() as usize; + let field_id = field.field_id(); field_ids.insert(field_id); if pos { need_position = true; @@ -93,14 +92,12 @@ impl InvertedIndexPruner { // whether need to generate score internl column let has_score = inverted_index_info.has_score; - let field_nums = inverted_index_info.index_schema.num_fields(); let index_name = inverted_index_info.index_name.clone(); let index_version = inverted_index_info.index_version.clone(); return Ok(Some(Arc::new(InvertedIndexPruner { dal, has_score, - field_nums, index_name, index_version, need_position, @@ -130,7 +127,6 @@ impl InvertedIndexPruner { let matched_rows = inverted_index_reader .do_filter( - self.field_nums, self.need_position, self.has_score, self.query.box_clone(), @@ -138,7 +134,7 @@ impl InvertedIndexPruner { &self.index_record, &self.fuzziness, self.tokenizer_manager.clone(), - row_count as u32, + row_count, &index_loc, ) .await?; diff --git a/src/query/storages/stream/src/stream_table.rs b/src/query/storages/stream/src/stream_table.rs index cf5599b468d0..e593ab106acd 100644 --- a/src/query/storages/stream/src/stream_table.rs +++ b/src/query/storages/stream/src/stream_table.rs @@ -264,10 +264,9 @@ impl StreamTable { .get_table_name_by_id(source_table_id) .await .and_then(|opt| { - opt.ok_or(ErrorCode::UnknownTable(format!( - "Unknown table id: '{}'", - source_table_id - ))) + opt.ok_or_else(|| { + ErrorCode::UnknownTable(format!("Unknown table id: '{}'", source_table_id)) + }) }) } @@ -286,7 +285,7 @@ impl StreamTable { let source_table_meta = catalog .get_table_meta_by_id(source_table_id) .await? 
- .ok_or(ErrorCode::Internal("source database id must be set"))?; + .ok_or_else(|| ErrorCode::Internal("source database id must be set"))?; source_table_meta .data .options From c304e3c96b4823182b92777ac0f1a2504fe85c41 Mon Sep 17 00:00:00 2001 From: zhya Date: Mon, 30 Sep 2024 14:57:47 +0800 Subject: [PATCH 13/14] chore(storage): refactor compact source (#16527) refactor compact source --- .../src/pipelines/builders/builder_compact.rs | 141 +++++++++- src/query/storages/fuse/src/fuse_table.rs | 4 + .../storages/fuse/src/operations/compact.rs | 134 ---------- .../fuse/src/operations/mutation/meta/mod.rs | 1 + .../operations/mutation/meta/mutation_meta.rs | 44 ++++ .../mutation/processors/compact_source.rs | 243 ++++++++---------- .../src/operations/mutation/processors/mod.rs | 1 + 7 files changed, 289 insertions(+), 279 deletions(-) diff --git a/src/query/service/src/pipelines/builders/builder_compact.rs b/src/query/service/src/pipelines/builders/builder_compact.rs index 0888f5571715..03fadf872e31 100644 --- a/src/query/service/src/pipelines/builders/builder_compact.rs +++ b/src/query/service/src/pipelines/builders/builder_compact.rs @@ -12,15 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. +use databend_common_base::runtime::Runtime; +use databend_common_catalog::plan::PartInfoType; +use databend_common_catalog::plan::Partitions; +use databend_common_catalog::plan::PartitionsShuffleKind; +use databend_common_catalog::plan::Projection; +use databend_common_catalog::table::Table; +use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; use databend_common_pipeline_sources::EmptySource; -use databend_common_sql::executor::physical_plans::CompactSource; +use databend_common_pipeline_sources::PrefetchAsyncSourcer; +use databend_common_pipeline_transforms::processors::TransformPipelineHelper; +use databend_common_sql::executor::physical_plans::CompactSource as PhysicalCompactSource; +use databend_common_sql::executor::physical_plans::MutationKind; +use databend_common_sql::StreamContext; +use databend_common_storages_fuse::operations::BlockCompactMutator; +use databend_common_storages_fuse::operations::CompactLazyPartInfo; +use databend_common_storages_fuse::operations::CompactSource; +use databend_common_storages_fuse::operations::CompactTransform; +use databend_common_storages_fuse::operations::TableMutationAggregator; +use databend_common_storages_fuse::operations::TransformSerializeBlock; use databend_common_storages_fuse::FuseTable; use crate::pipelines::PipelineBuilder; impl PipelineBuilder { - pub(crate) fn build_compact_source(&mut self, compact_block: &CompactSource) -> Result<()> { + pub(crate) fn build_compact_source( + &mut self, + compact_block: &PhysicalCompactSource, + ) -> Result<()> { let table = self .ctx .build_table_by_table_info(&compact_block.table_info, None)?; @@ -30,11 +50,120 @@ impl PipelineBuilder { return self.main_pipeline.add_source(EmptySource::create, 1); } - table.build_compact_source( + let is_lazy = compact_block.parts.partitions_type() == PartInfoType::LazyLevel; + let thresholds = table.get_block_thresholds(); + let cluster_key_id = table.cluster_key_id(); + let mut max_threads = self.ctx.get_settings().get_max_threads()? 
as usize; + + if is_lazy { + let query_ctx = self.ctx.clone(); + + let lazy_parts = compact_block + .parts + .partitions + .iter() + .map(|v| { + v.as_any() + .downcast_ref::() + .unwrap() + .clone() + }) + .collect::>(); + + let column_ids = compact_block.column_ids.clone(); + self.main_pipeline.set_on_init(move || { + let ctx = query_ctx.clone(); + let partitions = Runtime::with_worker_threads(2, None)?.block_on(async move { + let partitions = BlockCompactMutator::build_compact_tasks( + ctx.clone(), + column_ids.clone(), + cluster_key_id, + thresholds, + lazy_parts, + ) + .await?; + + Result::<_>::Ok(partitions) + })?; + + let partitions = Partitions::create(PartitionsShuffleKind::Mod, partitions); + query_ctx.set_partitions(partitions)?; + Ok(()) + }); + } else { + max_threads = max_threads.min(compact_block.parts.len()).max(1); + self.ctx.set_partitions(compact_block.parts.clone())?; + } + + let block_reader = table.create_block_reader( + self.ctx.clone(), + Projection::Columns(table.all_column_indices()), + false, + table.change_tracking_enabled(), + false, + )?; + let stream_ctx = if table.change_tracking_enabled() { + Some(StreamContext::try_create( + self.ctx.get_function_context()?, + table.schema_with_stream(), + table.get_table_info().ident.seq, + false, + false, + )?) + } else { + None + }; + // Add source pipe. + self.main_pipeline.add_source( + |output| { + let source = CompactSource::create(self.ctx.clone(), block_reader.clone(), 1); + PrefetchAsyncSourcer::create(self.ctx.clone(), output, source) + }, + max_threads, + )?; + let storage_format = table.get_storage_format(); + self.main_pipeline.add_block_meta_transformer(|| { + CompactTransform::create( + self.ctx.clone(), + block_reader.clone(), + storage_format, + stream_ctx.clone(), + ) + }); + + // sort + let cluster_stats_gen = table.cluster_gen_for_append( self.ctx.clone(), - compact_block.parts.clone(), - compact_block.column_ids.clone(), &mut self.main_pipeline, - ) + thresholds, + None, + )?; + self.main_pipeline.add_transform(|input, output| { + let proc = TransformSerializeBlock::try_create( + self.ctx.clone(), + input, + output, + table, + cluster_stats_gen.clone(), + MutationKind::Compact, + )?; + proc.into_processor() + })?; + + if is_lazy { + self.main_pipeline.try_resize(1)?; + self.main_pipeline.add_async_accumulating_transformer(|| { + TableMutationAggregator::create( + table, + self.ctx.clone(), + vec![], + vec![], + vec![], + Default::default(), + MutationKind::Compact, + ) + }); + } + Ok(()) } } diff --git a/src/query/storages/fuse/src/fuse_table.rs b/src/query/storages/fuse/src/fuse_table.rs index 1895b6ac8bb0..71ef0b77de75 100644 --- a/src/query/storages/fuse/src/fuse_table.rs +++ b/src/query/storages/fuse/src/fuse_table.rs @@ -461,6 +461,10 @@ impl FuseTable { }; Ok(retention_period) } + + pub fn get_storage_format(&self) -> FuseStorageFormat { + self.storage_format + } } #[async_trait::async_trait] diff --git a/src/query/storages/fuse/src/operations/compact.rs b/src/query/storages/fuse/src/operations/compact.rs index 491c089c382a..d17fe2f72cb6 100644 --- a/src/query/storages/fuse/src/operations/compact.rs +++ b/src/query/storages/fuse/src/operations/compact.rs @@ -12,31 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
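As a side note on the `max_threads.min(parts.len()).max(1)` clamp used above when the partitions are not lazy: it bounds source parallelism by the number of partitions while never dropping to zero workers. A standalone illustration, not part of the patch:

fn effective_threads(max_threads: usize, num_parts: usize) -> usize {
    // Bounded by the partition count, but always at least one worker so the
    // pipeline can still be built when the partition list is empty.
    max_threads.min(num_parts).max(1)
}

fn main() {
    assert_eq!(effective_threads(8, 3), 3);
    assert_eq!(effective_threads(8, 0), 1);
    assert_eq!(effective_threads(2, 10), 2);
}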
-use std::collections::HashSet; use std::sync::Arc; -use databend_common_base::runtime::Runtime; -use databend_common_catalog::plan::PartInfoType; use databend_common_catalog::plan::Partitions; -use databend_common_catalog::plan::PartitionsShuffleKind; -use databend_common_catalog::plan::Projection; use databend_common_catalog::table::CompactionLimits; use databend_common_exception::Result; -use databend_common_expression::ColumnId; use databend_common_expression::ComputedExpr; use databend_common_expression::FieldIndex; -use databend_common_pipeline_core::Pipeline; -use databend_common_pipeline_transforms::processors::TransformPipelineHelper; -use databend_common_sql::executor::physical_plans::MutationKind; -use databend_common_sql::StreamContext; -use databend_storages_common_table_meta::meta::Statistics; use databend_storages_common_table_meta::meta::TableSnapshot; -use crate::operations::common::TableMutationAggregator; -use crate::operations::common::TransformSerializeBlock; use crate::operations::mutation::BlockCompactMutator; -use crate::operations::mutation::CompactLazyPartInfo; -use crate::operations::mutation::CompactSource; use crate::operations::mutation::SegmentCompactMutator; use crate::FuseTable; use crate::Table; @@ -119,125 +104,6 @@ impl FuseTable { ))) } - pub fn build_compact_source( - &self, - ctx: Arc, - parts: Partitions, - column_ids: HashSet, - pipeline: &mut Pipeline, - ) -> Result<()> { - let is_lazy = parts.partitions_type() == PartInfoType::LazyLevel; - let thresholds = self.get_block_thresholds(); - let cluster_key_id = self.cluster_key_id(); - let mut max_threads = ctx.get_settings().get_max_threads()? as usize; - - if is_lazy { - let query_ctx = ctx.clone(); - - let lazy_parts = parts - .partitions - .into_iter() - .map(|v| { - v.as_any() - .downcast_ref::() - .unwrap() - .clone() - }) - .collect::>(); - - pipeline.set_on_init(move || { - let ctx = query_ctx.clone(); - let column_ids = column_ids.clone(); - let partitions = Runtime::with_worker_threads(2, None)?.block_on(async move { - let partitions = BlockCompactMutator::build_compact_tasks( - ctx.clone(), - column_ids, - cluster_key_id, - thresholds, - lazy_parts, - ) - .await?; - - Result::<_>::Ok(partitions) - })?; - - let partitions = Partitions::create(PartitionsShuffleKind::Mod, partitions); - query_ctx.set_partitions(partitions)?; - Ok(()) - }); - } else { - max_threads = max_threads.min(parts.len()).max(1); - ctx.set_partitions(parts)?; - } - - let all_column_indices = self.all_column_indices(); - let projection = Projection::Columns(all_column_indices); - let block_reader = self.create_block_reader( - ctx.clone(), - projection, - false, - self.change_tracking_enabled(), - false, - )?; - let stream_ctx = if self.change_tracking_enabled() { - Some(StreamContext::try_create( - ctx.get_function_context()?, - self.schema_with_stream(), - self.get_table_info().ident.seq, - false, - false, - )?) - } else { - None - }; - // Add source pipe. 
- pipeline.add_source( - |output| { - CompactSource::try_create( - ctx.clone(), - self.storage_format, - block_reader.clone(), - stream_ctx.clone(), - output, - ) - }, - max_threads, - )?; - - // sort - let cluster_stats_gen = - self.cluster_gen_for_append(ctx.clone(), pipeline, thresholds, None)?; - pipeline.add_transform( - |input: Arc, output| { - let proc = TransformSerializeBlock::try_create( - ctx.clone(), - input, - output, - self, - cluster_stats_gen.clone(), - MutationKind::Compact, - )?; - proc.into_processor() - }, - )?; - - if is_lazy { - pipeline.try_resize(1)?; - pipeline.add_async_accumulating_transformer(|| { - TableMutationAggregator::create( - self, - ctx.clone(), - vec![], - vec![], - vec![], - Statistics::default(), - MutationKind::Compact, - ) - }); - } - Ok(()) - } - async fn compact_options_with_segment_limit( &self, num_segment_limit: Option, diff --git a/src/query/storages/fuse/src/operations/mutation/meta/mod.rs b/src/query/storages/fuse/src/operations/mutation/meta/mod.rs index 776b7d48979b..0a14e6b7ad28 100644 --- a/src/query/storages/fuse/src/operations/mutation/meta/mod.rs +++ b/src/query/storages/fuse/src/operations/mutation/meta/mod.rs @@ -21,6 +21,7 @@ pub use compact_part::CompactExtraInfo; pub use compact_part::CompactLazyPartInfo; pub use compact_part::CompactTaskInfo; pub use mutation_meta::ClusterStatsGenType; +pub use mutation_meta::CompactSourceMeta; pub use mutation_meta::SerializeBlock; pub use mutation_meta::SerializeDataMeta; pub use mutation_part::DeletedSegmentInfo; diff --git a/src/query/storages/fuse/src/operations/mutation/meta/mutation_meta.rs b/src/query/storages/fuse/src/operations/mutation/meta/mutation_meta.rs index b98943f6821e..913462add22c 100644 --- a/src/query/storages/fuse/src/operations/mutation/meta/mutation_meta.rs +++ b/src/query/storages/fuse/src/operations/mutation/meta/mutation_meta.rs @@ -12,13 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
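The `CompactSourceMeta` added below only travels between operators inside a single process (it carries raw merge-IO read results and Arc'd block metas), which is presumably why its clone and serde hooks are left as `unimplemented!`. A minimal standalone sketch of that pattern, using placeholder names rather than the real `BlockMetaInfo` trait:

use std::any::Any;
use std::fmt;

// Placeholder trait standing in for a block-meta API that normally supports cloning.
trait BlockMeta: fmt::Debug + Send + Sync + Any {
    fn clone_self(&self) -> Box<dyn BlockMeta>;
}

#[derive(Debug)]
struct InProcessOnlyMeta {
    // Owned buffers that are cheap to move but pointless to serialize.
    payload: Vec<u8>,
}

impl BlockMeta for InProcessOnlyMeta {
    fn clone_self(&self) -> Box<dyn BlockMeta> {
        // This meta is handed from one pipeline operator to the next in memory,
        // so cloning (like serialization) is deliberately unsupported.
        unimplemented!("InProcessOnlyMeta is not meant to be cloned")
    }
}

fn main() {
    let meta = InProcessOnlyMeta { payload: vec![1, 2, 3] };
    println!("{:?}", meta);
}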
+use std::sync::Arc; + use databend_common_expression::BlockMetaInfo; use databend_common_expression::BlockMetaInfoDowncast; +use databend_storages_common_table_meta::meta::BlockMeta; use databend_storages_common_table_meta::meta::ClusterStatistics; use crate::operations::common::BlockMetaIndex; use crate::operations::mutation::CompactExtraInfo; use crate::operations::mutation::DeletedSegmentInfo; +use crate::MergeIOReadResult; #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)] pub enum SerializeDataMeta { @@ -55,3 +59,43 @@ impl SerializeBlock { SerializeBlock { index, stats_type } } } + +pub enum CompactSourceMeta { + Concat { + read_res: Vec, + metas: Vec>, + index: BlockMetaIndex, + }, + Extras(CompactExtraInfo), +} + +#[typetag::serde(name = "compact_data_source")] +impl BlockMetaInfo for CompactSourceMeta { + fn equals(&self, _: &Box) -> bool { + unimplemented!("Unimplemented equals CompactSourceMeta") + } + + fn clone_self(&self) -> Box { + unimplemented!("Unimplemented clone CompactSourceMeta") + } +} + +impl std::fmt::Debug for CompactSourceMeta { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("CompactSourceMeta").finish() + } +} + +impl serde::Serialize for CompactSourceMeta { + fn serialize(&self, _: S) -> std::result::Result + where S: serde::Serializer { + unimplemented!("Unimplemented serialize DataSourceMeta") + } +} + +impl<'de> serde::Deserialize<'de> for CompactSourceMeta { + fn deserialize(_: D) -> std::result::Result + where D: serde::Deserializer<'de> { + unimplemented!("Unimplemented deserialize DataSourceMeta") + } +} diff --git a/src/query/storages/fuse/src/operations/mutation/processors/compact_source.rs b/src/query/storages/fuse/src/operations/mutation/processors/compact_source.rs index 203a6e73ed3f..0256651a7a3b 100644 --- a/src/query/storages/fuse/src/operations/mutation/processors/compact_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/processors/compact_source.rs @@ -12,125 +12,148 @@ // See the License for the specific language governing permissions and // limitations under the License. 
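The reworked source below reads the columns of each block in parallel by spawning one task per block and joining them all. The same pattern in a self-contained form, assuming a tokio runtime and the futures crate, with a placeholder read function standing in for the real merge-IO read:

use futures::future::try_join_all;

async fn read_block(id: u64) -> std::io::Result<Vec<u8>> {
    // Placeholder for the real column read; returns the block id as bytes.
    Ok(id.to_le_bytes().to_vec())
}

async fn read_all_blocks(ids: Vec<u64>) -> std::io::Result<Vec<Vec<u8>>> {
    // One task per block so the reads overlap; try_join_all fails fast
    // if any single read returns an error.
    let handles = ids
        .into_iter()
        .map(|id| tokio::spawn(read_block(id)))
        .collect::<Vec<_>>();
    let joined = try_join_all(handles).await.expect("read task panicked");
    joined.into_iter().collect()
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let blocks = read_all_blocks(vec![1, 2, 3]).await?;
    assert_eq!(blocks.len(), 3);
    Ok(())
}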
-use std::any::Any; use std::sync::Arc; use std::time::Instant; use databend_common_base::base::ProgressValues; use databend_common_catalog::plan::gen_mutation_stream_meta; -use databend_common_catalog::plan::PartInfoPtr; use databend_common_catalog::table_context::TableContext; -use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::DataBlock; use databend_common_metrics::storage::*; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_sources::PrefetchAsyncSource; +use databend_common_pipeline_transforms::processors::BlockMetaTransform; +use databend_common_pipeline_transforms::processors::UnknownMode; use databend_common_sql::StreamContext; -use databend_storages_common_table_meta::meta::BlockMeta; use crate::io::BlockReader; use crate::io::ReadSettings; -use crate::operations::mutation::ClusterStatsGenType; -use crate::operations::mutation::CompactBlockPartInfo; -use crate::operations::mutation::SerializeBlock; -use crate::operations::mutation::SerializeDataMeta; -use crate::operations::BlockMetaIndex; +use crate::operations::ClusterStatsGenType; +use crate::operations::CompactBlockPartInfo; +use crate::operations::CompactSourceMeta; +use crate::operations::SerializeBlock; +use crate::operations::SerializeDataMeta; use crate::FuseStorageFormat; -use crate::MergeIOReadResult; - -enum State { - ReadData(Option), - Concat { - read_res: Vec, - metas: Vec>, - index: BlockMetaIndex, - }, - Output(Option, DataBlock), - Finish, -} pub struct CompactSource { - state: State, ctx: Arc, block_reader: Arc, - storage_format: FuseStorageFormat, - output: Arc, - stream_ctx: Option, + prefetch_num: usize, } impl CompactSource { - pub fn try_create( + pub fn create( ctx: Arc, - storage_format: FuseStorageFormat, block_reader: Arc, - stream_ctx: Option, - output: Arc, - ) -> Result { - Ok(ProcessorPtr::create(Box::new(CompactSource { - state: State::ReadData(None), + prefetch_num: usize, + ) -> Self { + Self { ctx, block_reader, - storage_format, - output, - stream_ctx, - }))) + prefetch_num, + } } } #[async_trait::async_trait] -impl Processor for CompactSource { - fn name(&self) -> String { - "CompactSource".to_string() - } +impl PrefetchAsyncSource for CompactSource { + const NAME: &'static str = "CompactSource"; - fn as_any(&mut self) -> &mut dyn Any { - self + const SKIP_EMPTY_DATA_BLOCK: bool = false; + + fn is_full(&self, prefetched: &[DataBlock]) -> bool { + prefetched.len() >= self.prefetch_num } - fn event(&mut self) -> Result { - if matches!(self.state, State::ReadData(None)) { - self.state = self - .ctx - .get_partition() - .map_or(State::Finish, |part| State::ReadData(Some(part))); - } + async fn generate(&mut self) -> Result> { + let part = match self.ctx.get_partition() { + Some(part) => part, + None => return Ok(None), + }; - if self.output.is_finished() { - return Ok(Event::Finished); - } + let part = CompactBlockPartInfo::from_part(&part)?; + let meta = match part { + CompactBlockPartInfo::CompactTaskInfo(task) => { + let mut task_futures = Vec::new(); + for block in &task.blocks { + let settings = ReadSettings::from_ctx(&self.ctx)?; + let block_reader = self.block_reader.clone(); + let block = block.clone(); + // read block in parallel. 
+ task_futures.push(async move { + databend_common_base::runtime::spawn(async move { + // Perf + { + metrics_inc_compact_block_read_nums(1); + metrics_inc_compact_block_read_bytes(block.block_size); + } + + block_reader + .read_columns_data_by_merge_io( + &settings, + &block.location.0, + &block.col_metas, + &None, + ) + .await + }) + .await + .unwrap() + }); + } - if !self.output.can_push() { - return Ok(Event::NeedConsume); - } + let start = Instant::now(); - match self.state { - State::ReadData(_) => Ok(Event::Async), - State::Concat { .. } => Ok(Event::Sync), - State::Output(_, _) => { - if let State::Output(part, data_block) = - std::mem::replace(&mut self.state, State::Finish) + let read_res = futures::future::try_join_all(task_futures).await?; + // Perf. { - self.state = part.map_or(State::Finish, |part| State::ReadData(Some(part))); - - self.output.push_data(Ok(data_block)); - Ok(Event::NeedConsume) - } else { - Err(ErrorCode::Internal("It's a bug.")) + metrics_inc_compact_block_read_milliseconds(start.elapsed().as_millis() as u64); } + Box::new(CompactSourceMeta::Concat { + read_res, + metas: task.blocks.clone(), + index: task.index.clone(), + }) } - State::Finish => { - self.output.finish(); - Ok(Event::Finished) + CompactBlockPartInfo::CompactExtraInfo(extra) => { + Box::new(CompactSourceMeta::Extras(extra.clone())) } + }; + Ok(Some(DataBlock::empty_with_meta(meta))) + } +} + +pub struct CompactTransform { + ctx: Arc, + block_reader: Arc, + storage_format: FuseStorageFormat, + stream_ctx: Option, +} + +impl CompactTransform { + pub fn create( + ctx: Arc, + block_reader: Arc, + storage_format: FuseStorageFormat, + stream_ctx: Option, + ) -> Self { + Self { + ctx, + block_reader, + storage_format, + stream_ctx, } } +} + +#[async_trait::async_trait] +impl BlockMetaTransform for CompactTransform { + const UNKNOWN_MODE: UnknownMode = UnknownMode::Pass; + const NAME: &'static str = "CompactTransform"; - fn process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Finish) { - State::Concat { + fn transform(&mut self, meta: CompactSourceMeta) -> Result> { + match meta { + CompactSourceMeta::Concat { read_res, metas, index, @@ -171,70 +194,12 @@ impl Processor for CompactSource { bytes: new_block.memory_size(), }; self.ctx.get_write_progress().incr(&progress_values); - - self.state = State::Output(self.ctx.get_partition(), new_block); + Ok(vec![new_block]) } - _ => return Err(ErrorCode::Internal("It's a bug.")), - } - Ok(()) - } - - #[async_backtrace::framed] - async fn async_process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Finish) { - State::ReadData(Some(part)) => { - let block_reader = self.block_reader.as_ref(); - - // block read tasks. - let mut task_futures = Vec::new(); - let part = CompactBlockPartInfo::from_part(&part)?; - match part { - CompactBlockPartInfo::CompactExtraInfo(extra) => { - let meta = Box::new(SerializeDataMeta::CompactExtras(extra.clone())); - let block = DataBlock::empty_with_meta(meta); - self.state = State::Output(self.ctx.get_partition(), block); - } - CompactBlockPartInfo::CompactTaskInfo(task) => { - for block in &task.blocks { - let settings = ReadSettings::from_ctx(&self.ctx)?; - // read block in parallel. 
- task_futures.push(async move { - // Perf - { - metrics_inc_compact_block_read_nums(1); - metrics_inc_compact_block_read_bytes(block.block_size); - } - - block_reader - .read_columns_data_by_merge_io( - &settings, - &block.location.0, - &block.col_metas, - &None, - ) - .await - }); - } - - let start = Instant::now(); - - let read_res = futures::future::try_join_all(task_futures).await?; - // Perf. - { - metrics_inc_compact_block_read_milliseconds( - start.elapsed().as_millis() as u64, - ); - } - self.state = State::Concat { - read_res, - metas: task.blocks.clone(), - index: task.index.clone(), - }; - } - } - Ok(()) + CompactSourceMeta::Extras(extra) => { + let meta = Box::new(SerializeDataMeta::CompactExtras(extra)); + Ok(vec![DataBlock::empty_with_meta(meta)]) } - _ => Err(ErrorCode::Internal("It's a bug.")), } } } diff --git a/src/query/storages/fuse/src/operations/mutation/processors/mod.rs b/src/query/storages/fuse/src/operations/mutation/processors/mod.rs index f86df532bad5..ae312d9f438e 100644 --- a/src/query/storages/fuse/src/operations/mutation/processors/mod.rs +++ b/src/query/storages/fuse/src/operations/mutation/processors/mod.rs @@ -16,5 +16,6 @@ mod compact_source; mod mutation_source; pub use compact_source::CompactSource; +pub use compact_source::CompactTransform; pub use mutation_source::MutationAction; pub use mutation_source::MutationSource; From a1497e26094a81caefa9c5543c6b1c0223ad3cda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=82=8E=E6=B3=BC?= Date: Mon, 30 Sep 2024 16:28:04 +0800 Subject: [PATCH 14/14] fix: move TableInfo building out of SchemaApi (#16548) `SchemaApi` can not provide enough information such as `CatalogInfo` and `DatabaseType`. Therefore `SchemaApi` should not build a `TableInfo`, leave such task to its caller. --- src/meta/api/src/schema_api_impl.rs | 38 ++++++--------- src/meta/api/src/schema_api_test_suite.rs | 47 ++++++++++++------- src/meta/app/src/schema/table.rs | 25 +++++++++- .../src/catalogs/default/mutable_catalog.rs | 14 ++++-- 4 files changed, 80 insertions(+), 44 deletions(-) diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index ffdf39541597..6e4da9fddac3 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -2658,13 +2658,13 @@ impl + ?Sized> SchemaApi for KV { ) .await?; - let mut vacuum_table_infos = vec![]; + let mut vacuum_tables = vec![]; let mut vacuum_ids = vec![]; for db_info in db_infos { - if vacuum_table_infos.len() >= the_limit { + if vacuum_tables.len() >= the_limit { return Ok(ListDroppedTableResp { - drop_table_infos: vacuum_table_infos, + vacuum_tables, drop_ids: vacuum_ids, }); } @@ -2679,7 +2679,7 @@ impl + ?Sized> SchemaApi for KV { drop_time_range.clone() }; - let capacity = the_limit - vacuum_table_infos.len(); + let capacity = the_limit - vacuum_tables.len(); let table_nivs = get_history_tables_for_gc( self, table_drop_time_range, @@ -2692,26 +2692,22 @@ impl + ?Sized> SchemaApi for KV { vacuum_ids.push(DroppedId::from(table_niv.clone())); } + let db_name = db_info.name_ident.database_name().to_string(); + let db_name_ident = db_info.name_ident.clone(); + // A DB can be removed only when all its tables are removed. 
if vacuum_db && capacity > table_nivs.len() { vacuum_ids.push(DroppedId::Db { db_id: db_info.database_id.db_id, - db_name: db_info.name_ident.database_name().to_string(), + db_name: db_name.clone(), }); } - vacuum_table_infos.extend(table_nivs.iter().take(capacity).map(|niv| { - Arc::new(TableInfo::new( - db_info.name_ident.database_name(), - &niv.name().table_name, - TableIdent::new(niv.id().table_id, niv.value().seq), - niv.value().data.clone(), - )) - })); + vacuum_tables.extend(std::iter::repeat(db_name_ident).zip(table_nivs)); } return Ok(ListDroppedTableResp { - drop_table_infos: vacuum_table_infos, + vacuum_tables, drop_ids: vacuum_ids, }); } @@ -2748,21 +2744,15 @@ impl + ?Sized> SchemaApi for KV { .await?; let mut drop_ids = vec![]; - let mut drop_table_infos = vec![]; + let mut vacuum_tables = vec![]; - for niv in table_nivs.iter() { + for niv in table_nivs { drop_ids.push(DroppedId::from(niv.clone())); - - drop_table_infos.push(Arc::new(TableInfo::new( - db_info.name_ident.database_name(), - &niv.name().table_name, - TableIdent::new(niv.id().table_id, niv.value().seq), - niv.value().data.clone(), - ))); + vacuum_tables.push((tenant_dbname.clone(), niv)); } Ok(ListDroppedTableResp { - drop_table_infos, + vacuum_tables, drop_ids, }) } diff --git a/src/meta/api/src/schema_api_test_suite.rs b/src/meta/api/src/schema_api_test_suite.rs index 9f6e253cd309..f9028f9fa012 100644 --- a/src/meta/api/src/schema_api_test_suite.rs +++ b/src/meta/api/src/schema_api_test_suite.rs @@ -4077,6 +4077,7 @@ impl SchemaApiTestSuite { // first create a database drop within filter time info!("--- create db1"); + let db1_id; { let db_name = DatabaseNameIdent::new(&tenant, "db1"); let req = CreateDatabaseReq { @@ -4086,7 +4087,7 @@ impl SchemaApiTestSuite { }; let res = mt.create_database(req).await?; - let db1_id = res.db_id.db_id; + db1_id = res.db_id; let req = CreateTableReq { create_option: CreateOption::Create, @@ -4103,21 +4104,22 @@ impl SchemaApiTestSuite { }) .await?; - drop_ids_boundary.push(DroppedId::new_table(db1_id, db1_tb1_id, "tb1")); + drop_ids_boundary.push(DroppedId::new_table(*db1_id, db1_tb1_id, "tb1")); drop_ids_boundary.push(DroppedId::Db { - db_id: db1_id, + db_id: *db1_id, db_name: db_name.database_name().to_string(), }); - drop_ids_no_boundary.push(DroppedId::new_table(db1_id, db1_tb1_id, "tb1")); + drop_ids_no_boundary.push(DroppedId::new_table(*db1_id, db1_tb1_id, "tb1")); drop_ids_no_boundary.push(DroppedId::Db { - db_id: db1_id, + db_id: *db1_id, db_name: db_name.database_name().to_string(), }); } // second create a database drop outof filter time, but has a table drop within filter time info!("--- create db2"); + let db2_id; { let create_db_req = CreateDatabaseReq { create_option: CreateOption::Create, @@ -4126,7 +4128,7 @@ impl SchemaApiTestSuite { }; let res = mt.create_database(create_db_req.clone()).await?; - let db2_id = res.db_id; + db2_id = res.db_id; drop_ids_no_boundary.push(DroppedId::Db { db_id: *db2_id, db_name: "db2".to_string(), @@ -4231,6 +4233,7 @@ impl SchemaApiTestSuite { } // third create a database not dropped, but has a table drop within filter time + let db3_id; { let create_db_req = CreateDatabaseReq { create_option: CreateOption::Create, @@ -4239,7 +4242,7 @@ impl SchemaApiTestSuite { }; let res = mt.create_database(create_db_req.clone()).await?; - let db_id = res.db_id; + db3_id = res.db_id; info!("--- create and drop db3.tb1"); { @@ -4250,12 +4253,12 @@ impl SchemaApiTestSuite { as_dropped: false, }; let resp = mt.create_table(req.clone()).await?; - 
drop_ids_boundary.push(DroppedId::new_table(*db_id, resp.table_id, "tb1")); - drop_ids_no_boundary.push(DroppedId::new_table(*db_id, resp.table_id, "tb1")); + drop_ids_boundary.push(DroppedId::new_table(*db3_id, resp.table_id, "tb1")); + drop_ids_no_boundary.push(DroppedId::new_table(*db3_id, resp.table_id, "tb1")); mt.drop_table_by_id(DropTableByIdReq { if_exists: false, tenant: req.name_ident.tenant.clone(), - db_id: *db_id, + db_id: *db3_id, table_name: req.name_ident.table_name.clone(), tb_id: resp.table_id, engine: "FUSE".to_string(), @@ -4274,11 +4277,11 @@ impl SchemaApiTestSuite { as_dropped: false, }; let resp = mt.create_table(req.clone()).await?; - drop_ids_no_boundary.push(DroppedId::new_table(*db_id, resp.table_id, "tb2")); + drop_ids_no_boundary.push(DroppedId::new_table(*db3_id, resp.table_id, "tb2")); mt.drop_table_by_id(DropTableByIdReq { if_exists: false, tenant: req.name_ident.tenant.clone(), - db_id: *db_id, + db_id: *db3_id, table_name: req.name_ident.table_name.clone(), tb_id: resp.table_id, engine: "FUSE".to_string(), @@ -4331,9 +4334,15 @@ impl SchemaApiTestSuite { .into_iter() .collect(); let actual: BTreeSet = resp - .drop_table_infos + .vacuum_tables .iter() - .map(|table_info| table_info.desc.clone()) + .map(|(db_name_ident, table_niv)| { + format!( + "'{}'.'{}'", + db_name_ident.database_name(), + &table_niv.name().table_name + ) + }) .collect(); assert_eq!(expected, actual); } @@ -4368,9 +4377,15 @@ impl SchemaApiTestSuite { .cloned() .collect(); let actual: BTreeSet = resp - .drop_table_infos + .vacuum_tables .iter() - .map(|table_info| table_info.desc.clone()) + .map(|(db_name_ident, table_niv)| { + format!( + "'{}'.'{}'", + db_name_ident.database_name(), + &table_niv.name().table_name + ) + }) .collect(); assert_eq!(expected, actual); } diff --git a/src/meta/app/src/schema/table.rs b/src/meta/app/src/schema/table.rs index a7a6e48fc8d1..6da8183345f8 100644 --- a/src/meta/app/src/schema/table.rs +++ b/src/meta/app/src/schema/table.rs @@ -130,6 +130,9 @@ impl DBIdTableName { table_name: table_name.to_string(), } } + pub fn display(&self) -> impl Display { + format!("{}.'{}'", self.db_id, self.table_name) + } } impl Display for DBIdTableName { @@ -342,6 +345,7 @@ impl TableInfo { } } + /// Deprecated: use `new_full()`. This method sets default values for some fields. pub fn new(db_name: &str, table_name: &str, ident: TableIdent, meta: TableMeta) -> TableInfo { TableInfo { ident, @@ -352,6 +356,24 @@ impl TableInfo { } } + pub fn new_full( + db_name: &str, + table_name: &str, + ident: TableIdent, + meta: TableMeta, + catalog_info: Arc, + db_type: DatabaseType, + ) -> TableInfo { + TableInfo { + ident, + desc: format!("'{}'.'{}'", db_name, table_name), + name: table_name.to_string(), + meta, + catalog_info, + db_type, + } + } + pub fn schema(&self) -> Arc { self.meta.schema.clone() } @@ -1021,7 +1043,8 @@ impl DroppedId { #[derive(Clone, Debug, PartialEq, Eq)] pub struct ListDroppedTableResp { - pub drop_table_infos: Vec>, + /// The **database_name, (name, id, value)** of a table to vacuum. 
+ pub vacuum_tables: Vec<(DatabaseNameIdent, TableNIV)>, pub drop_ids: Vec, } diff --git a/src/query/service/src/catalogs/default/mutable_catalog.rs b/src/query/service/src/catalogs/default/mutable_catalog.rs index 8c559f34915c..0e89fe51887a 100644 --- a/src/query/service/src/catalogs/default/mutable_catalog.rs +++ b/src/query/service/src/catalogs/default/mutable_catalog.rs @@ -92,6 +92,7 @@ use databend_common_meta_app::schema::RenameTableReply; use databend_common_meta_app::schema::RenameTableReq; use databend_common_meta_app::schema::SetTableColumnMaskPolicyReply; use databend_common_meta_app::schema::SetTableColumnMaskPolicyReq; +use databend_common_meta_app::schema::TableIdent; use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::schema::TableMeta; use databend_common_meta_app::schema::TruncateTableReply; @@ -524,13 +525,20 @@ impl Catalog for MutableCatalog { let resp = ctx.meta.get_drop_table_infos(req).await?; let drop_ids = resp.drop_ids.clone(); - let drop_table_infos = resp.drop_table_infos; let storage = ctx.storage_factory; let mut tables = vec![]; - for table_info in drop_table_infos { - tables.push(storage.get_table(table_info.as_ref())?); + for (db_name_ident, niv) in resp.vacuum_tables { + let table_info = TableInfo::new_full( + db_name_ident.database_name(), + &niv.name().table_name, + TableIdent::new(niv.id().table_id, niv.value().seq), + niv.value().data.clone(), + self.info(), + DatabaseType::NormalDB, + ); + tables.push(storage.get_table(&table_info)?); } Ok((tables, drop_ids)) }
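For reference, the `'db'.'tb'` descriptor strings that the updated tests build from the `(DatabaseNameIdent, TableNIV)` pairs match the quoting produced by `TableInfo::new_full`. A trivial standalone helper showing the exact format, for illustration only:

fn table_desc(db_name: &str, table_name: &str) -> String {
    // Same quoting convention as TableInfo::desc: 'db'.'table'.
    format!("'{}'.'{}'", db_name, table_name)
}

fn main() {
    assert_eq!(table_desc("db1", "tb1"), "'db1'.'tb1'");
}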