From 6473d8b3e27f9f41a4152d7632bbae548bcc6e96 Mon Sep 17 00:00:00 2001 From: Yang Xiufeng Date: Fri, 27 Sep 2024 16:46:52 +0800 Subject: [PATCH 01/14] feat: http handler add X-DATABEND-VERSION in each response. (#16518) * refactor: split middleware.rs. * small refactors. * not check type of databend_token when in management_mode. * http handler add X-DATABEND-VERSION in each response. * small refactor. * add license. * update tests. * test X-DATABEND-VERSION. * test X-DATABEND-VERSION. --- src/common/base/src/headers.rs | 1 + .../src/servers/http/middleware/metrics.rs | 70 +++++++++++ .../src/servers/http/middleware/mod.rs | 26 ++++ .../servers/http/middleware/panic_handler.rs | 36 ++++++ .../{middleware.rs => middleware/session.rs} | 116 +++++------------- .../servers/http/v1/http_query_handlers.rs | 2 +- src/query/service/src/servers/http/v1/mod.rs | 3 +- .../service/src/servers/http/v1/query/mod.rs | 1 + .../src/servers/http/v1/query/page_manager.rs | 4 +- .../http/v1/{ => query}/string_block.rs | 4 - .../it/servers/http/http_query_handlers.rs | 17 +-- .../tests/it/servers/http/json_block.rs | 2 +- 12 files changed, 179 insertions(+), 103 deletions(-) create mode 100644 src/query/service/src/servers/http/middleware/metrics.rs create mode 100644 src/query/service/src/servers/http/middleware/mod.rs create mode 100644 src/query/service/src/servers/http/middleware/panic_handler.rs rename src/query/service/src/servers/http/{middleware.rs => middleware/session.rs} (85%) rename src/query/service/src/servers/http/v1/{ => query}/string_block.rs (98%) diff --git a/src/common/base/src/headers.rs b/src/common/base/src/headers.rs index db18c883948c..823e28e9e0a1 100644 --- a/src/common/base/src/headers.rs +++ b/src/common/base/src/headers.rs @@ -25,6 +25,7 @@ pub const HEADER_NODE_ID: &str = "X-DATABEND-NODE-ID"; pub const HEADER_QUERY_STATE: &str = "X-DATABEND-QUERY-STATE"; pub const HEADER_QUERY_PAGE_ROWS: &str = "X-DATABEND-QUERY-PAGE-ROWS"; +pub const HEADER_VERSION: &str = "X-DATABEND-VERSION"; pub const HEADER_SIGNATURE: &str = "X-DATABEND-SIGNATURE"; pub const HEADER_AUTH_METHOD: &str = "X-DATABEND-AUTH-METHOD"; diff --git a/src/query/service/src/servers/http/middleware/metrics.rs b/src/query/service/src/servers/http/middleware/metrics.rs new file mode 100644 index 000000000000..60afddc44f06 --- /dev/null +++ b/src/query/service/src/servers/http/middleware/metrics.rs @@ -0,0 +1,70 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+use std::time::Instant;
+
+use databend_common_metrics::http::metrics_incr_http_request_count;
+use databend_common_metrics::http::metrics_incr_http_slow_request_count;
+use databend_common_metrics::http::metrics_observe_http_response_duration;
+use poem::Endpoint;
+use poem::IntoResponse;
+use poem::Middleware;
+use poem::Request;
+use poem::Response;
+
+pub struct MetricsMiddleware {
+    api: String,
+}
+
+impl MetricsMiddleware {
+    pub fn new(api: impl Into<String>) -> Self {
+        Self { api: api.into() }
+    }
+}
+
+impl<E: Endpoint> Middleware<E> for MetricsMiddleware {
+    type Output = MetricsMiddlewareEndpoint<E>;
+
+    fn transform(&self, ep: E) -> Self::Output {
+        MetricsMiddlewareEndpoint {
+            ep,
+            api: self.api.clone(),
+        }
+    }
+}
+
+pub struct MetricsMiddlewareEndpoint<E> {
+    api: String,
+    ep: E,
+}
+
+impl<E: Endpoint> Endpoint for MetricsMiddlewareEndpoint<E> {
+    type Output = Response;
+
+    async fn call(&self, req: Request) -> poem::error::Result<Self::Output> {
+        let start_time = Instant::now();
+        let method = req.method().to_string();
+        let output = self.ep.call(req).await?;
+        let resp = output.into_response();
+        let status_code = resp.status().to_string();
+        let duration = start_time.elapsed();
+        metrics_incr_http_request_count(method.clone(), self.api.clone(), status_code.clone());
+        metrics_observe_http_response_duration(method.clone(), self.api.clone(), duration);
+        if duration.as_secs_f64() > 60.0 {
+            // TODO: replace this into histogram
+            metrics_incr_http_slow_request_count(method, self.api.clone(), status_code);
+        }
+        Ok(resp)
+    }
+}
diff --git a/src/query/service/src/servers/http/middleware/mod.rs b/src/query/service/src/servers/http/middleware/mod.rs
new file mode 100644
index 000000000000..5da4f1c621b4
--- /dev/null
+++ b/src/query/service/src/servers/http/middleware/mod.rs
@@ -0,0 +1,26 @@
+// Copyright 2021 Datafuse Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+mod metrics;
+mod panic_handler;
+mod session;
+
+pub(crate) use metrics::MetricsMiddleware;
+pub(crate) use panic_handler::PanicHandler;
+pub use session::json_response;
+pub(crate) use session::sanitize_request_headers;
+pub use session::EndpointKind;
+// for it tests only
+pub use session::HTTPSessionEndpoint;
+pub use session::HTTPSessionMiddleware;
diff --git a/src/query/service/src/servers/http/middleware/panic_handler.rs b/src/query/service/src/servers/http/middleware/panic_handler.rs
new file mode 100644
index 000000000000..cbff37083ef8
--- /dev/null
+++ b/src/query/service/src/servers/http/middleware/panic_handler.rs
@@ -0,0 +1,36 @@
+// Copyright 2021 Datafuse Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use databend_common_metrics::http::metrics_incr_http_response_panics_count; +use http::StatusCode; + +#[derive(Clone, Debug)] +pub(crate) struct PanicHandler {} + +impl PanicHandler { + pub fn new() -> Self { + Self {} + } +} + +impl poem::middleware::PanicHandler for PanicHandler { + type Response = (StatusCode, &'static str); + + fn get_response(&self, _err: Box) -> Self::Response { + metrics_incr_http_response_panics_count(); + (StatusCode::INTERNAL_SERVER_ERROR, "internal server error") + } +} diff --git a/src/query/service/src/servers/http/middleware.rs b/src/query/service/src/servers/http/middleware/session.rs similarity index 85% rename from src/query/service/src/servers/http/middleware.rs rename to src/query/service/src/servers/http/middleware/session.rs index a0474310b346..3a95456f4f36 100644 --- a/src/query/service/src/servers/http/middleware.rs +++ b/src/query/service/src/servers/http/middleware/session.rs @@ -12,26 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::any::Any; use std::collections::HashMap; use std::sync::Arc; -use std::time::Instant; use databend_common_base::headers::HEADER_DEDUPLICATE_LABEL; use databend_common_base::headers::HEADER_NODE_ID; use databend_common_base::headers::HEADER_QUERY_ID; use databend_common_base::headers::HEADER_SESSION_ID; use databend_common_base::headers::HEADER_TENANT; +use databend_common_base::headers::HEADER_VERSION; use databend_common_base::runtime::ThreadTracker; use databend_common_config::GlobalConfig; +use databend_common_config::QUERY_SEMVER; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_meta_app::principal::user_token::TokenType; use databend_common_meta_app::tenant::Tenant; -use databend_common_metrics::http::metrics_incr_http_request_count; -use databend_common_metrics::http::metrics_incr_http_response_panics_count; -use databend_common_metrics::http::metrics_incr_http_slow_request_count; -use databend_common_metrics::http::metrics_observe_http_response_duration; use fastrace::func_name; use headers::authorization::Basic; use headers::authorization::Bearer; @@ -57,17 +53,18 @@ use poem::Request; use poem::Response; use uuid::Uuid; -use super::v1::HttpQueryContext; -use super::v1::SessionClaim; use crate::auth::AuthMgr; use crate::auth::Credential; use crate::servers::http::error::HttpErrorCode; use crate::servers::http::error::JsonErrorOnly; use crate::servers::http::error::QueryError; +use crate::servers::http::v1::HttpQueryContext; +use crate::servers::http::v1::SessionClaim; use crate::servers::HttpHandlerKind; use crate::sessions::SessionManager; use crate::sessions::SessionType; - +const USER_AGENT: &str = "User-Agent"; +const TRACE_PARENT: &str = "traceparent"; #[derive(Debug, Copy, Clone)] pub enum EndpointKind { Login, @@ -81,6 +78,7 @@ pub enum EndpointKind { } impl EndpointKind { + /// avoid the cost of get user from meta pub fn need_user_info(&self) -> bool { !matches!(self, EndpointKind::NoAuth | EndpointKind::PollQuery) } @@ -89,7 +87,11 @@ impl EndpointKind { EndpointKind::Verify => Ok(None), EndpointKind::Refresh => Ok(Some(TokenType::Refresh)), EndpointKind::StartQuery | EndpointKind::PollQuery | EndpointKind::Logout => { - Ok(Some(TokenType::Session)) + if GlobalConfig::instance().query.management_mode { + Ok(None) + } else { + 
Ok(Some(TokenType::Session)) + } } _ => Err(ErrorCode::AuthenticateFailure(format!( "should not use databend token for {self:?}", @@ -98,9 +100,6 @@ impl EndpointKind { } } -const USER_AGENT: &str = "User-Agent"; -const TRACE_PARENT: &str = "traceparent"; - pub struct HTTPSessionMiddleware { pub kind: HttpHandlerKind, pub endpoint_kind: EndpointKind, @@ -165,14 +164,14 @@ fn get_credential( let client_ip = get_client_ip(req); if std_auth_headers.is_empty() { if matches!(kind, HttpHandlerKind::Clickhouse) { - auth_clickhouse_name_password(req, client_ip) + get_clickhouse_name_password(req, client_ip) } else { Err(ErrorCode::AuthenticateFailure( "No authorization header detected", )) } } else { - auth_by_header(&std_auth_headers, client_ip, endpoint_kind) + get_credential_from_header(&std_auth_headers, client_ip, endpoint_kind) } } @@ -180,7 +179,7 @@ fn get_credential( /// not found, fallback to the remote address, which might be local proxy's ip address. /// please note that when it comes with network policy, we need make sure the incoming /// traffic comes from a trustworthy proxy instance. -pub fn get_client_ip(req: &Request) -> Option { +fn get_client_ip(req: &Request) -> Option { let headers = ["X-Real-IP", "X-Forwarded-For", "CF-Connecting-IP"]; for &header in headers.iter() { if let Some(value) = req.headers().get(header) { @@ -203,7 +202,7 @@ pub fn get_client_ip(req: &Request) -> Option { client_ip } -fn auth_by_header( +fn get_credential_from_header( std_auth_headers: &[&HeaderValue], client_ip: Option, endpoint_kind: EndpointKind, @@ -246,7 +245,7 @@ fn auth_by_header( } } -fn auth_clickhouse_name_password(req: &Request, client_ip: Option) -> Result { +fn get_clickhouse_name_password(req: &Request, client_ip: Option) -> Result { let (user, key) = ( req.headers().get("X-CLICKHOUSE-USER"), req.headers().get("X-CLICKHOUSE-KEY"), @@ -436,71 +435,8 @@ pub fn sanitize_request_headers(headers: &poem::http::HeaderMap) -> HashMap) -> Self { - Self { api: api.into() } - } -} - -impl Middleware for MetricsMiddleware { - type Output = MetricsMiddlewareEndpoint; - - fn transform(&self, ep: E) -> Self::Output { - MetricsMiddlewareEndpoint { - ep, - api: self.api.clone(), - } - } -} - -pub struct MetricsMiddlewareEndpoint { - api: String, - ep: E, -} - -impl Endpoint for MetricsMiddlewareEndpoint { - type Output = Response; - - async fn call(&self, req: Request) -> poem::error::Result { - let start_time = Instant::now(); - let method = req.method().to_string(); - let output = self.ep.call(req).await?; - let resp = output.into_response(); - let status_code = resp.status().to_string(); - let duration = start_time.elapsed(); - metrics_incr_http_request_count(method.clone(), self.api.clone(), status_code.clone()); - metrics_observe_http_response_duration(method.clone(), self.api.clone(), duration); - if duration.as_secs_f64() > 60.0 { - // TODO: replace this into histogram - metrics_incr_http_slow_request_count(method, self.api.clone(), status_code); - } - Ok(resp) - } -} - -#[derive(Clone, Debug)] -pub(crate) struct PanicHandler {} - -impl PanicHandler { - pub fn new() -> Self { - Self {} - } -} - -impl poem::middleware::PanicHandler for PanicHandler { - type Response = (StatusCode, &'static str); - - fn get_response(&self, _err: Box) -> Self::Response { - metrics_incr_http_response_panics_count(); - (StatusCode::INTERNAL_SERVER_ERROR, "internal server error") - } -} pub async fn json_response(next: E, req: Request) -> PoemResult { - let resp = match next.call(req).await { + let mut resp = match 
next.call(req).await { Ok(resp) => resp.into_response(), Err(err) => ( err.status(), @@ -514,5 +450,21 @@ pub async fn json_response(next: E, req: Request) -> PoemResult>>, } -pub type StringBlockRef = Arc; - fn data_is_null(column: &Column, row_index: usize) -> bool { match column { Column::Null { .. } => true, diff --git a/src/query/service/tests/it/servers/http/http_query_handlers.rs b/src/query/service/tests/it/servers/http/http_query_handlers.rs index 23d611edfbc4..2ebd78d7079d 100644 --- a/src/query/service/tests/it/servers/http/http_query_handlers.rs +++ b/src/query/service/tests/it/servers/http/http_query_handlers.rs @@ -21,15 +21,16 @@ use base64::engine::general_purpose; use base64::prelude::*; use databend_common_base::base::get_free_tcp_port; use databend_common_base::base::tokio; +use databend_common_base::headers::HEADER_VERSION; use databend_common_config::UserAuthConfig; use databend_common_config::UserConfig; +use databend_common_config::QUERY_SEMVER; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_meta_app::principal::PasswordHashMethod; use databend_common_users::CustomClaims; use databend_common_users::EnsureUser; use databend_query::servers::http::error::QueryError; -use databend_query::servers::http::middleware::get_client_ip; use databend_query::servers::http::middleware::json_response; use databend_query::servers::http::v1::make_page_uri; use databend_query::servers::http::v1::query_route; @@ -184,6 +185,10 @@ impl TestHttpQueryRequest { .await .map_err(|e| ErrorCode::Internal(e.to_string())) .unwrap(); + assert_eq!( + resp.header(HEADER_VERSION), + Some(QUERY_SEMVER.to_string().as_str()) + ); let status_code = resp.status(); let body = resp.into_body().into_string().await.unwrap(); @@ -1671,16 +1676,6 @@ async fn test_txn_timeout() -> Result<()> { Ok(()) } -#[test] -fn test_parse_ip() -> Result<()> { - let req = poem::Request::builder() - .header("X-Forwarded-For", "1.2.3.4") - .finish(); - let ip = get_client_ip(&req); - assert_eq!(ip, Some("1.2.3.4".to_string())); - Ok(()) -} - #[tokio::test(flavor = "current_thread")] async fn test_has_result_set() -> Result<()> { let _fixture = TestFixture::setup().await?; diff --git a/src/query/service/tests/it/servers/http/json_block.rs b/src/query/service/tests/it/servers/http/json_block.rs index 060728c634cb..0ca921a6ed66 100644 --- a/src/query/service/tests/it/servers/http/json_block.rs +++ b/src/query/service/tests/it/servers/http/json_block.rs @@ -23,7 +23,7 @@ use databend_common_expression::types::StringType; use databend_common_expression::DataBlock; use databend_common_expression::FromData; use databend_common_io::prelude::FormatSettings; -use databend_query::servers::http::v1::string_block::StringBlock; +use databend_query::servers::http::v1::StringBlock; use pretty_assertions::assert_eq; fn test_data_block(is_nullable: bool) -> Result<()> { From 3d11bd976b46290ffb287e979aa95d1e428e4cef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=82=8E=E6=B3=BC?= Date: Fri, 27 Sep 2024 21:07:19 +0800 Subject: [PATCH 02/14] refactor: dropped table listing and GC (#16531) - Replace `TableInfoFilter` with `Range>>` for more flexible filtering - Simplify `ListDroppedTableReq` struct and add builder methods - Update schema_api_impl and tests to use new ListDroppedTableReq API - Adjust vacuum logic in interpreter_vacuum_drop_tables to use new API - Minor code cleanup and documentation improvements --- src/meta/api/src/schema_api_impl.rs | 94 ++++++++++--------- 
src/meta/api/src/schema_api_test_suite.rs | 46 +++------ src/meta/app/src/schema/mod.rs | 1 - src/meta/app/src/schema/table.rs | 91 ++++++++++++++---- .../interpreter_vacuum_drop_tables.rs | 19 ++-- 5 files changed, 143 insertions(+), 108 deletions(-) diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index 8c0991aade57..e3b482986866 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -19,6 +19,7 @@ use std::collections::HashMap; use std::collections::HashSet; use std::convert::Infallible; use std::fmt::Display; +use std::ops::Range; use std::sync::Arc; use std::time::Duration; @@ -140,7 +141,6 @@ use databend_common_meta_app::schema::TableIdToName; use databend_common_meta_app::schema::TableIdent; use databend_common_meta_app::schema::TableIndex; use databend_common_meta_app::schema::TableInfo; -use databend_common_meta_app::schema::TableInfoFilter; use databend_common_meta_app::schema::TableMeta; use databend_common_meta_app::schema::TableNameIdent; use databend_common_meta_app::schema::TruncateTableReply; @@ -2644,11 +2644,13 @@ impl + ?Sized> SchemaApi for KV { let the_limit = req.limit.unwrap_or(usize::MAX); - if let TableInfoFilter::DroppedTableOrDroppedDatabase(retention_boundary) = &req.filter { + let drop_time_range = req.drop_time_range; + + if req.database_name.is_none() { let db_infos = self .get_tenant_history_databases( ListDatabaseReq { - tenant: req.inner.tenant().clone(), + tenant: req.tenant.clone(), }, true, ) @@ -2665,24 +2667,17 @@ impl + ?Sized> SchemaApi for KV { }); } - // If boundary is None, it means choose all tables. - // Thus, we just choose a very large time. - let boundary = retention_boundary.unwrap_or(DateTime::::MAX_UTC); - - let vacuum_db = { - let drop_on = db_info.meta.drop_on; - drop_on.is_some() && drop_on <= Some(boundary) - }; + let vacuum_db = drop_time_range.contains(&db_info.meta.drop_on); // If to vacuum a db, just vacuum all tables. // Otherwise, choose only dropped tables(before retention time). 
- let filter = if vacuum_db { - TableInfoFilter::All + let table_drop_time_range = if vacuum_db { + None..Some(DateTime::::MAX_UTC) } else { - TableInfoFilter::DroppedTables(*retention_boundary) + drop_time_range.clone() }; - let db_filter = (filter, db_info.clone()); + let db_filter = (table_drop_time_range, db_info.clone()); let capacity = the_limit - vacuum_table_infos.len(); let table_infos = do_get_table_history(self, db_filter, capacity).await?; @@ -2723,12 +2718,13 @@ impl + ?Sized> SchemaApi for KV { }); } - let tenant_dbname = &req.inner; + let database_name = req.database_name.clone().unwrap(); + let tenant_dbname = DatabaseNameIdent::new(&req.tenant, database_name); // Get db by name to ensure presence let res = get_db_or_err( self, - tenant_dbname, + &tenant_dbname, format!("get_table_history: {}", tenant_dbname.display()), ) .await; @@ -2742,10 +2738,10 @@ impl + ?Sized> SchemaApi for KV { let db_info = Arc::new(DatabaseInfo { database_id: seq_db_id.data, - name_ident: req.inner.clone(), + name_ident: tenant_dbname.clone(), meta: db_meta, }); - let db_filter = (req.filter, db_info); + let db_filter = (drop_time_range.clone(), db_info); let table_infos = do_get_table_history(self, db_filter, the_limit).await?; let mut drop_ids = vec![]; let mut drop_table_infos = vec![]; @@ -3531,11 +3527,17 @@ fn build_upsert_table_deduplicated_label(deduplicated_label: String) -> TxnOp { ) } +#[allow(clippy::type_complexity)] #[logcall::logcall(input = "")] #[fastrace::trace] async fn batch_filter_table_info( kv_api: &(impl kvapi::KVApi + ?Sized), - args: &[(&TableInfoFilter, &Arc, u64, String)], + args: &[( + Range>>, + &Arc, + u64, + String, + )], filter_tb_infos: &mut Vec<(Arc, u64)>, ) -> Result<(), KVAppError> { let table_id_idents = args @@ -3544,9 +3546,7 @@ async fn batch_filter_table_info( let seq_metas = kv_api.get_pb_values_vec(table_id_idents).await?; - for (seq_meta, (filter, db_info, table_id, table_name)) in - seq_metas.into_iter().zip(args.iter()) - { + for (seq_meta, (rng, db_info, table_id, table_name)) in seq_metas.into_iter().zip(args.iter()) { let Some(seq_meta) = seq_meta else { error!( "batch_filter_table_info cannot find {:?} table_meta", @@ -3555,14 +3555,8 @@ async fn batch_filter_table_info( continue; }; - if let TableInfoFilter::DroppedTables(retention_boundary) = filter { - let Some(meta_drop_on) = seq_meta.drop_on else { - continue; - }; - - if meta_drop_on > retention_boundary.unwrap_or(DateTime::::MAX_UTC) { - continue; - } + if !rng.contains(&seq_meta.data.drop_on) { + continue; } let tb_info = TableInfo { @@ -3583,7 +3577,12 @@ async fn batch_filter_table_info( Ok(()) } -type TableFilterInfoList<'a> = Vec<(&'a TableInfoFilter, &'a Arc, u64, String)>; +type TableFilterInfoList<'a> = Vec<( + Range>>, + &'a Arc, + u64, + String, +)>; #[logcall::logcall(input = "")] #[fastrace::trace] @@ -3607,19 +3606,15 @@ async fn get_gc_table_info( #[fastrace::trace] async fn do_get_table_history( kv_api: &(impl kvapi::KVApi + ?Sized), - db_filter: (TableInfoFilter, Arc), + db_filter: (Range>>, Arc), limit: usize, ) -> Result, u64)>, KVAppError> { let mut filter_tb_infos = vec![]; // step 1: list db table name with db id - let mut filter_db_info_with_table_id_key_list: Vec<( - &TableInfoFilter, - &Arc, - TableIdHistoryIdent, - )> = vec![]; + let mut filter_db_info_with_table_id_key_list: Vec<_> = vec![]; - let (filter, db_info) = db_filter; + let (drop_time_range, db_info) = db_filter; let db_id = db_info.database_id.db_id; // List tables by tenant, db_id, table_name. 
@@ -3633,7 +3628,7 @@ async fn do_get_table_history( let keys = table_id_list_keys .iter() - .map(|table_id_list_key| (&filter, &db_info, table_id_list_key.clone())) + .map(|table_id_list_key| (drop_time_range.clone(), &db_info, table_id_list_key.clone())) .collect::>(); filter_db_info_with_table_id_key_list.extend(keys); @@ -3660,10 +3655,17 @@ async fn do_get_table_history( let (filter, db_info, table_id_list_key) = table_id_list_keys_iter.next().unwrap(); let tb_id_list = seq_table_id_list.data; - let id_list: Vec<(&TableInfoFilter, &Arc, u64, String)> = tb_id_list + let id_list: Vec<_> = tb_id_list .id_list .iter() - .map(|id| (filter, db_info, *id, table_id_list_key.table_name.clone())) + .map(|id| { + ( + filter.clone(), + db_info, + *id, + table_id_list_key.table_name.clone(), + ) + }) .collect(); filter_db_info_with_table_id_list.extend(id_list); @@ -3694,6 +3696,10 @@ async fn do_get_table_history( Ok(filter_tb_infos) } +/// Permanently remove a dropped database from the meta-service. +/// +/// Upon calling this method, the dropped database must be already marked as `gc_in_progress`, +/// then remove all **dropped and non-dropped** tables in the database. async fn gc_dropped_db_by_id( kv_api: &(impl kvapi::KVApi + ?Sized), db_id: u64, @@ -3892,6 +3898,10 @@ async fn update_txn_to_remove_table_history( Ok(()) } +/// Update TxnRequest to remove a dropped table's own data. +/// +/// This function returns the updated TxnRequest, +/// or Err of the reason in string if it can not proceed. async fn remove_data_for_dropped_table( kv_api: &(impl kvapi::KVApi + ?Sized), table_id: &TableId, diff --git a/src/meta/api/src/schema_api_test_suite.rs b/src/meta/api/src/schema_api_test_suite.rs index cc87b3e3a707..e17b6b752288 100644 --- a/src/meta/api/src/schema_api_test_suite.rs +++ b/src/meta/api/src/schema_api_test_suite.rs @@ -110,7 +110,6 @@ use databend_common_meta_app::schema::TableIdList; use databend_common_meta_app::schema::TableIdToName; use databend_common_meta_app::schema::TableIdent; use databend_common_meta_app::schema::TableInfo; -use databend_common_meta_app::schema::TableInfoFilter; use databend_common_meta_app::schema::TableMeta; use databend_common_meta_app::schema::TableNameIdent; use databend_common_meta_app::schema::TableStatistics; @@ -3472,11 +3471,7 @@ impl SchemaApiTestSuite { assert_eq!(id_list.len(), 2); { - let req = ListDroppedTableReq { - inner: DatabaseNameIdent::new(&tenant, ""), - filter: TableInfoFilter::DroppedTableOrDroppedDatabase(None), - limit: None, - }; + let req = ListDroppedTableReq::new(&tenant); let resp = mt.get_drop_table_infos(req).await?; let req = GcDroppedTableReq { @@ -3682,11 +3677,7 @@ impl SchemaApiTestSuite { // gc the drop tables { - let req = ListDroppedTableReq { - inner: DatabaseNameIdent::new(&tenant, ""), - filter: TableInfoFilter::DroppedTableOrDroppedDatabase(None), - limit: None, - }; + let req = ListDroppedTableReq::new(&tenant); let resp = mt.get_drop_table_infos(req).await?; let req = GcDroppedTableReq { @@ -3879,11 +3870,7 @@ impl SchemaApiTestSuite { // gc the data { - let req = ListDroppedTableReq { - inner: DatabaseNameIdent::new(&tenant, ""), - filter: TableInfoFilter::DroppedTableOrDroppedDatabase(None), - limit: None, - }; + let req = ListDroppedTableReq::new(&tenant); let resp = mt.get_drop_table_infos(req).await?; let req = GcDroppedTableReq { @@ -4352,11 +4339,7 @@ impl SchemaApiTestSuite { // case 1: test AllDroppedTables with filter time { let now = Utc::now(); - let req = ListDroppedTableReq { - inner: 
DatabaseNameIdent::new(&tenant, ""), - filter: TableInfoFilter::DroppedTableOrDroppedDatabase(Some(now)), - limit: None, - }; + let req = ListDroppedTableReq::new(&tenant).with_retention_boundary(now); let resp = mt.get_drop_table_infos(req).await?; let got = resp @@ -4390,11 +4373,7 @@ impl SchemaApiTestSuite { // case 2: test AllDroppedTables without filter time { - let req = ListDroppedTableReq { - inner: DatabaseNameIdent::new(&tenant, ""), - filter: TableInfoFilter::DroppedTableOrDroppedDatabase(None), - limit: None, - }; + let req = ListDroppedTableReq::new(&tenant); let resp = mt.get_drop_table_infos(req).await?; let got = resp @@ -4609,10 +4588,11 @@ impl SchemaApiTestSuite { ), ]; for (limit, number, drop_ids) in limit_and_drop_ids { - let req = ListDroppedTableReq { - inner: DatabaseNameIdent::new(&tenant, ""), - filter: TableInfoFilter::DroppedTableOrDroppedDatabase(None), - limit, + let req = ListDroppedTableReq::new(&tenant); + let req = if let Some(limit) = limit { + req.with_limit(limit) + } else { + req }; let resp = mt.get_drop_table_infos(req).await?; assert_eq!(resp.drop_ids.len(), number); @@ -5290,11 +5270,7 @@ impl SchemaApiTestSuite { assert!(seqv.is_some() && seqv.unwrap().seq != 0); // vacuum drop table - let req = ListDroppedTableReq { - inner: DatabaseNameIdent::new(&tenant, ""), - filter: TableInfoFilter::DroppedTableOrDroppedDatabase(None), - limit: None, - }; + let req = ListDroppedTableReq::new(&tenant); let resp = mt.get_drop_table_infos(req).await?; assert!(!resp.drop_ids.is_empty()); diff --git a/src/meta/app/src/schema/mod.rs b/src/meta/app/src/schema/mod.rs index 095f8e67f43b..e7e12d9e7bce 100644 --- a/src/meta/app/src/schema/mod.rs +++ b/src/meta/app/src/schema/mod.rs @@ -115,7 +115,6 @@ pub use table::TableIdToName; pub use table::TableIdent; pub use table::TableIndex; pub use table::TableInfo; -pub use table::TableInfoFilter; pub use table::TableMeta; pub use table::TableNameIdent; pub use table::TableStatistics; diff --git a/src/meta/app/src/schema/table.rs b/src/meta/app/src/schema/table.rs index 9a0384a9f5ec..7542b8177ddd 100644 --- a/src/meta/app/src/schema/table.rs +++ b/src/meta/app/src/schema/table.rs @@ -19,6 +19,7 @@ use std::fmt; use std::fmt::Display; use std::fmt::Formatter; use std::ops::Deref; +use std::ops::Range; use std::sync::Arc; use std::time::Duration; @@ -906,31 +907,81 @@ impl ListTableReq { } #[derive(Clone, Debug, PartialEq, Eq)] -pub enum TableInfoFilter { - /// Choose only dropped tables. - /// - /// If the arg `retention_boundary` time is Some, choose only tables dropped before this boundary time. - DroppedTables(Option>), - /// Choose dropped table or all table in dropped databases. - /// - /// In this case, `ListTableReq`.db_name will be ignored. - /// - /// If the `retention_boundary` time is Some, - /// choose the table dropped before this time - /// or choose the database before this time. - DroppedTableOrDroppedDatabase(Option>), +pub struct ListDroppedTableReq { + pub tenant: Tenant, - /// return all tables, ignore drop on time. - All, -} + /// If `database_name` is None, choose all tables in all databases. + /// Otherwise, choose only tables in this database. + pub database_name: Option, + + /// The time range in which the database/table will be returned. + /// choose only tables/databases dropped before this boundary time. 
+ /// It can include non-dropped tables/databases with `None..Some()` + pub drop_time_range: Range>>, -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct ListDroppedTableReq { - pub inner: DatabaseNameIdent, - pub filter: TableInfoFilter, pub limit: Option, } +impl ListDroppedTableReq { + pub fn new(tenant: &Tenant) -> ListDroppedTableReq { + let rng_start = Some(DateTime::::MIN_UTC); + let rng_end = Some(DateTime::::MAX_UTC); + ListDroppedTableReq { + tenant: tenant.clone(), + database_name: None, + drop_time_range: rng_start..rng_end, + limit: None, + } + } + + pub fn with_db(self, db_name: impl ToString) -> Self { + Self { + database_name: Some(db_name.to_string()), + ..self + } + } + + pub fn with_retention_boundary(self, d: DateTime) -> Self { + let rng_start = Some(DateTime::::MIN_UTC); + let rng_end = Some(d); + Self { + drop_time_range: rng_start..rng_end, + ..self + } + } + + pub fn with_limit(self, limit: usize) -> Self { + Self { + limit: Some(limit), + ..self + } + } + + pub fn new4( + tenant: &Tenant, + database_name: Option, + retention_boundary: Option>, + limit: Option, + ) -> ListDroppedTableReq { + let rng_start = Some(DateTime::::MIN_UTC); + let rng_end = if let Some(b) = retention_boundary { + Some(b) + } else { + Some(DateTime::::MAX_UTC) + }; + ListDroppedTableReq { + tenant: tenant.clone(), + database_name: database_name.map(|s| s.to_string()), + drop_time_range: rng_start..rng_end, + limit, + } + } + + pub fn database_name(&self) -> Option<&str> { + self.database_name.as_deref() + } +} + #[derive(Clone, Debug, PartialEq, Eq)] pub enum DroppedId { Db { diff --git a/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs b/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs index cb7c4d768662..d1b87942fb07 100644 --- a/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs +++ b/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs @@ -25,11 +25,9 @@ use databend_common_expression::DataBlock; use databend_common_expression::FromData; use databend_common_license::license::Feature::Vacuum; use databend_common_license::license_manager::LicenseManagerSwitch; -use databend_common_meta_app::schema::database_name_ident::DatabaseNameIdent; use databend_common_meta_app::schema::DroppedId; use databend_common_meta_app::schema::GcDroppedTableReq; use databend_common_meta_app::schema::ListDroppedTableReq; -use databend_common_meta_app::schema::TableInfoFilter; use databend_common_sql::plans::VacuumDropTablePlan; use databend_enterprise_vacuum_handler::get_vacuum_handler; use log::info; @@ -125,19 +123,20 @@ impl Interpreter for VacuumDropTablesInterpreter { self.plan.database, retention_time ); // if database if empty, vacuum all tables - let filter = if self.plan.database.is_empty() { - TableInfoFilter::DroppedTableOrDroppedDatabase(Some(retention_time)) + let database_name = if self.plan.database.is_empty() { + None } else { - TableInfoFilter::DroppedTables(Some(retention_time)) + Some(self.plan.database.clone()) }; let tenant = self.ctx.get_tenant(); let (tables, drop_ids) = catalog - .get_drop_table_infos(ListDroppedTableReq { - inner: DatabaseNameIdent::new(&tenant, &self.plan.database), - filter, - limit: self.plan.option.limit, - }) + .get_drop_table_infos(ListDroppedTableReq::new4( + &tenant, + database_name, + Some(retention_time), + self.plan.option.limit, + )) .await?; info!( From 0189b016a28cb8a30d933160ab16167596077a48 Mon Sep 17 00:00:00 2001 From: Jk Xu <54522439+Dousir9@users.noreply.github.com> Date: Sat, 
28 Sep 2024 12:19:24 +0800 Subject: [PATCH 03/14] fix(query): fix column leaf_index (#16537) * chore(query): fix column leaf_index * chore(test): update sqllogictest * chore(binder): refine code * chore(binder): fix tuple inner_field * chore(binder): fix tuple inner column_id * chore(test): update test * chore(binder): fix TUPLE inner_column_id * chore(test): update sqllogictest * chore(test): update sqllogictest --- .../sql/src/planner/expression_parser.rs | 2 +- src/query/sql/src/planner/metadata.rs | 34 +++++----- .../statistics/collect_statistics.rs | 11 ++-- .../base/05_ddl/05_0003_ddl_alter_table.test | 24 +++++++ .../mode/standalone/explain/bloom_filter.test | 8 +-- .../mode/standalone/explain/explain.test | 64 +++++++++++++++++++ .../explain/prewhere_optimization.test | 6 +- .../explain_native/bloom_filter.test | 2 +- .../standalone/explain_native/explain.test | 52 +++++++++++++++ .../explain_native/prewhere_optimization.test | 2 +- 10 files changed, 172 insertions(+), 33 deletions(-) diff --git a/src/query/sql/src/planner/expression_parser.rs b/src/query/sql/src/planner/expression_parser.rs index eec13e6968b8..e6d7b20b87ac 100644 --- a/src/query/sql/src/planner/expression_parser.rs +++ b/src/query/sql/src/planner/expression_parser.rs @@ -297,7 +297,7 @@ pub fn parse_computed_expr_to_string( field.data_type().clone(), 0, None, - None, + Some(field.column_id), None, None, ); diff --git a/src/query/sql/src/planner/metadata.rs b/src/query/sql/src/planner/metadata.rs index 6bd121374f96..f42e6d5477f8 100644 --- a/src/query/sql/src/planner/metadata.rs +++ b/src/query/sql/src/planner/metadata.rs @@ -229,7 +229,7 @@ impl Metadata { data_type: TableDataType, table_index: IndexType, path_indices: Option>, - leaf_index: Option, + column_id: Option, column_position: Option, virtual_computed_expr: Option, ) -> IndexType { @@ -241,7 +241,7 @@ impl Metadata { column_index, table_index, path_indices, - leaf_index, + column_id, virtual_computed_expr, }); self.columns.push(column_entry); @@ -370,8 +370,6 @@ impl Metadata { } } - // build leaf index in DFS order for primitive columns. 
- let mut leaf_index = 0; while let Some((indices, field)) = fields.pop_front() { if indices.is_empty() { self.add_base_table_column( @@ -379,7 +377,7 @@ impl Metadata { field.data_type().clone(), table_index, None, - None, + Some(field.column_id), None, Some(field.computed_expr().unwrap().expr().clone()), ); @@ -402,25 +400,29 @@ impl Metadata { field.data_type().clone(), table_index, path_indices, - None, + Some(field.column_id), None, None, ); - let mut i = fields_type.len(); - for (inner_field_name, inner_field_type) in - fields_name.iter().zip(fields_type.iter()).rev() + let mut inner_column_id = field.column_id; + for (index, (inner_field_name, inner_field_type)) in + fields_name.iter().zip(fields_type.iter()).enumerate() { - i -= 1; let mut inner_indices = indices.clone(); - inner_indices.push(i); + inner_indices.push(index); // create tuple inner field let inner_name = format!( "{}:{}", field.name(), display_tuple_field_name(inner_field_name) ); - let inner_field = TableField::new(&inner_name, inner_field_type.clone()); + let inner_field = TableField::new_from_column_id( + &inner_name, + inner_field_type.clone(), + inner_column_id, + ); + inner_column_id += inner_field_type.num_leaf_columns() as u32; fields.push_front((inner_indices, inner_field)); } } else { @@ -429,11 +431,10 @@ impl Metadata { field.data_type().clone(), table_index, path_indices, - Some(leaf_index), + Some(field.column_id), Some(indices[0] + 1), None, ); - leaf_index += 1; } } @@ -573,9 +574,8 @@ pub struct BaseTableColumn { /// Path indices for inner column of struct data type. pub path_indices: Option>, - /// Leaf index is the primitive column index in Parquet, constructed in DFS order. - /// None if the data type of column is struct. - pub leaf_index: Option, + /// The column id in table schema. + pub column_id: Option, /// Virtual computed expression, generated in query. pub virtual_computed_expr: Option, } diff --git a/src/query/sql/src/planner/optimizer/statistics/collect_statistics.rs b/src/query/sql/src/planner/optimizer/statistics/collect_statistics.rs index 9f04d669d98e..836cf19aa405 100644 --- a/src/query/sql/src/planner/optimizer/statistics/collect_statistics.rs +++ b/src/query/sql/src/planner/optimizer/statistics/collect_statistics.rs @@ -80,19 +80,18 @@ impl CollectStatisticsOptimizer { for column in columns.iter() { if let ColumnEntry::BaseTableColumn(BaseTableColumn { column_index, - path_indices, - leaf_index, + column_id, virtual_computed_expr, .. 
}) = column { - if path_indices.is_none() && virtual_computed_expr.is_none() { - if let Some(col_id) = *leaf_index { + if virtual_computed_expr.is_none() { + if let Some(column_id) = *column_id { let col_stat = column_statistics_provider - .column_statistics(col_id as ColumnId); + .column_statistics(column_id as ColumnId); column_stats.insert(*column_index, col_stat.cloned()); let histogram = - column_statistics_provider.histogram(col_id as ColumnId); + column_statistics_provider.histogram(column_id as ColumnId); histograms.insert(*column_index, histogram); } } diff --git a/tests/sqllogictests/suites/base/05_ddl/05_0003_ddl_alter_table.test b/tests/sqllogictests/suites/base/05_ddl/05_0003_ddl_alter_table.test index 7d1003527337..1287b596f930 100644 --- a/tests/sqllogictests/suites/base/05_ddl/05_0003_ddl_alter_table.test +++ b/tests/sqllogictests/suites/base/05_ddl/05_0003_ddl_alter_table.test @@ -255,3 +255,27 @@ DROP TABLE IF EXISTS t; statement ok DROP TABLE IF EXISTS t1; + +statement ok +CREATE OR REPLACE TABLE t1 (id VARCHAR NULL, size VARCHAR NULL, create_time int NULL, path VARCHAR NULL); + +statement ok +CREATE OR REPLACE TABLE t1_random like t1 Engine = Random; + +statement ok +INSERT INTO t1 SELECT * FROM t1_random LIMIT 10; + +statement ok +ALTER TABLE t1 DROP COLUMN size; + +statement ok +ALTER TABLE t1 DROP COLUMN path; + +statement ok +SELECT * FROM t1 WHERE create_time > 111113; + +statement ok +DROP TABLE IF EXISTS t1; + +statement ok +DROP TABLE IF EXISTS t1_random; diff --git a/tests/sqllogictests/suites/mode/standalone/explain/bloom_filter.test b/tests/sqllogictests/suites/mode/standalone/explain/bloom_filter.test index 9582dc645922..18f12f481efa 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/bloom_filter.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/bloom_filter.test @@ -128,11 +128,11 @@ explain select 1 from bloom_test_t where c2=3; EvalScalar ├── output columns: [1 (#3)] ├── expressions: [1] -├── estimated rows: 1.00 +├── estimated rows: 2.67 └── Filter ├── output columns: [] ├── filters: [is_true(bloom_test_t.c2 (#1) = 3)] - ├── estimated rows: 1.00 + ├── estimated rows: 2.67 └── TableScan ├── table: default.default.bloom_test_t ├── output columns: [c2 (#1)] @@ -150,11 +150,11 @@ explain select 1 from bloom_test_t where c3=12345; EvalScalar ├── output columns: [1 (#3)] ├── expressions: [1] -├── estimated rows: 0.00 +├── estimated rows: 1.00 └── Filter ├── output columns: [] ├── filters: [is_true(bloom_test_t.c3 (#2) = 12345)] - ├── estimated rows: 0.00 + ├── estimated rows: 1.00 └── TableScan ├── table: default.default.bloom_test_t ├── output columns: [c3 (#2)] diff --git a/tests/sqllogictests/suites/mode/standalone/explain/explain.test b/tests/sqllogictests/suites/mode/standalone/explain/explain.test index fdc502f01325..9f19dc078761 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/explain.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/explain.test @@ -1763,3 +1763,67 @@ INSERT OVERWRITE ALL INTO cat.db1.t1 VALUES (order_id, 'PriorityHandling') INTO statement ok drop table orders_placed + +# Test Tuple Statistics +statement ok +CREATE OR REPLACE TABLE t(a TUPLE(INT, INT)); + +statement ok +INSERT INTO t VALUES((1, 2)), ((3, 4)); + +query T +EXPLAIN SELECT * FROM t WHERE a.1 > 0; +---- +Filter +├── output columns: [t.a (#0)] +├── filters: [is_true(t.a:"1" (#2) > 0)] +├── estimated rows: 2.00 +└── TableScan + ├── table: default.default.t + ├── output columns: [a (#0), a:"1" (#2)] + ├── read rows: 2 + ├── 
read size: < 1 KiB + ├── partitions total: 1 + ├── partitions scanned: 1 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [is_true(t.a:"1" (#2) > 0)], limit: NONE] + └── estimated rows: 2.00 + +query T +EXPLAIN SELECT * FROM t WHERE a.1 > 1; +---- +Filter +├── output columns: [t.a (#0)] +├── filters: [is_true(t.a:"1" (#2) > 1)] +├── estimated rows: 1.00 +└── TableScan + ├── table: default.default.t + ├── output columns: [a (#0), a:"1" (#2)] + ├── read rows: 2 + ├── read size: < 1 KiB + ├── partitions total: 1 + ├── partitions scanned: 1 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [is_true(t.a:"1" (#2) > 1)], limit: NONE] + └── estimated rows: 2.00 + +query T +EXPLAIN SELECT * FROM t WHERE a.2 > 1; +---- +Filter +├── output columns: [t.a (#0)] +├── filters: [is_true(t.a:"2" (#1) > 1)] +├── estimated rows: 2.00 +└── TableScan + ├── table: default.default.t + ├── output columns: [a (#0), a:"2" (#1)] + ├── read rows: 2 + ├── read size: < 1 KiB + ├── partitions total: 1 + ├── partitions scanned: 1 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [is_true(t.a:"2" (#1) > 1)], limit: NONE] + └── estimated rows: 2.00 + +statement ok +DROP TABLE IF EXISTS t; diff --git a/tests/sqllogictests/suites/mode/standalone/explain/prewhere_optimization.test b/tests/sqllogictests/suites/mode/standalone/explain/prewhere_optimization.test index 619814786a02..7540375271ee 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/prewhere_optimization.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/prewhere_optimization.test @@ -117,16 +117,16 @@ explain select * from t_where_optimizer where s:a > 0 ---- Filter ├── output columns: [t_where_optimizer.id (#0), t_where_optimizer.s (#1)] -├── filters: [t_where_optimizer.s:a (#2) > 0] +├── filters: [t_where_optimizer.s:a (#3) > 0] ├── estimated rows: 0.00 └── TableScan ├── table: default.default.t_where_optimizer - ├── output columns: [id (#0), s (#1), s:a (#2)] + ├── output columns: [id (#0), s (#1), s:a (#3)] ├── read rows: 0 ├── read size: 0 ├── partitions total: 0 ├── partitions scanned: 0 - ├── push downs: [filters: [t_where_optimizer.s:a (#2) > 0], limit: NONE] + ├── push downs: [filters: [t_where_optimizer.s:a (#3) > 0], limit: NONE] └── estimated rows: 0.00 statement ok diff --git a/tests/sqllogictests/suites/mode/standalone/explain_native/bloom_filter.test b/tests/sqllogictests/suites/mode/standalone/explain_native/bloom_filter.test index 7b0af1b11a56..6e4db9430542 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain_native/bloom_filter.test +++ b/tests/sqllogictests/suites/mode/standalone/explain_native/bloom_filter.test @@ -115,7 +115,7 @@ TableScan ├── partitions scanned: 0 ├── pruning stats: [segments: , blocks: ] ├── push downs: [filters: [is_true(bloom_test_t.c2 (#1) = 3)], limit: NONE] -└── estimated rows: 1.00 +└── estimated rows: 2.67 statement ok drop table bloom_test_t diff --git a/tests/sqllogictests/suites/mode/standalone/explain_native/explain.test b/tests/sqllogictests/suites/mode/standalone/explain_native/explain.test index e2cbebda7b86..8f93ac474229 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain_native/explain.test +++ b/tests/sqllogictests/suites/mode/standalone/explain_native/explain.test @@ -996,3 +996,55 @@ drop table t2; statement ok drop table t3; + +# Test Tuple Statistics +statement ok +CREATE OR REPLACE TABLE t(a TUPLE(INT, INT)); + +statement ok +INSERT INTO t VALUES((1, 2)), ((3, 4)); + +query T +EXPLAIN SELECT * 
FROM t WHERE a.1 > 0; +---- +TableScan +├── table: default.default.t +├── output columns: [a (#0)] +├── read rows: 2 +├── read size: < 1 KiB +├── partitions total: 1 +├── partitions scanned: 1 +├── pruning stats: [segments: , blocks: ] +├── push downs: [filters: [is_true(t.a:"1" (#2) > 0)], limit: NONE] +└── estimated rows: 2.00 + +query T +EXPLAIN SELECT * FROM t WHERE a.1 > 1; +---- +TableScan +├── table: default.default.t +├── output columns: [a (#0)] +├── read rows: 2 +├── read size: < 1 KiB +├── partitions total: 1 +├── partitions scanned: 1 +├── pruning stats: [segments: , blocks: ] +├── push downs: [filters: [is_true(t.a:"1" (#2) > 1)], limit: NONE] +└── estimated rows: 1.00 + +query T +EXPLAIN SELECT * FROM t WHERE a.2 > 1; +---- +TableScan +├── table: default.default.t +├── output columns: [a (#0)] +├── read rows: 2 +├── read size: < 1 KiB +├── partitions total: 1 +├── partitions scanned: 1 +├── pruning stats: [segments: , blocks: ] +├── push downs: [filters: [is_true(t.a:"2" (#1) > 1)], limit: NONE] +└── estimated rows: 2.00 + +statement ok +DROP TABLE IF EXISTS t; diff --git a/tests/sqllogictests/suites/mode/standalone/explain_native/prewhere_optimization.test b/tests/sqllogictests/suites/mode/standalone/explain_native/prewhere_optimization.test index c4b626237f79..4eff69639641 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain_native/prewhere_optimization.test +++ b/tests/sqllogictests/suites/mode/standalone/explain_native/prewhere_optimization.test @@ -118,7 +118,7 @@ TableScan ├── read size: 0 ├── partitions total: 0 ├── partitions scanned: 0 -├── push downs: [filters: [t_where_optimizer.s:a (#2) > 0], limit: NONE] +├── push downs: [filters: [t_where_optimizer.s:a (#3) > 0], limit: NONE] └── estimated rows: 0.00 statement ok From 022c5a7b0846b59b2ddd35dec32ab3c12d52b4db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=82=8E=E6=B3=BC?= Date: Sat, 28 Sep 2024 16:27:18 +0800 Subject: [PATCH 04/14] refactor: gc_dropped_db_by_id() (#16538) --- src/common/base/src/lib.rs | 1 + src/common/base/src/vec_ext.rs | 27 ++++++ src/meta/api/src/schema_api_impl.rs | 143 +++++++++++++--------------- src/meta/api/src/util.rs | 14 +++ 4 files changed, 106 insertions(+), 79 deletions(-) create mode 100644 src/common/base/src/vec_ext.rs diff --git a/src/common/base/src/lib.rs b/src/common/base/src/lib.rs index 2fa7b8dd03dc..04ae30bd8bb0 100644 --- a/src/common/base/src/lib.rs +++ b/src/common/base/src/lib.rs @@ -33,6 +33,7 @@ pub mod headers; pub mod mem_allocator; pub mod rangemap; pub mod runtime; +pub mod vec_ext; pub mod version; pub use runtime::dump_backtrace; diff --git a/src/common/base/src/vec_ext.rs b/src/common/base/src/vec_ext.rs new file mode 100644 index 000000000000..f78dcb3625ce --- /dev/null +++ b/src/common/base/src/vec_ext.rs @@ -0,0 +1,27 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub trait VecExt { + /// Remove the first element that is equal to the given item. 
+ fn remove_first(&mut self, item: &T) -> Option + where T: PartialEq; +} + +impl VecExt for Vec { + fn remove_first(&mut self, item: &T) -> Option + where T: PartialEq { + let pos = self.iter().position(|x| x == item)?; + Some(self.remove(pos)) + } +} diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index e3b482986866..85bca0d09995 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -27,6 +27,7 @@ use chrono::DateTime; use chrono::Utc; use databend_common_base::base::uuid::Uuid; use databend_common_base::display::display_slice::DisplaySliceExt; +use databend_common_base::vec_ext::VecExt; use databend_common_meta_app::app_error::AppError; use databend_common_meta_app::app_error::CommitTableMetaError; use databend_common_meta_app::app_error::CreateAsDropTableWithoutDropTime; @@ -175,7 +176,6 @@ use databend_common_meta_types::ConditionResult; use databend_common_meta_types::MatchSeqExt; use databend_common_meta_types::MetaError; use databend_common_meta_types::MetaId; -use databend_common_meta_types::TxnCondition; use databend_common_meta_types::TxnGetRequest; use databend_common_meta_types::TxnGetResponse; use databend_common_meta_types::TxnOp; @@ -218,6 +218,7 @@ use crate::util::deserialize_struct_get_response; use crate::util::mget_pb_values; use crate::util::txn_delete_exact; use crate::util::txn_op_put_pb; +use crate::util::txn_put_pb; use crate::util::txn_replace_exact; use crate::util::unknown_database_error; use crate::SchemaApi; @@ -3707,100 +3708,84 @@ async fn gc_dropped_db_by_id( db_name: String, ) -> Result<(), KVAppError> { // List tables by tenant, db_id, table_name. - let dbid_idlist = DatabaseIdHistoryIdent::new(tenant, db_name); - let (db_id_list_seq, db_id_list_opt) = kv_api.get_pb_seq_and_value(&dbid_idlist).await?; + let db_id_history_ident = DatabaseIdHistoryIdent::new(tenant, db_name); + let Some(seq_dbid_list) = kv_api.get_pb(&db_id_history_ident).await? else { + return Ok(()); + }; + + let mut db_id_list = seq_dbid_list.data; + + // If the db_id is not in the list, return. + if db_id_list.id_list.remove_first(&db_id).is_none() { + return Ok(()); + } - let mut db_id_list = match db_id_list_opt { - Some(list) => list, - None => return Ok(()), + let dbid = DatabaseId { db_id }; + let Some(seq_db_meta) = kv_api.get_pb(&dbid).await? else { + return Ok(()); }; - for (i, dbid) in db_id_list.id_list.iter().enumerate() { - if *dbid != db_id { - continue; - } - let dbid = DatabaseId { db_id }; - let (db_meta_seq, _db_meta) = kv_api.get_pb_seq_and_value(&dbid).await?; - if db_meta_seq == 0 { - return Ok(()); - } - let id_to_name = DatabaseIdToName { db_id }; - let (name_ident_seq, _name_ident) = kv_api.get_pb_seq_and_value(&id_to_name).await?; - if name_ident_seq == 0 { - return Ok(()); - } - let dbid_tbname_idlist = TableIdHistoryIdent { - database_id: db_id, - table_name: "".to_string(), - }; + // TODO: enable this when gc_in_progress is set. + // if !seq_db_meta.gc_in_progress { + // let err = UnknownDatabaseId::new( + // db_id, + // "database is not in gc_in_progress state, \ + // can not gc. \ + // First mark the database as gc_in_progress, \ + // then gc is allowed.", + // ); + // return Err(AppError::from(err).into()); + // } - let dir_name = DirName::new(dbid_tbname_idlist); + let id_to_name = DatabaseIdToName { db_id }; + let Some(seq_name) = kv_api.get_pb(&id_to_name).await? 
else { + return Ok(()); + }; - let table_id_list_keys = list_keys(kv_api, &dir_name).await?; - let keys: Vec = table_id_list_keys - .iter() - .map(|table_id_list_key| { - TableIdHistoryIdent { - database_id: db_id, - table_name: table_id_list_key.table_name.clone(), - } - .to_string_key() - }) - .collect(); + let table_history_ident = TableIdHistoryIdent { + database_id: db_id, + table_name: "dummy".to_string(), + }; + let dir_name = DirName::new(table_history_ident); - let mut txn = TxnRequest::default(); + let table_history_items = kv_api.list_pb_vec(&dir_name).await?; - for c in keys.chunks(DEFAULT_MGET_SIZE) { - let tb_id_list_seq_vec: Vec<(u64, Option)> = - mget_pb_values(kv_api, c).await?; - let mut iter = c.iter(); - for (tb_id_list_seq, tb_id_list_opt) in tb_id_list_seq_vec { - let tb_id_list = match tb_id_list_opt { - Some(list) => list, - None => { - continue; - } - }; + let mut txn = TxnRequest::default(); - for tb_id in tb_id_list.id_list { - let table_id_ident = TableId { table_id: tb_id }; - remove_copied_files_for_dropped_table(kv_api, &table_id_ident).await?; - remove_data_for_dropped_table(kv_api, &table_id_ident, &mut txn).await?; - remove_index_for_dropped_table(kv_api, tenant, &table_id_ident, &mut txn) - .await?; - } + for (ident, table_history) in table_history_items { + for tb_id in table_history.id_list.iter() { + let table_id_ident = TableId { table_id: *tb_id }; - let id_key = iter.next().unwrap(); - txn.if_then.push(TxnOp::delete(id_key)); - txn.condition - .push(TxnCondition::eq_seq(id_key, tb_id_list_seq)); - } + // TODO: mark table as gc_in_progress - // for id_key in c { - // if_then.push(txn_op_del(id_key)); - // } - } - db_id_list.id_list.remove(i); - txn.condition - .push(txn_cond_seq(&dbid_idlist, Eq, db_id_list_seq)); - if db_id_list.id_list.is_empty() { - txn.if_then.push(txn_op_del(&dbid_idlist)); - } else { - // save new db id list - txn.if_then - .push(txn_op_put(&dbid_idlist, serialize_struct(&db_id_list)?)); + remove_copied_files_for_dropped_table(kv_api, &table_id_ident).await?; + remove_data_for_dropped_table(kv_api, &table_id_ident, &mut txn).await?; + remove_index_for_dropped_table(kv_api, tenant, &table_id_ident, &mut txn).await?; } - txn.condition.push(txn_cond_seq(&dbid, Eq, db_meta_seq)); - txn.if_then.push(txn_op_del(&dbid)); txn.condition - .push(txn_cond_seq(&id_to_name, Eq, name_ident_seq)); - txn.if_then.push(txn_op_del(&id_to_name)); + .push(txn_cond_eq_seq(&ident, table_history.seq)); + txn.if_then.push(txn_op_del(&ident)); + } - let _resp = kv_api.transaction(txn).await?; - break; + txn.condition + .push(txn_cond_eq_seq(&db_id_history_ident, seq_dbid_list.seq)); + if db_id_list.id_list.is_empty() { + txn.if_then.push(txn_op_del(&db_id_history_ident)); + } else { + // save new db id list + txn.if_then + .push(txn_put_pb(&db_id_history_ident, &db_id_list)?); } + txn.condition.push(txn_cond_eq_seq(&dbid, seq_db_meta.seq)); + txn.if_then.push(txn_op_del(&dbid)); + txn.condition + .push(txn_cond_eq_seq(&id_to_name, seq_name.seq)); + txn.if_then.push(txn_op_del(&id_to_name)); + + let _resp = kv_api.transaction(txn).await?; + Ok(()) } diff --git a/src/meta/api/src/util.rs b/src/meta/api/src/util.rs index 72526372d0b7..df60eb580457 100644 --- a/src/meta/api/src/util.rs +++ b/src/meta/api/src/util.rs @@ -305,6 +305,20 @@ pub fn txn_cond_seq(key: &impl kvapi::Key, op: ConditionResult, seq: u64) -> Txn } } +pub fn txn_put_pb(key: &K, value: &K::ValueType) -> Result +where + K: kvapi::Key, + K::ValueType: FromToProto + 'static, +{ + let p = 
value.to_pb().map_err(|e| InvalidArgument::new(e, ""))?; + + let mut buf = vec![]; + prost::Message::encode(&p, &mut buf).map_err(|e| InvalidArgument::new(e, ""))?; + + Ok(TxnOp::put(key.to_string_key(), buf)) +} + +/// Deprecate this. Replace it with `txn_put_pb().with_ttl()` pub fn txn_op_put_pb( key: &K, value: &K::ValueType, From d8d6b47e70d659e1e5e038684ba2e5a8ad3545b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=82=8E=E6=B3=BC?= Date: Sat, 28 Sep 2024 21:38:34 +0800 Subject: [PATCH 05/14] fix: Only when all tables are included, the db can be dropped (#16539) --- src/meta/api/src/schema_api_impl.rs | 113 +++++----------------------- 1 file changed, 18 insertions(+), 95 deletions(-) diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index 85bca0d09995..404e6b7f1cf2 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -2684,7 +2684,7 @@ impl + ?Sized> SchemaApi for KV { let table_infos = do_get_table_history(self, db_filter, capacity).await?; // A DB can be removed only when all its tables are removed. - if vacuum_db && capacity >= table_infos.len() { + if vacuum_db && capacity > table_infos.len() { vacuum_ids.push(DroppedId::Db { db_id: db_info.database_id.db_id, db_name: db_info.name_ident.database_name().to_string(), @@ -3578,31 +3578,6 @@ async fn batch_filter_table_info( Ok(()) } -type TableFilterInfoList<'a> = Vec<( - Range>>, - &'a Arc, - u64, - String, -)>; - -#[logcall::logcall(input = "")] -#[fastrace::trace] -async fn get_gc_table_info( - kv_api: &(impl kvapi::KVApi + ?Sized), - limit: usize, - table_id_list: &TableFilterInfoList<'_>, -) -> Result, u64)>, KVAppError> { - let table_id_list = &table_id_list[..std::cmp::min(limit, table_id_list.len())]; - - let mut filter_tb_infos = vec![]; - - for chunk in table_id_list.chunks(DEFAULT_MGET_SIZE) { - batch_filter_table_info(kv_api, chunk, &mut filter_tb_infos).await?; - } - - Ok(filter_tb_infos) -} - #[logcall::logcall(input = "")] #[fastrace::trace] async fn do_get_table_history( @@ -3610,88 +3585,35 @@ async fn do_get_table_history( db_filter: (Range>>, Arc), limit: usize, ) -> Result, u64)>, KVAppError> { - let mut filter_tb_infos = vec![]; - - // step 1: list db table name with db id - let mut filter_db_info_with_table_id_key_list: Vec<_> = vec![]; - let (drop_time_range, db_info) = db_filter; let db_id = db_info.database_id.db_id; - // List tables by tenant, db_id, table_name. 
let dbid_tbname_idlist = TableIdHistoryIdent { database_id: db_id, table_name: "dummy".to_string(), }; let dir_name = DirName::new(dbid_tbname_idlist); - let strm = kv_api.list_pb_keys(&dir_name).await?; - let table_id_list_keys = strm.try_collect::>().await?; - - let keys = table_id_list_keys - .iter() - .map(|table_id_list_key| (drop_time_range.clone(), &db_info, table_id_list_key.clone())) - .collect::>(); - - filter_db_info_with_table_id_key_list.extend(keys); - - // step 2: list all table id of table by table name - let keys = filter_db_info_with_table_id_key_list - .iter() - .map(|(_, db_info, table_id_list_key)| TableIdHistoryIdent { - database_id: db_info.database_id.db_id, - table_name: table_id_list_key.table_name.clone(), - }) - .collect::>(); - - let mut filter_db_info_with_table_id_list: TableFilterInfoList<'_> = vec![]; - let mut table_id_list_keys_iter = filter_db_info_with_table_id_key_list.into_iter(); - for c in keys.chunks(DEFAULT_MGET_SIZE) { - let strm = kv_api.get_pb_values(c.to_vec()).await?; - let table_id_list_vec = strm - .try_filter_map(|x| async move { Ok(x) }) - .try_collect::>() - .await?; + let table_history_kvs = kv_api.list_pb_vec(&dir_name).await?; - for seq_table_id_list in table_id_list_vec { - let (filter, db_info, table_id_list_key) = table_id_list_keys_iter.next().unwrap(); - let tb_id_list = seq_table_id_list.data; - - let id_list: Vec<_> = tb_id_list - .id_list - .iter() - .map(|id| { - ( - filter.clone(), - db_info, - *id, - table_id_list_key.table_name.clone(), - ) - }) - .collect(); + let mut the_list = vec![]; - filter_db_info_with_table_id_list.extend(id_list); - if filter_db_info_with_table_id_list.len() < DEFAULT_MGET_SIZE { - continue; - } - - let ret = get_gc_table_info(kv_api, limit, &filter_db_info_with_table_id_list).await?; - filter_tb_infos.extend(ret); - filter_db_info_with_table_id_list.clear(); - - if filter_tb_infos.len() >= limit { - return Ok(filter_tb_infos); - } + for (ident, table_history) in table_history_kvs { + for table_id in table_history.id_list.iter() { + the_list.push(( + drop_time_range.clone(), + &db_info, + *table_id, + ident.table_name.clone(), + )); } + } - if !filter_db_info_with_table_id_list.is_empty() { - let ret = get_gc_table_info(kv_api, limit, &filter_db_info_with_table_id_list).await?; - filter_tb_infos.extend(ret); - filter_db_info_with_table_id_list.clear(); + let mut filter_tb_infos = vec![]; - if filter_tb_infos.len() >= limit { - return Ok(filter_tb_infos); - } - } + for c in the_list[..std::cmp::min(limit, the_list.len())].chunks(DEFAULT_MGET_SIZE) { + let mut infos = vec![]; + batch_filter_table_info(kv_api, c, &mut infos).await?; + filter_tb_infos.extend(infos); } Ok(filter_tb_infos) @@ -3880,6 +3802,7 @@ async fn update_txn_to_remove_table_history( txn.if_then .push(txn_op_put_pb(table_id_history_ident, &history, None)?); } + Ok(()) } From c6bd10b9563e937a1bd2694822a0e825d9d2079e Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Sat, 28 Sep 2024 23:44:00 +0800 Subject: [PATCH 06/14] chore(query): add more asserts (#16536) * add tests * add tests --- src/query/catalog/src/plan/stream_column.rs | 5 +---- src/query/expression/src/block.rs | 14 ++++++++++++-- src/query/expression/src/kernels/filter.rs | 13 ++++++++++++- src/query/expression/src/kernels/sort_compare.rs | 2 ++ 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/src/query/catalog/src/plan/stream_column.rs b/src/query/catalog/src/plan/stream_column.rs index fd10a1c80bd2..40cdc1d0db72 100644 --- 
a/src/query/catalog/src/plan/stream_column.rs +++ b/src/query/catalog/src/plan/stream_column.rs @@ -126,10 +126,7 @@ impl StreamColumnMeta { } pub fn build_origin_block_row_num(num_rows: usize) -> BlockEntry { - let mut row_ids = Vec::with_capacity(num_rows); - for i in 0..num_rows { - row_ids.push(i as u64); - } + let row_ids = (0..num_rows as u64).collect(); let column = Value::Column(UInt64Type::from_data(row_ids)); BlockEntry::new( diff --git a/src/query/expression/src/block.rs b/src/query/expression/src/block.rs index 7911dd33d14c..820d5841d4a3 100644 --- a/src/query/expression/src/block.rs +++ b/src/query/expression/src/block.rs @@ -117,7 +117,6 @@ impl DataBlock { num_rows: usize, meta: Option, ) -> Self { - #[cfg(debug_assertions)] Self::check_columns_valid(&columns, num_rows).unwrap(); Self { @@ -130,6 +129,7 @@ impl DataBlock { fn check_columns_valid(columns: &[BlockEntry], num_rows: usize) -> Result<()> { for entry in columns.iter() { if let Value::Column(c) = &entry.value { + #[cfg(debug_assertions)] c.check_valid()?; if c.len() != num_rows { return Err(ErrorCode::Internal(format!( @@ -264,6 +264,12 @@ impl DataBlock { } pub fn slice(&self, range: Range) -> Self { + assert!( + range.end <= self.num_rows(), + "range {:?} out of len {}", + range, + self.num_rows() + ); let columns = self .columns() .iter() @@ -279,7 +285,11 @@ impl DataBlock { .collect(); Self { columns, - num_rows: range.end - range.start, + num_rows: if range.is_empty() { + 0 + } else { + range.end - range.start + }, meta: self.meta.clone(), } } diff --git a/src/query/expression/src/kernels/filter.rs b/src/query/expression/src/kernels/filter.rs index 053d7842b8cf..1b730323a24f 100644 --- a/src/query/expression/src/kernels/filter.rs +++ b/src/query/expression/src/kernels/filter.rs @@ -112,6 +112,7 @@ struct FilterVisitor<'a> { filter: &'a Bitmap, result: Option>, num_rows: usize, + original_rows: usize, } impl<'a> FilterVisitor<'a> { @@ -121,6 +122,7 @@ impl<'a> FilterVisitor<'a> { filter, result: None, num_rows, + original_rows: filter.len(), } } } @@ -130,6 +132,8 @@ impl<'a> ValueVisitor for FilterVisitor<'a> { match value { Value::Scalar(c) => self.visit_scalar(c), Value::Column(c) => { + assert!(c.len() == self.original_rows); + if c.len() == self.num_rows || c.len() == 0 { self.result = Some(Value::Column(c)); } else if self.num_rows == 0 { @@ -255,7 +259,14 @@ impl<'a> ValueVisitor for FilterVisitor<'a> { Ok(()) } - fn visit_boolean(&mut self, bitmap: Bitmap) -> Result<()> { + fn visit_boolean(&mut self, mut bitmap: Bitmap) -> Result<()> { + // faster path for all bits set + if bitmap.unset_bits() == 0 { + bitmap.slice(0, self.num_rows); + self.result = Some(Value::Column(BooleanType::upcast_column(bitmap))); + return Ok(()); + } + let capacity = self.num_rows.saturating_add(7) / 8; let mut builder: Vec = Vec::with_capacity(capacity); let mut builder_ptr = builder.as_mut_ptr(); diff --git a/src/query/expression/src/kernels/sort_compare.rs b/src/query/expression/src/kernels/sort_compare.rs index f492ba7178c9..45d91f0098d0 100644 --- a/src/query/expression/src/kernels/sort_compare.rs +++ b/src/query/expression/src/kernels/sort_compare.rs @@ -263,6 +263,7 @@ impl ValueVisitor for SortCompare { // faster path for numeric fn visit_number(&mut self, column: Buffer) -> Result<()> { let values = column.as_slice(); + assert!(values.len() == self.rows); self.generic_sort(values, |c, idx| c[idx as usize], |a: T, b: T| a.cmp(&b)); Ok(()) } @@ -276,6 +277,7 @@ impl ValueVisitor for SortCompare { } fn 
visit_typed_column(&mut self, col: T::Column) -> Result<()> { + assert!(T::column_len(&col) == self.rows); self.generic_sort( &col, |c, idx| -> T::ScalarRef<'_> { unsafe { T::index_column_unchecked(c, idx as _) } }, From 4129816b5e4fc72ce688879599dc2252e70058ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=82=8E=E6=B3=BC?= Date: Sun, 29 Sep 2024 08:47:22 +0800 Subject: [PATCH 07/14] refactor: SchemaAPI::do_get_table_history (#16540) --- src/meta/api/src/schema_api_impl.rs | 116 +++++++++++----------------- 1 file changed, 44 insertions(+), 72 deletions(-) diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index 404e6b7f1cf2..88304f4432a1 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -3528,56 +3528,6 @@ fn build_upsert_table_deduplicated_label(deduplicated_label: String) -> TxnOp { ) } -#[allow(clippy::type_complexity)] -#[logcall::logcall(input = "")] -#[fastrace::trace] -async fn batch_filter_table_info( - kv_api: &(impl kvapi::KVApi + ?Sized), - args: &[( - Range>>, - &Arc, - u64, - String, - )], - filter_tb_infos: &mut Vec<(Arc, u64)>, -) -> Result<(), KVAppError> { - let table_id_idents = args - .iter() - .map(|(_f, _db, table_id, _table_name)| TableId::new(*table_id)); - - let seq_metas = kv_api.get_pb_values_vec(table_id_idents).await?; - - for (seq_meta, (rng, db_info, table_id, table_name)) in seq_metas.into_iter().zip(args.iter()) { - let Some(seq_meta) = seq_meta else { - error!( - "batch_filter_table_info cannot find {:?} table_meta", - table_id - ); - continue; - }; - - if !rng.contains(&seq_meta.data.drop_on) { - continue; - } - - let tb_info = TableInfo { - ident: TableIdent { - table_id: *table_id, - seq: seq_meta.seq, - }, - desc: format!("'{}'.'{}'", db_info.name_ident.database_name(), table_name,), - name: (*table_name).clone(), - meta: seq_meta.data, - db_type: DatabaseType::NormalDB, - catalog_info: Default::default(), - }; - - filter_tb_infos.push((Arc::new(tb_info), db_info.database_id.db_id)); - } - - Ok(()) -} - #[logcall::logcall(input = "")] #[fastrace::trace] async fn do_get_table_history( @@ -3595,25 +3545,48 @@ async fn do_get_table_history( let dir_name = DirName::new(dbid_tbname_idlist); let table_history_kvs = kv_api.list_pb_vec(&dir_name).await?; - let mut the_list = vec![]; + let mut args = vec![]; for (ident, table_history) in table_history_kvs { for table_id in table_history.id_list.iter() { - the_list.push(( - drop_time_range.clone(), - &db_info, - *table_id, - ident.table_name.clone(), - )); + args.push((TableId::new(*table_id), ident.table_name.clone())); } } let mut filter_tb_infos = vec![]; - for c in the_list[..std::cmp::min(limit, the_list.len())].chunks(DEFAULT_MGET_SIZE) { - let mut infos = vec![]; - batch_filter_table_info(kv_api, c, &mut infos).await?; - filter_tb_infos.extend(infos); + for chunk in args[..std::cmp::min(limit, args.len())].chunks(DEFAULT_MGET_SIZE) { + let table_id_idents = chunk.iter().map(|(table_id, _)| table_id.clone()); + + let seq_metas = kv_api.get_pb_values_vec(table_id_idents).await?; + + for (seq_meta, (table_id, table_name)) in seq_metas.into_iter().zip(chunk.iter()) { + let Some(seq_meta) = seq_meta else { + error!( + "batch_filter_table_info cannot find {:?} table_meta", + table_id + ); + continue; + }; + + if !drop_time_range.contains(&seq_meta.data.drop_on) { + continue; + } + + let tb_info = TableInfo { + ident: TableIdent { + table_id: table_id.table_id, + seq: seq_meta.seq, + }, + desc: format!("'{}'.'{}'", 
db_info.name_ident.database_name(), table_name,), + name: (*table_name).clone(), + meta: seq_meta.data, + db_type: DatabaseType::NormalDB, + catalog_info: Default::default(), + }; + + filter_tb_infos.push((Arc::new(tb_info), db_info.database_id.db_id)); + } } Ok(filter_tb_infos) @@ -3681,7 +3654,7 @@ async fn gc_dropped_db_by_id( // TODO: mark table as gc_in_progress remove_copied_files_for_dropped_table(kv_api, &table_id_ident).await?; - remove_data_for_dropped_table(kv_api, &table_id_ident, &mut txn).await?; + let _ = remove_data_for_dropped_table(kv_api, &table_id_ident, &mut txn).await?; remove_index_for_dropped_table(kv_api, tenant, &table_id_ident, &mut txn).await?; } @@ -3731,7 +3704,7 @@ async fn gc_dropped_table_by_id( let mut txn = TxnRequest::default(); // 1) - remove_data_for_dropped_table(kv_api, table_id_ident, &mut txn).await?; + let _ = remove_data_for_dropped_table(kv_api, table_id_ident, &mut txn).await?; // 2) let table_id_history_ident = TableIdHistoryIdent { @@ -3814,21 +3787,20 @@ async fn remove_data_for_dropped_table( kv_api: &(impl kvapi::KVApi + ?Sized), table_id: &TableId, txn: &mut TxnRequest, -) -> Result<(), KVAppError> { +) -> Result, MetaError> { let seq_meta = kv_api.get_pb(table_id).await?; let Some(seq_meta) = seq_meta else { - error!( - "gc_dropped_table_by_id cannot find {:?} table_meta", - table_id - ); - return Ok(()); + let err = format!("cannot find TableMeta by id: {:?}, ", table_id); + error!("{}", err); + return Ok(Err(err)); }; // TODO: enable this check. Currently when gc db, the table may not be dropped. // if seq_meta.data.drop_on.is_none() { - // warn!("gc_dropped_table_by_id {:?} is not dropped", table_id); - // return Ok(()); + // let err = format!("Table {:?} is not dropped, can not remove", table_id); + // warn!("{}", err); + // return Ok(Err(err)); // } txn_delete_exact(txn, table_id, seq_meta.seq); @@ -3844,7 +3816,7 @@ async fn remove_data_for_dropped_table( txn_delete_exact(txn, &id_to_name, seq_name.seq); } - Ok(()) + Ok(Ok(())) } async fn remove_index_for_dropped_table( From 5bfafbeb055dc19c8a565e44366b14595ba6717a Mon Sep 17 00:00:00 2001 From: arkzuse <88483186+arkzuse@users.noreply.github.com> Date: Sun, 29 Sep 2024 07:56:41 +0530 Subject: [PATCH 08/14] feat: short sql length setting (#16502) feature: short sql length setting --- src/common/base/src/base/string.rs | 9 +++---- src/common/base/tests/it/string.rs | 25 ++++++++++++++----- .../service/src/interpreters/interpreter.rs | 5 +++- .../src/servers/http/clickhouse_handler.rs | 11 +++++++- .../src/servers/http/v1/query/http_query.rs | 2 +- .../service/src/sessions/query_ctx_shared.rs | 7 +++++- src/query/settings/src/settings_default.rs | 6 +++++ .../settings/src/settings_getter_setter.rs | 8 ++++++ src/query/settings/tests/it/setting.rs | 19 ++++++++++++++ 9 files changed, 77 insertions(+), 15 deletions(-) diff --git a/src/common/base/src/base/string.rs b/src/common/base/src/base/string.rs index e240e2630d13..0b462d0d4c93 100644 --- a/src/common/base/src/base/string.rs +++ b/src/common/base/src/base/string.rs @@ -193,8 +193,7 @@ pub fn mask_connection_info(sql: &str) -> String { /// Maximum length of the SQL query to be displayed or log. /// If the query exceeds this length and starts with keywords, /// it will be truncated and appended with the remaining length. 
-pub fn short_sql(sql: String) -> String { - const MAX_LENGTH: usize = 128; +pub fn short_sql(sql: String, max_length: u64) -> String { let keywords = ["INSERT"]; fn starts_with_any(query: &str, keywords: &[&str]) -> bool { @@ -209,10 +208,10 @@ pub fn short_sql(sql: String) -> String { // of multiple Unicode code points. // This ensures that we handle complex characters like emojis or // accented characters properly. - if query.graphemes(true).count() > MAX_LENGTH && starts_with_any(query, &keywords) { - let truncated: String = query.graphemes(true).take(MAX_LENGTH).collect(); + if query.graphemes(true).count() > max_length as usize && starts_with_any(query, &keywords) { + let truncated: String = query.graphemes(true).take(max_length as usize).collect(); let original_length = query.graphemes(true).count(); - let remaining_length = original_length.saturating_sub(MAX_LENGTH); + let remaining_length = original_length.saturating_sub(max_length as usize); // Append the remaining length indicator truncated + &format!("...[{} more characters]", remaining_length) } else { diff --git a/src/common/base/tests/it/string.rs b/src/common/base/tests/it/string.rs index 4b49c57f2673..5cdf951f7b4d 100644 --- a/src/common/base/tests/it/string.rs +++ b/src/common/base/tests/it/string.rs @@ -111,9 +111,10 @@ fn test_mask_connection_info() { #[test] fn test_short_sql() { + let max_length: u64 = 128; // Test case 1: SQL query shorter than 128 characters let sql1 = "SELECT * FROM users WHERE id = 1;".to_string(); - assert_eq!(short_sql(sql1.clone()), sql1); + assert_eq!(short_sql(sql1.clone(), max_length), sql1); // Test case 2: SQL query longer than 128 characters and starts with "INSERT" let long_sql_insert = "INSERT INTO users (id, name, email) VALUES ".to_string() @@ -123,23 +124,32 @@ fn test_short_sql() { let truncated: String = long_sql_insert.graphemes(true).take(128).collect(); truncated + &format!("...[{} more characters]", expected_length_insert) }; - assert_eq!(short_sql(long_sql_insert.clone()), expected_result_insert); + assert_eq!( + short_sql(long_sql_insert.clone(), max_length), + expected_result_insert + ); // Test case 3: SQL query longer than 128 characters but does not start with "INSERT" let long_sql_update = "UPDATE users SET name = 'John' WHERE id = 1;".to_string() + &"id = 1 OR ".repeat(20); // Make sure this creates a string longer than 128 characters - assert_eq!(short_sql(long_sql_update.clone()), long_sql_update); + assert_eq!( + short_sql(long_sql_update.clone(), max_length), + long_sql_update + ); // Test case 4: Empty SQL query let empty_sql = "".to_string(); - assert_eq!(short_sql(empty_sql.clone()), empty_sql); + assert_eq!(short_sql(empty_sql.clone(), max_length), empty_sql); // Test case 5: SQL query with leading whitespace let sql_with_whitespace = " INSERT INTO users (id, name, email) VALUES (1, 'John Doe', 'john@example.com');" .to_string(); let trimmed_sql = sql_with_whitespace.trim_start().to_string(); - assert_eq!(short_sql(sql_with_whitespace.clone()), trimmed_sql); + assert_eq!( + short_sql(sql_with_whitespace.clone(), max_length), + trimmed_sql + ); // Test case 6: SQL query with multiple emojis to test truncation at an emoji point let emoji_sql = "INSERT INTO users (id, name) VALUES (1, 'John Doe 😊😊😊😊😊😊😊😊😊😊');" @@ -150,5 +160,8 @@ fn test_short_sql() { let remaining_length = emoji_sql.graphemes(true).count().saturating_sub(128); truncated + &format!("...[{} more characters]", remaining_length) }; - assert_eq!(short_sql(emoji_sql.clone()), expected_emoji_result); + 
assert_eq!( + short_sql(emoji_sql.clone(), max_length), + expected_emoji_result + ); } diff --git a/src/query/service/src/interpreters/interpreter.rs b/src/query/service/src/interpreters/interpreter.rs index fa1c9e311377..c74892dc32a0 100644 --- a/src/query/service/src/interpreters/interpreter.rs +++ b/src/query/service/src/interpreters/interpreter.rs @@ -207,7 +207,10 @@ pub async fn interpreter_plan_sql(ctx: Arc, sql: &str) -> Result<( Arc::new(ServiceQueryExecutor::new(ctx.clone())), ); let result = planner.plan_sql(sql).await; - let short_sql = short_sql(sql.to_string()); + let short_sql = short_sql( + sql.to_string(), + ctx.get_settings().get_short_sql_max_length()?, + ); let mut stmt = if let Ok((_, extras)) = &result { Some(extras.statement.clone()) } else { diff --git a/src/query/service/src/servers/http/clickhouse_handler.rs b/src/query/service/src/servers/http/clickhouse_handler.rs index 8020159f1d14..22f41136144f 100644 --- a/src/query/service/src/servers/http/clickhouse_handler.rs +++ b/src/query/service/src/servers/http/clickhouse_handler.rs @@ -330,7 +330,16 @@ pub async fn clickhouse_handler_post( // other parts of the request already logged in middleware let len = sql.len(); let msg = if len > n { - format!("{}...(omit {} bytes)", short_sql(sql.clone()), len - n) + format!( + "{}...(omit {} bytes)", + short_sql( + sql.clone(), + ctx.get_settings() + .get_short_sql_max_length() + .unwrap_or(1000) + ), + len - n + ) } else { sql.to_string() }; diff --git a/src/query/service/src/servers/http/v1/query/http_query.rs b/src/query/service/src/servers/http/v1/query/http_query.rs index 8641e665392f..f714a72b9ba0 100644 --- a/src/query/service/src/servers/http/v1/query/http_query.rs +++ b/src/query/service/src/servers/http/v1/query/http_query.rs @@ -130,7 +130,7 @@ impl Debug for HttpQueryRequest { f.debug_struct("HttpQueryRequest") .field("session_id", &self.session_id) .field("session", &self.session) - .field("sql", &short_sql(self.sql.clone())) + .field("sql", &short_sql(self.sql.clone(), 1000)) .field("pagination", &self.pagination) .field("string_fields", &self.string_fields) .field("stage_attachment", &self.stage_attachment) diff --git a/src/query/service/src/sessions/query_ctx_shared.rs b/src/query/service/src/sessions/query_ctx_shared.rs index 55c1f44bdd7a..cb073e69e772 100644 --- a/src/query/service/src/sessions/query_ctx_shared.rs +++ b/src/query/service/src/sessions/query_ctx_shared.rs @@ -487,7 +487,12 @@ impl QueryContextShared { pub fn attach_query_str(&self, kind: QueryKind, query: String) { { let mut running_query = self.running_query.write(); - *running_query = Some(short_sql(query)); + *running_query = Some(short_sql( + query, + self.get_settings() + .get_short_sql_max_length() + .unwrap_or(1000), + )); } { diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index bef9c5ae9dbc..a1a6ddb005f1 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -909,6 +909,12 @@ impl DefaultSettings { mode: SettingMode::Both, range: Some(SettingRange::Numeric(0..=u64::MAX)), }), + ("short_sql_max_length", DefaultSettingValue { + value: UserSettingValue::UInt64(128), + desc: "Sets the maximum length for truncating SQL queries in short_sql function.", + mode: SettingMode::Both, + range: Some(SettingRange::Numeric(1..=1024*1024)), + }), ]); Ok(Arc::new(DefaultSettings { diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs 
index 521fa92c11d4..e54806e1550f 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -748,4 +748,12 @@ impl Settings { pub fn get_max_spill_io_requests(&self) -> Result { self.try_get_u64("max_spill_io_requests") } + + pub fn get_short_sql_max_length(&self) -> Result { + self.try_get_u64("short_sql_max_length") + } + + pub fn set_short_sql_max_length(&self, val: u64) -> Result<()> { + self.try_set_u64("short_sql_max_length", val) + } } diff --git a/src/query/settings/tests/it/setting.rs b/src/query/settings/tests/it/setting.rs index b318034a0d79..f99634773add 100644 --- a/src/query/settings/tests/it/setting.rs +++ b/src/query/settings/tests/it/setting.rs @@ -98,6 +98,25 @@ async fn test_set_settings() { let expect = "WrongValueForVariable. Code: 2803, Text = Value xx is not within the allowed values [\"None\", \"LZ4\", \"ZSTD\"]."; assert_eq!(expect, format!("{}", result.unwrap_err())); } + + // Number Range + { + // Ok + settings + .set_setting("short_sql_max_length".to_string(), "1000".to_string()) + .unwrap(); + + // Range 1024*1024 + let result = + settings.set_setting("short_sql_max_length".to_string(), "1048577".to_string()); + let expect = "WrongValueForVariable. Code: 2803, Text = Value 1048577 is not within the range [1, 1048576]."; + assert_eq!(expect, format!("{}", result.unwrap_err())); + + // Range 1 + let result = settings.set_setting("short_sql_max_length".to_string(), "0".to_string()); + let expect = "WrongValueForVariable. Code: 2803, Text = Value 0 is not within the range [1, 1048576]."; + assert_eq!(expect, format!("{}", result.unwrap_err())); + } } #[tokio::test(flavor = "multi_thread", worker_threads = 1)] From de5a3d6adb4b4cfd20dec1cd401e3bdbaa5192b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=82=8E=E6=B3=BC?= Date: Sun, 29 Sep 2024 15:50:53 +0800 Subject: [PATCH 09/14] refactor: `DroppedId` for listing db/tables for gc (#16542) There is no need to store tables inside `DroppedId::Db`: the tables belonging to a DB for gc can still be stored in `DroppedId::Table`. --- src/meta/api/src/schema_api_impl.rs | 44 +++--- src/meta/api/src/schema_api_test_suite.rs | 145 +++++++----------- src/meta/app/src/schema/table.rs | 11 +- .../interpreter_vacuum_drop_tables.rs | 16 +- 4 files changed, 77 insertions(+), 139 deletions(-) diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index 88304f4432a1..877ffdcb35f2 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -2678,31 +2678,25 @@ impl + ?Sized> SchemaApi for KV { drop_time_range.clone() }; - let db_filter = (table_drop_time_range, db_info.clone()); - let capacity = the_limit - vacuum_table_infos.len(); - let table_infos = do_get_table_history(self, db_filter, capacity).await?; + let table_infos = + do_get_table_history(self, table_drop_time_range, db_info.clone(), capacity) + .await?; + + for (table_info, db_id) in table_infos.iter() { + vacuum_ids.push(DroppedId::new_table( + *db_id, + table_info.ident.table_id, + table_info.name.clone(), + )); + } // A DB can be removed only when all its tables are removed. 
if vacuum_db && capacity > table_infos.len() { vacuum_ids.push(DroppedId::Db { db_id: db_info.database_id.db_id, db_name: db_info.name_ident.database_name().to_string(), - tables: table_infos - .iter() - .map(|(table_info, _)| { - (table_info.ident.table_id, table_info.name.clone()) - }) - .collect(), }); - } else { - for (table_info, db_id) in table_infos.iter().take(capacity) { - vacuum_ids.push(DroppedId::new_table( - *db_id, - table_info.ident.table_id, - table_info.name.clone(), - )); - } } vacuum_table_infos.extend( @@ -2742,8 +2736,8 @@ impl + ?Sized> SchemaApi for KV { name_ident: tenant_dbname.clone(), meta: db_meta, }); - let db_filter = (drop_time_range.clone(), db_info); - let table_infos = do_get_table_history(self, db_filter, the_limit).await?; + let table_infos = + do_get_table_history(self, drop_time_range.clone(), db_info, the_limit).await?; let mut drop_ids = vec![]; let mut drop_table_infos = vec![]; @@ -2766,11 +2760,9 @@ impl + ?Sized> SchemaApi for KV { async fn gc_drop_tables(&self, req: GcDroppedTableReq) -> Result<(), KVAppError> { for drop_id in req.drop_ids { match drop_id { - DroppedId::Db { - db_id, - db_name, - tables: _, - } => gc_dropped_db_by_id(self, db_id, &req.tenant, db_name).await?, + DroppedId::Db { db_id, db_name } => { + gc_dropped_db_by_id(self, db_id, &req.tenant, db_name).await? + } DroppedId::Table { name, id } => { gc_dropped_table_by_id(self, &req.tenant, &name, &id).await? } @@ -3532,10 +3524,10 @@ fn build_upsert_table_deduplicated_label(deduplicated_label: String) -> TxnOp { #[fastrace::trace] async fn do_get_table_history( kv_api: &(impl kvapi::KVApi + ?Sized), - db_filter: (Range>>, Arc), + drop_time_range: Range>>, + db_info: Arc, limit: usize, ) -> Result, u64)>, KVAppError> { - let (drop_time_range, db_info) = db_filter; let db_id = db_info.database_id.db_id; let dbid_tbname_idlist = TableIdHistoryIdent { diff --git a/src/meta/api/src/schema_api_test_suite.rs b/src/meta/api/src/schema_api_test_suite.rs index e17b6b752288..9f6e253cd309 100644 --- a/src/meta/api/src/schema_api_test_suite.rs +++ b/src/meta/api/src/schema_api_test_suite.rs @@ -4071,8 +4071,9 @@ impl SchemaApiTestSuite { }; let created_on = Utc::now(); - let mut drop_ids_1 = vec![]; - let mut drop_ids_2 = vec![]; + // The expected drop_ids built with and without retention time boundary + let mut drop_ids_boundary = vec![]; + let mut drop_ids_no_boundary = vec![]; // first create a database drop within filter time info!("--- create db1"); @@ -4081,42 +4082,38 @@ impl SchemaApiTestSuite { let req = CreateDatabaseReq { create_option: CreateOption::Create, name_ident: db_name.clone(), - meta: DatabaseMeta { - engine: "".to_string(), - ..DatabaseMeta::default() - }, + meta: DatabaseMeta::default(), }; let res = mt.create_database(req).await?; - drop_ids_1.push(DroppedId::Db { - db_id: *res.db_id, - db_name: db_name.database_name().to_string(), - tables: vec![], - }); - drop_ids_2.push(DroppedId::Db { - db_id: *res.db_id, - db_name: db_name.database_name().to_string(), - tables: vec![], - }); + let db1_id = res.db_id.db_id; let req = CreateTableReq { create_option: CreateOption::Create, - name_ident: TableNameIdent { - tenant: Tenant::new_or_err(tenant_name, func_name!())?, - db_name: "db1".to_string(), - table_name: "tb1".to_string(), - }, - + name_ident: TableNameIdent::new(&tenant, "db1", "tb1"), table_meta: table_meta(created_on), as_dropped: false, }; - let _resp = mt.create_table(req.clone()).await?; + let resp = mt.create_table(req.clone()).await?; + let db1_tb1_id = 
resp.table_id; mt.drop_database(DropDatabaseReq { if_exists: false, name_ident: DatabaseNameIdent::new(&tenant, "db1"), }) .await?; + + drop_ids_boundary.push(DroppedId::new_table(db1_id, db1_tb1_id, "tb1")); + drop_ids_boundary.push(DroppedId::Db { + db_id: db1_id, + db_name: db_name.database_name().to_string(), + }); + + drop_ids_no_boundary.push(DroppedId::new_table(db1_id, db1_tb1_id, "tb1")); + drop_ids_no_boundary.push(DroppedId::Db { + db_id: db1_id, + db_name: db_name.database_name().to_string(), + }); } // second create a database drop outof filter time, but has a table drop within filter time @@ -4125,18 +4122,14 @@ impl SchemaApiTestSuite { let create_db_req = CreateDatabaseReq { create_option: CreateOption::Create, name_ident: DatabaseNameIdent::new(&tenant, "db2"), - meta: DatabaseMeta { - engine: "".to_string(), - ..DatabaseMeta::default() - }, + meta: Default::default(), }; let res = mt.create_database(create_db_req.clone()).await?; - let db_id = res.db_id; - drop_ids_2.push(DroppedId::Db { - db_id: *db_id, + let db2_id = res.db_id; + drop_ids_no_boundary.push(DroppedId::Db { + db_id: *db2_id, db_name: "db2".to_string(), - tables: vec![], }); info!("--- create and drop db2.tb1"); @@ -4153,16 +4146,21 @@ impl SchemaApiTestSuite { as_dropped: false, }; let resp = mt.create_table(req.clone()).await?; - drop_ids_1.push(DroppedId::new_table( + drop_ids_boundary.push(DroppedId::new_table( *res.db_id, resp.table_id, table_name.table_name.clone(), )); + drop_ids_no_boundary.push(DroppedId::new_table( + *db2_id, + resp.table_id, + table_name.table_name.clone(), + )); mt.drop_table_by_id(DropTableByIdReq { if_exists: false, tenant: req.name_ident.tenant.clone(), - db_id: *db_id, + db_id: *db2_id, table_name: req.name_ident.table_name.clone(), tb_id: resp.table_id, engine: "FUSE".to_string(), @@ -4176,12 +4174,7 @@ impl SchemaApiTestSuite { let mut table_meta = table_meta(created_on); let req = CreateTableReq { create_option: CreateOption::Create, - name_ident: TableNameIdent { - tenant: Tenant::new_or_err(tenant_name, func_name!())?, - db_name: "db2".to_string(), - table_name: "tb2".to_string(), - }, - + name_ident: TableNameIdent::new(&tenant, "db2", "tb2"), table_meta: table_meta.clone(), as_dropped: false, }; @@ -4189,7 +4182,7 @@ impl SchemaApiTestSuite { mt.drop_table_by_id(DropTableByIdReq { if_exists: false, tenant: req.name_ident.tenant.clone(), - db_id: *db_id, + db_id: *db2_id, table_name: req.name_ident.table_name.clone(), tb_id: resp.table_id, engine: "FUSE".to_string(), @@ -4201,22 +4194,25 @@ impl SchemaApiTestSuite { table_meta.drop_on = Some(created_on + Duration::seconds(100)); let data = serialize_struct(&table_meta)?; upsert_test_data(mt.as_kv_api(), &id_key, data).await?; + + drop_ids_no_boundary.push(DroppedId::new_table( + *db2_id, + resp.table_id, + "tb2".to_string(), + )); } info!("--- create db2.tb3"); + let db2_tb3_id; { let req = CreateTableReq { create_option: CreateOption::Create, - name_ident: TableNameIdent { - tenant: Tenant::new_or_err(tenant_name, func_name!())?, - db_name: "db2".to_string(), - table_name: "tb3".to_string(), - }, - + name_ident: TableNameIdent::new(&tenant, "db2", "tb3"), table_meta: table_meta(created_on), as_dropped: false, }; - let _resp = mt.create_table(req.clone()).await?; + let resp = mt.create_table(req.clone()).await?; + db2_tb3_id = resp.table_id; } mt.drop_database(DropDatabaseReq { @@ -4227,9 +4223,11 @@ impl SchemaApiTestSuite { // change db meta to make this db drop time outof filter time let mut drop_db_meta = 
create_db_req.meta.clone(); drop_db_meta.drop_on = Some(created_on + Duration::seconds(100)); - let id_key = db_id; + let id_key = db2_id; let data = serialize_struct(&drop_db_meta)?; upsert_test_data(mt.as_kv_api(), &id_key, data).await?; + + drop_ids_no_boundary.push(DroppedId::new_table(*db2_id, db2_tb3_id, "tb3".to_string())); } // third create a database not dropped, but has a table drop within filter time @@ -4237,10 +4235,7 @@ impl SchemaApiTestSuite { let create_db_req = CreateDatabaseReq { create_option: CreateOption::Create, name_ident: DatabaseNameIdent::new(&tenant, "db3"), - meta: DatabaseMeta { - engine: "".to_string(), - ..DatabaseMeta::default() - }, + meta: Default::default(), }; let res = mt.create_database(create_db_req.clone()).await?; @@ -4250,26 +4245,13 @@ impl SchemaApiTestSuite { { let req = CreateTableReq { create_option: CreateOption::Create, - name_ident: TableNameIdent { - tenant: Tenant::new_or_err(tenant_name, func_name!())?, - db_name: "db3".to_string(), - table_name: "tb1".to_string(), - }, - + name_ident: TableNameIdent::new(&tenant, "db3", "tb1"), table_meta: table_meta(created_on), as_dropped: false, }; let resp = mt.create_table(req.clone()).await?; - drop_ids_1.push(DroppedId::new_table( - *db_id, - resp.table_id, - "tb1".to_string(), - )); - drop_ids_2.push(DroppedId::new_table( - *db_id, - resp.table_id, - "tb1".to_string(), - )); + drop_ids_boundary.push(DroppedId::new_table(*db_id, resp.table_id, "tb1")); + drop_ids_no_boundary.push(DroppedId::new_table(*db_id, resp.table_id, "tb1")); mt.drop_table_by_id(DropTableByIdReq { if_exists: false, tenant: req.name_ident.tenant.clone(), @@ -4287,21 +4269,12 @@ impl SchemaApiTestSuite { let mut table_meta = table_meta(created_on); let req = CreateTableReq { create_option: CreateOption::Create, - name_ident: TableNameIdent { - tenant: Tenant::new_or_err(tenant_name, func_name!())?, - db_name: "db3".to_string(), - table_name: "tb2".to_string(), - }, - + name_ident: TableNameIdent::new(&tenant, "db3", "tb2"), table_meta: table_meta.clone(), as_dropped: false, }; let resp = mt.create_table(req.clone()).await?; - drop_ids_2.push(DroppedId::new_table( - *db_id, - resp.table_id, - "tb2".to_string(), - )); + drop_ids_no_boundary.push(DroppedId::new_table(*db_id, resp.table_id, "tb2")); mt.drop_table_by_id(DropTableByIdReq { if_exists: false, tenant: req.name_ident.tenant.clone(), @@ -4323,12 +4296,7 @@ impl SchemaApiTestSuite { { let req = CreateTableReq { create_option: CreateOption::Create, - name_ident: TableNameIdent { - tenant: Tenant::new_or_err(tenant_name, func_name!())?, - db_name: "db3".to_string(), - table_name: "tb3".to_string(), - }, - + name_ident: TableNameIdent::new(&tenant, "db3", "tb3"), table_meta: table_meta(created_on), as_dropped: false, }; @@ -4348,7 +4316,7 @@ impl SchemaApiTestSuite { .map(|x| x.cmp_key()) .collect::>(); - let want = drop_ids_1 + let want = drop_ids_boundary .iter() .map(|x| x.cmp_key()) .collect::>(); @@ -4360,8 +4328,7 @@ impl SchemaApiTestSuite { "'db2'.'tb1'".to_string(), "'db3'.'tb1'".to_string(), ] - .iter() - .cloned() + .into_iter() .collect(); let actual: BTreeSet = resp .drop_table_infos @@ -4382,7 +4349,7 @@ impl SchemaApiTestSuite { .map(|x| x.cmp_key()) .collect::>(); - let want = drop_ids_2 + let want = drop_ids_no_boundary .iter() .map(|x| x.cmp_key()) .collect::>(); diff --git a/src/meta/app/src/schema/table.rs b/src/meta/app/src/schema/table.rs index 7542b8177ddd..0b4df049bbe5 100644 --- a/src/meta/app/src/schema/table.rs +++ 
b/src/meta/app/src/schema/table.rs @@ -984,15 +984,8 @@ impl ListDroppedTableReq { #[derive(Clone, Debug, PartialEq, Eq)] pub enum DroppedId { - Db { - db_id: u64, - db_name: String, - tables: Vec<(u64, String)>, - }, - Table { - name: DBIdTableName, - id: TableId, - }, + Db { db_id: u64, db_name: String }, + Table { name: DBIdTableName, id: TableId }, } impl DroppedId { diff --git a/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs b/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs index d1b87942fb07..bf6704d5fed4 100644 --- a/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs +++ b/src/query/service/src/interpreters/interpreter_vacuum_drop_tables.rs @@ -172,23 +172,9 @@ impl Interpreter for VacuumDropTablesInterpreter { let mut success_dropped_ids = vec![]; for drop_id in drop_ids { match &drop_id { - DroppedId::Db { - db_id, - db_name, - tables, - } => { + DroppedId::Db { db_id: _, db_name } => { if !failed_dbs.contains(db_name) { success_dropped_ids.push(drop_id); - } else { - for (table_id, table_name) in tables.iter() { - if !failed_tables.contains(table_id) { - success_dropped_ids.push(DroppedId::new_table( - *db_id, - *table_id, - table_name.clone(), - )); - } - } } } DroppedId::Table { name: _, id } => { From dd45c198baebe945b5bfe3fb0b8a09965daff1b2 Mon Sep 17 00:00:00 2001 From: everpcpc Date: Sun, 29 Sep 2024 18:41:41 +0800 Subject: [PATCH 10/14] chore: update repo for workflows (#16546) * z * chore: update repo for workflows * z --- .github/ISSUE_TEMPLATE/20_bug_report.yml | 6 +++--- .github/ISSUE_TEMPLATE/config.yml | 2 +- .github/actions/publish_binary/action.yml | 4 ++-- .github/actions/publish_deb/action.yml | 10 +++++----- .github/actions/setup_bendsql/action.yml | 4 ++-- .github/scripts/notify_release.js | 2 +- .github/workflows/release.yml | 8 ++++---- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/20_bug_report.yml b/.github/ISSUE_TEMPLATE/20_bug_report.yml index e9c1e2ec7db7..9e7a44d69e56 100644 --- a/.github/ISSUE_TEMPLATE/20_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/20_bug_report.yml @@ -8,17 +8,17 @@ body: value: | Thank you very much for submitting feedback to Databend to help Databend develop better. - If it is an idea or help wanted, please go to: [Discussion](https://github.com/datafuselabs/databend/discussions) + If it is an idea or help wanted, please go to: [Discussion](https://github.com/databendlabs/databend/discussions) - type: checkboxes attributes: label: Search before asking description: > - Please make sure to search in the [issues](https://github.com/datafuselabs/databend/issues) first to see + Please make sure to search in the [issues](https://github.com/databendlabs/databend/issues) first to see whether the same issue was reported already. options: - label: > - I had searched in the [issues](https://github.com/datafuselabs/databend/issues) and found no similar + I had searched in the [issues](https://github.com/databendlabs/databend/issues) and found no similar issues. 
required: true diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 497a278cdfd5..c7fae142eb61 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,5 +1,5 @@ blank_issues_enabled: true contact_links: - name: Question - url: https://github.com/datafuselabs/databend/discussions/categories/q-a + url: https://github.com/databendlabs/databend/discussions/categories/q-a about: Please ask and answer questions on the discussions Q&A category. diff --git a/.github/actions/publish_binary/action.yml b/.github/actions/publish_binary/action.yml index 1220c7e7df15..f73fd02f419d 100644 --- a/.github/actions/publish_binary/action.yml +++ b/.github/actions/publish_binary/action.yml @@ -44,7 +44,7 @@ runs: if: inputs.category == 'default' run: | aws s3 cp ${{ steps.name.outputs.name }}.tar.gz s3://repo/databend/${{ inputs.version }}/${{ steps.name.outputs.name }}.tar.gz --no-progress - gh api /repos/datafuselabs/databend/tags > tags.json + gh api /repos/databendlabs/databend/tags > tags.json aws s3 cp ./tags.json s3://repo/databend/tags.json - gh api /repos/datafuselabs/databend/releases > releases.json + gh api /repos/databendlabs/databend/releases > releases.json aws s3 cp ./releases.json s3://repo/databend/releases.json diff --git a/.github/actions/publish_deb/action.yml b/.github/actions/publish_deb/action.yml index 19703fd07720..c64911b0a3e2 100644 --- a/.github/actions/publish_deb/action.yml +++ b/.github/actions/publish_deb/action.yml @@ -32,8 +32,8 @@ runs: version=${{ inputs.version }} deb_version=${version/-/.} deb_version=${deb_version/v/} - wget -q https://github.com/datafuselabs/databend/releases/download/${version}/databend_${deb_version}_amd64.deb - wget -q https://github.com/datafuselabs/databend/releases/download/${version}/databend_${deb_version}_arm64.deb + wget -q https://github.com/databendlabs/databend/releases/download/${version}/databend_${deb_version}_amd64.deb + wget -q https://github.com/databendlabs/databend/releases/download/${version}/databend_${deb_version}_arm64.deb reprepro includedeb stable databend_${deb_version}_amd64.deb reprepro includedeb stable databend_${deb_version}_arm64.deb @@ -41,10 +41,10 @@ runs: shell: bash working-directory: scripts/distribution/deb run: | - version=$(gh release view --repo datafuselabs/bendsql --json name | jq -r '.name') + version=$(gh release view --repo databendlabs/bendsql --json name | jq -r '.name') deb_version=${version/v/} - wget -q https://github.com/datafuselabs/bendsql/releases/download/${version}/bendsql_${deb_version}_amd64.deb - wget -q https://github.com/datafuselabs/bendsql/releases/download/${version}/bendsql_${deb_version}_arm64.deb + wget -q https://github.com/databendlabs/bendsql/releases/download/${version}/bendsql_${deb_version}_amd64.deb + wget -q https://github.com/databendlabs/bendsql/releases/download/${version}/bendsql_${deb_version}_arm64.deb reprepro includedeb stable bendsql_${deb_version}_amd64.deb reprepro includedeb stable bendsql_${deb_version}_arm64.deb diff --git a/.github/actions/setup_bendsql/action.yml b/.github/actions/setup_bendsql/action.yml index 6fc2daca1352..79718f8744d9 100644 --- a/.github/actions/setup_bendsql/action.yml +++ b/.github/actions/setup_bendsql/action.yml @@ -10,7 +10,7 @@ runs: if bendsql --version; then exit 0 fi - curl --retry 5 -Lo /tmp/bendsql.tar.gz https://github.com/datafuselabs/bendsql/releases/download/v0.18.3/bendsql-x86_64-unknown-linux-gnu.tar.gz + curl --retry 5 -Lo /tmp/bendsql.tar.gz 
https://github.com/databendlabs/bendsql/releases/download/v0.18.3/bendsql-x86_64-unknown-linux-gnu.tar.gz tar -xzf /tmp/bendsql.tar.gz -C /tmp mv /tmp/bendsql /usr/local/bin/bendsql bendsql --version @@ -21,7 +21,7 @@ runs: if bendsql --version; then exit 0 fi - curl --retry 5 -Lo /tmp/bendsql.tar.gz https://github.com/datafuselabs/bendsql/releases/download/v0.18.3/bendsql-x86_64-apple-darwin.tar.gz + curl --retry 5 -Lo /tmp/bendsql.tar.gz https://github.com/databendlabs/bendsql/releases/download/v0.18.3/bendsql-x86_64-apple-darwin.tar.gz tar -xzf /tmp/bendsql.tar.gz -C /tmp mv /tmp/bendsql /usr/local/bin/bendsql bendsql --version diff --git a/.github/scripts/notify_release.js b/.github/scripts/notify_release.js index f6646208d4a0..6463c1b986c9 100644 --- a/.github/scripts/notify_release.js +++ b/.github/scripts/notify_release.js @@ -43,7 +43,7 @@ module.exports = async ({ context, core }) => { { tag: "a", text: "Release Notes", - href: `https://github.com/datafuselabs/databend/releases/tag/${VERSION}`, + href: `https://github.com/databendlabs/databend/releases/tag/${VERSION}`, }, ], ], diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7f892ac3dd28..a08a02c1b58c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -71,7 +71,7 @@ jobs: - name: Checkout Docs uses: actions/checkout@v4 with: - repository: datafuselabs/databend-docs + repository: databendlabs/databend-docs ref: main - name: Get date shell: bash @@ -83,10 +83,10 @@ jobs: mkdir -p docs/release-stable df="docs/release-stable/${{ env.DATE }}_${{ needs.create_release.outputs.version }}.md" echo "---" > $df - gh release view --repo datafuselabs/databend ${{ needs.create_release.outputs.version }} >> $df + gh release view --repo databendlabs/databend ${{ needs.create_release.outputs.version }} >> $df sed -i -E 's/^--$/---/g' $df sed -i -E '/^asset:/d' $df - sed -i -E 's_https://github.com/datafuselabs/databend/pull/([0-9]+)_[#\1](https://github.com/datafuselabs/databend/pull/\1)_g' $df + sed -i -E 's_https://github.com/databendlabs/databend/pull/([0-9]+)_[#\1](https://github.com/databendlabs/databend/pull/\1)_g' $df git add docs/release-stable git status - uses: peter-evans/create-pull-request@v4 @@ -649,7 +649,7 @@ jobs: # - name: checkout share endpoint # uses: actions/checkout@v4 # with: - # repository: datafuselabs/share-endpoint + # repository: databendlabs/share-endpoint # token: ${{ secrets.DATABEND_BOT_TOKEN }} # path: share-endpoint # - name: Download artifacts From 0ed932bd060203568ebed40acb4acf11b02cdc67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=82=8E=E6=B3=BC?= Date: Mon, 30 Sep 2024 11:32:41 +0800 Subject: [PATCH 11/14] refactor: `get_history_tables_for_gc()` should not return `TableInfo`, but just table name, id and values (#16545) `SchemaApi` can not provide enough information to build a valid `TableInfo`, such as, it does not know about catalog type and database type. Therefore `get_history_tables_for_gc()` should just return `table name, table id and the table meta` to its caller to let the build a `TableInfo` if needed. 
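
A caller-side sketch of the intended usage (illustrative only: the `TableNIV` accessors and the `TableInfo::new` call follow the hunks below, but the wrapper function itself is hypothetical):

    // Sketch: rebuilding a TableInfo from the (name, id, value) triple
    // returned by get_history_tables_for_gc(). The db_name argument is
    // assumed to be supplied by the caller (e.g. the catalog layer),
    // which knows the database/catalog types that SchemaApi itself does not.
    fn table_info_from_niv(db_name: &str, niv: &TableNIV) -> TableInfo {
        TableInfo::new(
            db_name,
            &niv.name().table_name,
            TableIdent::new(niv.id().table_id, niv.value().seq),
            niv.value().data.clone(),
        )
    }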
--- src/meta/api/src/schema_api_impl.rs | 96 +++++++++++++++------------- src/meta/app/src/schema/mod.rs | 1 + src/meta/app/src/schema/table.rs | 12 ++++ src/meta/app/src/schema/table_niv.rs | 49 ++++++++++++++ 4 files changed, 112 insertions(+), 46 deletions(-) create mode 100644 src/meta/app/src/schema/table_niv.rs diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index 877ffdcb35f2..ffdf39541597 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -67,6 +67,7 @@ use databend_common_meta_app::schema::index_id_ident::IndexIdIdent; use databend_common_meta_app::schema::index_id_to_name_ident::IndexIdToNameIdent; use databend_common_meta_app::schema::index_name_ident::IndexName; use databend_common_meta_app::schema::least_visible_time_ident::LeastVisibleTimeIdent; +use databend_common_meta_app::schema::table_niv::TableNIV; use databend_common_meta_app::schema::CatalogIdToNameIdent; use databend_common_meta_app::schema::CatalogInfo; use databend_common_meta_app::schema::CatalogMeta; @@ -2679,32 +2680,34 @@ impl + ?Sized> SchemaApi for KV { }; let capacity = the_limit - vacuum_table_infos.len(); - let table_infos = - do_get_table_history(self, table_drop_time_range, db_info.clone(), capacity) - .await?; - - for (table_info, db_id) in table_infos.iter() { - vacuum_ids.push(DroppedId::new_table( - *db_id, - table_info.ident.table_id, - table_info.name.clone(), - )); + let table_nivs = get_history_tables_for_gc( + self, + table_drop_time_range, + db_info.database_id.db_id, + capacity, + ) + .await?; + + for table_niv in table_nivs.iter() { + vacuum_ids.push(DroppedId::from(table_niv.clone())); } // A DB can be removed only when all its tables are removed. - if vacuum_db && capacity > table_infos.len() { + if vacuum_db && capacity > table_nivs.len() { vacuum_ids.push(DroppedId::Db { db_id: db_info.database_id.db_id, db_name: db_info.name_ident.database_name().to_string(), }); } - vacuum_table_infos.extend( - table_infos - .iter() - .take(capacity) - .map(|(table_info, _)| table_info.clone()), - ); + vacuum_table_infos.extend(table_nivs.iter().take(capacity).map(|niv| { + Arc::new(TableInfo::new( + db_info.name_ident.database_name(), + &niv.name().table_name, + TableIdent::new(niv.id().table_id, niv.value().seq), + niv.value().data.clone(), + )) + })); } return Ok(ListDroppedTableResp { @@ -2736,18 +2739,26 @@ impl + ?Sized> SchemaApi for KV { name_ident: tenant_dbname.clone(), meta: db_meta, }); - let table_infos = - do_get_table_history(self, drop_time_range.clone(), db_info, the_limit).await?; + let table_nivs = get_history_tables_for_gc( + self, + drop_time_range.clone(), + db_info.database_id.db_id, + the_limit, + ) + .await?; + let mut drop_ids = vec![]; let mut drop_table_infos = vec![]; - for (table_info, db_id) in table_infos.iter().take(the_limit) { - drop_ids.push(DroppedId::new_table( - *db_id, - table_info.ident.table_id, - table_info.name.clone(), - )); - drop_table_infos.push(table_info.clone()); + for niv in table_nivs.iter() { + drop_ids.push(DroppedId::from(niv.clone())); + + drop_table_infos.push(Arc::new(TableInfo::new( + db_info.name_ident.database_name(), + &niv.name().table_name, + TableIdent::new(niv.id().table_id, niv.value().seq), + niv.value().data.clone(), + ))); } Ok(ListDroppedTableResp { @@ -3520,21 +3531,22 @@ fn build_upsert_table_deduplicated_label(deduplicated_label: String) -> TxnOp { ) } +/// Lists all dropped and non-dropped tables belonging to a Database, +/// returns those tables 
that are eligible for garbage collection, +/// i.e., whose dropped time is in the specified range. #[logcall::logcall(input = "")] #[fastrace::trace] -async fn do_get_table_history( +async fn get_history_tables_for_gc( kv_api: &(impl kvapi::KVApi + ?Sized), drop_time_range: Range>>, - db_info: Arc, + db_id: u64, limit: usize, -) -> Result, u64)>, KVAppError> { - let db_id = db_info.database_id.db_id; - - let dbid_tbname_idlist = TableIdHistoryIdent { +) -> Result, KVAppError> { + let ident = TableIdHistoryIdent { database_id: db_id, table_name: "dummy".to_string(), }; - let dir_name = DirName::new(dbid_tbname_idlist); + let dir_name = DirName::new(ident); let table_history_kvs = kv_api.list_pb_vec(&dir_name).await?; let mut args = vec![]; @@ -3565,19 +3577,11 @@ async fn do_get_table_history( continue; } - let tb_info = TableInfo { - ident: TableIdent { - table_id: table_id.table_id, - seq: seq_meta.seq, - }, - desc: format!("'{}'.'{}'", db_info.name_ident.database_name(), table_name,), - name: (*table_name).clone(), - meta: seq_meta.data, - db_type: DatabaseType::NormalDB, - catalog_info: Default::default(), - }; - - filter_tb_infos.push((Arc::new(tb_info), db_info.database_id.db_id)); + filter_tb_infos.push(TableNIV::new( + DBIdTableName::new(db_id, table_name.clone()), + table_id.clone(), + seq_meta, + )); } } diff --git a/src/meta/app/src/schema/mod.rs b/src/meta/app/src/schema/mod.rs index e7e12d9e7bce..81a6f5d72f09 100644 --- a/src/meta/app/src/schema/mod.rs +++ b/src/meta/app/src/schema/mod.rs @@ -28,6 +28,7 @@ pub mod index_id_to_name_ident; pub mod index_name_ident; pub mod least_visible_time_ident; pub mod table_lock_ident; +pub mod table_niv; pub mod virtual_column_ident; mod create_option; diff --git a/src/meta/app/src/schema/table.rs b/src/meta/app/src/schema/table.rs index 0b4df049bbe5..a7a6e48fc8d1 100644 --- a/src/meta/app/src/schema/table.rs +++ b/src/meta/app/src/schema/table.rs @@ -39,6 +39,7 @@ use super::CatalogInfo; use super::CreateOption; use super::DatabaseId; use crate::schema::database_name_ident::DatabaseNameIdent; +use crate::schema::table_niv::TableNIV; use crate::storage::StorageParams; use crate::tenant::Tenant; use crate::tenant::ToTenant; @@ -988,7 +989,18 @@ pub enum DroppedId { Table { name: DBIdTableName, id: TableId }, } +impl From for DroppedId { + fn from(value: TableNIV) -> Self { + let (name, id, _) = value.unpack(); + Self::Table { name, id } + } +} + impl DroppedId { + pub fn new_table_name_id(name: DBIdTableName, id: TableId) -> DroppedId { + DroppedId::Table { name, id } + } + pub fn new_table(db_id: u64, table_id: u64, table_name: impl ToString) -> DroppedId { DroppedId::Table { name: DBIdTableName::new(db_id, table_name), diff --git a/src/meta/app/src/schema/table_niv.rs b/src/meta/app/src/schema/table_niv.rs new file mode 100644 index 000000000000..744d869ce165 --- /dev/null +++ b/src/meta/app/src/schema/table_niv.rs @@ -0,0 +1,49 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use databend_common_meta_types::SeqV; + +use crate::schema::DBIdTableName; +use crate::schema::TableId; +use crate::schema::TableMeta; + +/// The **Name, ID, Value** for a table metadata stored in meta-service. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TableNIV { + name: DBIdTableName, + id: TableId, + value: SeqV, +} + +impl TableNIV { + pub fn new(name: DBIdTableName, id: TableId, value: SeqV) -> Self { + TableNIV { name, id, value } + } + + pub fn name(&self) -> &DBIdTableName { + &self.name + } + + pub fn id(&self) -> &TableId { + &self.id + } + + pub fn value(&self) -> &SeqV { + &self.value + } + + pub fn unpack(self) -> (DBIdTableName, TableId, SeqV) { + (self.name, self.id, self.value) + } +} From abd8266714d7ffc49b7d930d342f459a1bc0d2dd Mon Sep 17 00:00:00 2001 From: baishen Date: Mon, 30 Sep 2024 12:04:34 +0800 Subject: [PATCH 12/14] refactor(storage): improve inverted index match phrase query (#16547) * refactor(storage): improve inverted index match phrase query * fix typos * fix machete * fix machete --- Cargo.lock | 23 +- Cargo.toml | 6 +- .../arrow/src/arrow/array/list/mutable.rs | 4 +- src/common/arrow/src/arrow/offset.rs | 22 +- .../tests/it/inverted_index/index_refresh.rs | 4 +- .../transform_add_internal_columns.rs | 4 +- .../flight_sql/flight_sql_service/session.rs | 2 +- .../http/v1/session/refresh_handler.rs | 10 +- src/query/sql/src/planner/planner_cache.rs | 2 +- src/query/storages/common/index/Cargo.toml | 3 +- .../common/index/src/inverted_index.rs | 1076 ++++++++++------- src/query/storages/common/index/src/lib.rs | 5 +- .../inverted_index/inverted_index_loader.rs | 3 +- .../inverted_index/inverted_index_reader.rs | 235 ++-- .../src/io/write/inverted_index_writer.rs | 178 --- .../fuse/src/pruning/inverted_index_pruner.rs | 10 +- src/query/storages/stream/src/stream_table.rs | 9 +- 17 files changed, 825 insertions(+), 771 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 81df0e0cbfb4..f8cdc9ca06e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5396,7 +5396,6 @@ version = "0.1.0" dependencies = [ "anyerror", "cbordata", - "crc32fast", "criterion", "databend-common-arrow", "databend-common-ast", @@ -5411,8 +5410,8 @@ dependencies = [ "match-template", "parquet", "rand 0.8.5", + "roaring", "serde", - "serde_json", "tantivy", "tantivy-common", "tantivy-fst", @@ -11086,7 +11085,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "stable_deref_trait", ] @@ -14770,7 +14769,7 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" version = "0.22.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "aho-corasick", "arc-swap", @@ -14820,7 +14819,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "bitpacking 0.9.2", ] @@ -14828,7 +14827,7 
@@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "downcast-rs", "fastdivide", @@ -14843,7 +14842,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "async-trait", "byteorder", @@ -14866,7 +14865,7 @@ dependencies = [ [[package]] name = "tantivy-jieba" version = "0.11.0" -source = "git+https://github.com/datafuse-extras/tantivy-jieba?rev=124a8fc#124a8fc8c8a9f1389af5a9bfa497fb358ecc556e" +source = "git+https://github.com/datafuse-extras/tantivy-jieba?rev=0e300e9#0e300e9085651b7e6659dfcc7b0ea0fa9cab09c2" dependencies = [ "jieba-rs", "lazy_static", @@ -14876,7 +14875,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "nom", ] @@ -14884,7 +14883,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "tantivy-bitpacker", "tantivy-common", @@ -14895,7 +14894,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "murmurhash32", "rand_distr", @@ -14905,7 +14904,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=7502370#7502370b68e6822a687ee071660e350b67808533" dependencies = [ "serde", ] diff --git a/Cargo.toml b/Cargo.toml index d723c441a9d6..2e207de5095e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -408,7 +408,7 @@ openai_api_rust = { git = "https://github.com/datafuse-extras/openai-api", rev = orc-rust = { git = "https://github.com/datafuse-extras/datafusion-orc", rev = "03372b97" } recursive = { git = "https://github.com/datafuse-extras/recursive.git", rev = "6af35a1" } sled = { git = "https://github.com/datafuse-extras/sled", tag = "v0.34.7-datafuse.1" } -tantivy = { git = "https://github.com/datafuse-extras/tantivy", rev = "37aeac0" } -tantivy-common = { git = "https://github.com/datafuse-extras/tantivy", rev = "37aeac0", package = "tantivy-common" } -tantivy-jieba = { git = "https://github.com/datafuse-extras/tantivy-jieba", rev = "124a8fc" } +tantivy = { git = "https://github.com/datafuse-extras/tantivy", rev = "7502370" } +tantivy-common = { git = "https://github.com/datafuse-extras/tantivy", rev = "7502370", package = "tantivy-common" } +tantivy-jieba = { git = 
"https://github.com/datafuse-extras/tantivy-jieba", rev = "0e300e9" } xorfilter-rs = { git = "https://github.com/datafuse-extras/xorfilter", tag = "databend-alpha.4" } diff --git a/src/common/arrow/src/arrow/array/list/mutable.rs b/src/common/arrow/src/arrow/array/list/mutable.rs index 1a63727a45ad..4834b84c4bff 100644 --- a/src/common/arrow/src/arrow/array/list/mutable.rs +++ b/src/common/arrow/src/arrow/array/list/mutable.rs @@ -182,7 +182,9 @@ impl MutableListArray { pub fn try_push_valid(&mut self) -> Result<()> { let total_length = self.values.len(); let offset = self.offsets.last().to_usize(); - let length = total_length.checked_sub(offset).ok_or(Error::Overflow)?; + let length = total_length + .checked_sub(offset) + .ok_or_else(|| Error::Overflow)?; self.offsets.try_push_usize(length)?; if let Some(validity) = &mut self.validity { diff --git a/src/common/arrow/src/arrow/offset.rs b/src/common/arrow/src/arrow/offset.rs index 7d56d883ed8c..dd24cd0bce21 100644 --- a/src/common/arrow/src/arrow/offset.rs +++ b/src/common/arrow/src/arrow/offset.rs @@ -127,7 +127,9 @@ impl Offsets { pub fn try_push(&mut self, length: O) -> Result<(), Error> { let old_length = self.last(); assert!(length >= O::zero()); - let new_length = old_length.checked_add(&length).ok_or(Error::Overflow)?; + let new_length = old_length + .checked_add(&length) + .ok_or_else(|| Error::Overflow)?; self.0.push(new_length); Ok(()) } @@ -140,10 +142,12 @@ impl Offsets { /// * checks that this length does not overflow #[inline] pub fn try_push_usize(&mut self, length: usize) -> Result<(), Error> { - let length = O::from_usize(length).ok_or(Error::Overflow)?; + let length = O::from_usize(length).ok_or_else(|| Error::Overflow)?; let old_length = self.last(); - let new_length = old_length.checked_add(&length).ok_or(Error::Overflow)?; + let new_length = old_length + .checked_add(&length) + .ok_or_else(|| Error::Overflow)?; self.0.push(new_length); Ok(()) } @@ -267,8 +271,8 @@ impl Offsets { let last_offset = original_offset .checked_add(total_length) - .ok_or(Error::Overflow)?; - O::from_usize(last_offset).ok_or(Error::Overflow)?; + .ok_or_else(|| Error::Overflow)?; + O::from_usize(last_offset).ok_or_else(|| Error::Overflow)?; Ok(()) } @@ -279,7 +283,9 @@ impl Offsets { let mut length = *self.last(); let other_length = *other.last(); // check if the operation would overflow - length.checked_add(&other_length).ok_or(Error::Overflow)?; + length + .checked_add(&other_length) + .ok_or_else(|| Error::Overflow)?; let lengths = other.as_slice().windows(2).map(|w| w[1] - w[0]); let offsets = lengths.map(|new_length| { @@ -306,7 +312,9 @@ impl Offsets { let other_length = other.last().expect("Length to be non-zero"); let mut length = *self.last(); // check if the operation would overflow - length.checked_add(other_length).ok_or(Error::Overflow)?; + length + .checked_add(other_length) + .ok_or_else(|| Error::Overflow)?; let lengths = other.windows(2).map(|w| w[1] - w[0]); let offsets = lengths.map(|new_length| { diff --git a/src/query/ee/tests/it/inverted_index/index_refresh.rs b/src/query/ee/tests/it/inverted_index/index_refresh.rs index 611e1ff11181..9d0d45879293 100644 --- a/src/query/ee/tests/it/inverted_index/index_refresh.rs +++ b/src/query/ee/tests/it/inverted_index/index_refresh.rs @@ -143,7 +143,6 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> { &index_version, ); - let field_nums = query_fields.len(); let has_score = true; let need_position = false; let mut field_ids = HashSet::new(); @@ -177,7 +176,6 @@ async fn 
test_fuse_do_refresh_inverted_index() -> Result<()> { let matched_rows = index_reader .clone() .do_filter( - field_nums, need_position, has_score, query.box_clone(), @@ -185,7 +183,7 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> { &index_record, &fuzziness, tokenizer_manager, - block_meta.row_count as u32, + block_meta.row_count, &index_loc, ) .await?; diff --git a/src/query/service/src/pipelines/processors/transforms/transform_add_internal_columns.rs b/src/query/service/src/pipelines/processors/transforms/transform_add_internal_columns.rs index f62db09eee1e..7b35277c25e2 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_add_internal_columns.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_add_internal_columns.rs @@ -40,8 +40,8 @@ impl Transform for TransformAddInternalColumns { fn transform(&mut self, mut block: DataBlock) -> Result { if let Some(meta) = block.take_meta() { - let internal_column_meta = - InternalColumnMeta::downcast_from(meta).ok_or(ErrorCode::Internal("It's a bug"))?; + let internal_column_meta = InternalColumnMeta::downcast_from(meta) + .ok_or_else(|| ErrorCode::Internal("It's a bug"))?; let num_rows = block.num_rows(); for internal_column in self.internal_columns.values() { let column = diff --git a/src/query/service/src/servers/flight_sql/flight_sql_service/session.rs b/src/query/service/src/servers/flight_sql/flight_sql_service/session.rs index 82214bc67e4f..1cc7a6de4731 100644 --- a/src/query/service/src/servers/flight_sql/flight_sql_service/session.rs +++ b/src/query/service/src/servers/flight_sql/flight_sql_service/session.rs @@ -64,7 +64,7 @@ impl FlightSqlServiceImpl { pub(super) fn get_user_password(metadata: &MetadataMap) -> Result<(String, String), String> { let basic = "Basic "; let authorization = Self::get_header_value(metadata, "authorization") - .ok_or("authorization not parsable".to_string())?; + .ok_or_else(|| "authorization not parsable".to_string())?; if !authorization.starts_with(basic) { return Err(format!("Auth type not implemented: {authorization}")); diff --git a/src/query/service/src/servers/http/v1/session/refresh_handler.rs b/src/query/service/src/servers/http/v1/session/refresh_handler.rs index 84b0bac34462..830330b7c140 100644 --- a/src/query/service/src/servers/http/v1/session/refresh_handler.rs +++ b/src/query/service/src/servers/http/v1/session/refresh_handler.rs @@ -48,11 +48,11 @@ pub async fn refresh_handler( let mgr = ClientSessionManager::instance(); match &ctx.credential { Credential::Jwt { .. 
} => { - let session_id = - req.session_id - .ok_or(HttpErrorCode::bad_request(ErrorCode::BadArguments( - "JWT session should provide session_id when refresh session", - )))?; + let session_id = req.session_id.ok_or_else(|| { + HttpErrorCode::bad_request(ErrorCode::BadArguments( + "JWT session should provide session_id when refresh session", + )) + })?; mgr.refresh_in_memory_states(&session_id); let tenant = ctx.session.get_current_tenant(); diff --git a/src/query/sql/src/planner/planner_cache.rs b/src/query/sql/src/planner/planner_cache.rs index 8da310f3bb55..2a8f0e51b8aa 100644 --- a/src/query/sql/src/planner/planner_cache.rs +++ b/src/query/sql/src/planner/planner_cache.rs @@ -165,7 +165,7 @@ impl TableRefVisitor { let func_name = func.name.name.to_lowercase(); // If the function is not suitable for caching, we should not cache the plan - if !is_cacheable_function(&func_name) || func_name == "score" { + if !is_cacheable_function(&func_name) { self.cache_miss = true; } } diff --git a/src/query/storages/common/index/Cargo.toml b/src/query/storages/common/index/Cargo.toml index 1d1372be7302..2ee4d664f7df 100644 --- a/src/query/storages/common/index/Cargo.toml +++ b/src/query/storages/common/index/Cargo.toml @@ -16,7 +16,6 @@ ignored = ["xorfilter-rs", "match-template"] [dependencies] anyerror = { workspace = true } cbordata = { version = "0.6.0" } -crc32fast = "1.3.2" databend-common-arrow = { workspace = true } databend-common-ast = { workspace = true } databend-common-exception = { workspace = true } @@ -29,8 +28,8 @@ levenshtein_automata = "0.2.1" log = { workspace = true } match-template = { workspace = true } parquet = { workspace = true } +roaring = "0.10.1" serde = { workspace = true } -serde_json = { workspace = true } tantivy = { workspace = true } tantivy-common = { workspace = true } tantivy-fst = "0.5" diff --git a/src/query/storages/common/index/src/inverted_index.rs b/src/query/storages/common/index/src/inverted_index.rs index 6d8db4c5b424..09580866f2de 100644 --- a/src/query/storages/common/index/src/inverted_index.rs +++ b/src/query/storages/common/index/src/inverted_index.rs @@ -36,19 +36,20 @@ use std::collections::BTreeMap; use std::collections::HashMap; use std::collections::HashSet; -use std::collections::VecDeque; use std::io; use std::io::BufWriter; use std::io::Cursor; use std::io::Read; use std::io::Write; use std::marker::PhantomData; +use std::ops::BitAndAssign; +use std::ops::BitOrAssign; +use std::ops::SubAssign; use std::path::Path; use std::path::PathBuf; use std::result; use std::sync::Arc; -use crc32fast::Hasher; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::Scalar; @@ -61,6 +62,7 @@ use levenshtein_automata::Distance; use levenshtein_automata::LevenshteinAutomatonBuilder; use levenshtein_automata::DFA; use log::warn; +use roaring::RoaringTreemap; use tantivy::directory::error::DeleteError; use tantivy::directory::error::OpenReadError; use tantivy::directory::error::OpenWriteError; @@ -73,14 +75,21 @@ use tantivy::directory::WatchCallback; use tantivy::directory::WatchHandle; use tantivy::directory::WritePtr; use tantivy::positions::PositionReader; +use tantivy::postings::BlockSegmentPostings; use tantivy::postings::TermInfo; +use tantivy::query::AllQuery; use tantivy::query::BooleanQuery; +use tantivy::query::BoostQuery; +use tantivy::query::ConstScoreQuery; +use tantivy::query::EmptyQuery; use tantivy::query::FuzzyTermQuery; use tantivy::query::Occur; +use 
tantivy::query::PhrasePrefixQuery; use tantivy::query::PhraseQuery; use tantivy::query::Query; use tantivy::query::QueryClone; use tantivy::query::TermQuery; +use tantivy::schema::Field; use tantivy::Directory; use tantivy::Term; use tantivy_common::BinarySerializable; @@ -88,60 +97,9 @@ use tantivy_common::HasLen; use tantivy_common::VInt; use tantivy_fst::Automaton; use tantivy_fst::IntoStreamer; +use tantivy_fst::Regex; use tantivy_fst::Streamer; -// tantivy version is used to generate the footer data - -// Index major version. -const INDEX_MAJOR_VERSION: u32 = 0; -// Index minor version. -const INDEX_MINOR_VERSION: u32 = 22; -// Index patch version. -const INDEX_PATCH_VERSION: u32 = 0; -// Index format version. -const INDEX_FORMAT_VERSION: u32 = 6; - -// The magic byte of the footer to identify corruption -// or an old version of the footer. -const FOOTER_MAGIC_NUMBER: u32 = 1337; - -type CrcHashU32 = u32; - -/// Structure version for the index. -#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -pub struct Version { - major: u32, - minor: u32, - patch: u32, - index_format_version: u32, -} - -/// A Footer is appended every part of data, like tantivy file. -#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)] -struct Footer { - version: Version, - crc: CrcHashU32, -} - -impl Footer { - fn new(crc: CrcHashU32) -> Self { - let version = Version { - major: INDEX_MAJOR_VERSION, - minor: INDEX_MINOR_VERSION, - patch: INDEX_PATCH_VERSION, - index_format_version: INDEX_FORMAT_VERSION, - }; - Footer { version, crc } - } - - fn append_footer(&self, write: &mut W) -> Result<()> { - let footer_payload_len = write.write(serde_json::to_string(&self)?.as_ref())?; - BinarySerializable::serialize(&(footer_payload_len as u32), write)?; - BinarySerializable::serialize(&FOOTER_MAGIC_NUMBER, write)?; - Ok(()) - } -} - fn extract_footer(data: FileSlice) -> Result<(Vec, Vec)> { // The following code is copied from tantivy `CompositeFile::open` function. // extract field number and offsets of each fields. @@ -240,59 +198,12 @@ pub fn extract_component_fields( Ok(()) } -// Build footer for tantivy files. -// Footer is used to check whether the data is valid when open a file. -pub fn build_tantivy_footer(bytes: &[u8]) -> Result> { - let mut hasher = Hasher::new(); - hasher.update(bytes); - let crc = hasher.finalize(); - - let footer = Footer::new(crc); - let mut buf = Vec::new(); - footer.append_footer(&mut buf)?; - Ok(buf) -} - -#[derive( - Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, serde::Serialize, serde::Deserialize, -)] -struct Field(u32); - -impl Field { - /// Create a new field object for the given FieldId. - const fn from_field_id(field_id: u32) -> Field { - Field(field_id) - } - - /// Returns a u32 identifying uniquely a field within a schema. 
- #[allow(dead_code)] - const fn field_id(self) -> u32 { - self.0 - } -} - -impl BinarySerializable for Field { - fn serialize(&self, writer: &mut W) -> io::Result<()> { - self.0.serialize(writer) - } - - fn deserialize(reader: &mut R) -> io::Result { - u32::deserialize(reader).map(Field) - } -} - #[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)] struct FileAddr { field: Field, idx: usize, } -impl FileAddr { - fn new(field: Field, idx: usize) -> FileAddr { - FileAddr { field, idx } - } -} - impl BinarySerializable for FileAddr { fn serialize(&self, writer: &mut W) -> io::Result<()> { self.field.serialize(writer)?; @@ -307,36 +218,6 @@ impl BinarySerializable for FileAddr { } } -// Build empty position data to be used when there are no phrase terms in the query. -// This can reduce data reading and speed up the query -fn build_empty_position_data(field_nums: usize) -> Result { - let offsets: Vec<_> = (0..field_nums) - .map(|i| { - let field = Field::from_field_id(i as u32); - let file_addr = FileAddr::new(field, 0); - (file_addr, 0) - }) - .collect(); - - let mut buf = Vec::new(); - VInt(offsets.len() as u64).serialize(&mut buf)?; - - let mut prev_offset = 0; - for (file_addr, offset) in offsets { - VInt(offset - prev_offset).serialize(&mut buf)?; - file_addr.serialize(&mut buf)?; - prev_offset = offset; - } - - let footer_len = buf.len() as u32; - footer_len.serialize(&mut buf)?; - - let mut footer = build_tantivy_footer(&buf)?; - buf.append(&mut footer); - - Ok(OwnedBytes::new(buf)) -} - struct DfaWrapper(pub DFA); impl Automaton for DfaWrapper { @@ -362,354 +243,704 @@ impl Automaton for DfaWrapper { } } -// Term value contains values associated with a Term -// used to match query and collect matched doc ids. +// Collect matched `doc_ids` for a Query. #[derive(Clone)] -pub struct TermValue { - // term info - pub term_info: TermInfo, - // term matched doc ids - pub doc_ids: Vec, - // term frequencies for each doc - pub term_freqs: Vec, - // position reader is used to read positions in doc for phrase query - pub position_reader: Option, +pub struct DocIdsCollector { + row_count: u64, + need_position: bool, + // key is `term`, value is `term_id`, + // These terms are in `fst`, means that those terms exist in the block, + // we need to use related term infos to determine the matched `doc_ids`. + // Use `term_id` as key to get related information and avoid copy `term`. + term_map: HashMap, + // key is `term_id`, value is `field_id` and `term_info`. + term_infos: HashMap, + // key is `term_id`, value is related `BlockSegmentPostings`, + // used to read `doc_ids` and `term_freqs`. + block_postings_map: HashMap, + // key is `term_id`, value is related `PositionReader`, + // used to read `positions` in each docs. + position_reader_map: HashMap, + // key is `term_id`, value is related `doc_ids`. + // `doc_ids` is lazy loaded when used. + doc_ids: HashMap, + // key is `term_id`, value is related `term_freqs`. + // `term_freqs` is lazy loaded when used. + term_freqs: HashMap>, } -// Check if fst contains terms in query. -// If not, we can skip read other parts of inverted index. 
-pub fn check_term_fsts_match( - query: Box, - fst_maps: &HashMap>, - fuzziness: &Option, - matched_terms: &mut HashMap, - fuzziness_terms: &mut HashMap>, -) -> bool { - if let Some(term_query) = query.downcast_ref::() { - let term = term_query.term(); - let field = term.field(); - let field_id = field.field_id() as usize; - if let Some(fst_map) = fst_maps.get(&field_id) { - if let Some(idx) = fst_map.get(term.serialized_value_bytes()) { - matched_terms.insert(term.clone(), idx); - return true; - } +impl DocIdsCollector { + pub fn create( + row_count: u64, + need_position: bool, + terms: HashMap, + term_infos: HashMap, + block_postings_map: HashMap, + position_reader_map: HashMap, + ) -> Self { + let term_len = terms.len(); + let term_map = terms + .into_iter() + .map(|(term, (_, term_id))| (term, term_id)) + .collect::>(); + + Self { + row_count, + need_position, + term_map, + term_infos, + block_postings_map, + position_reader_map, + doc_ids: HashMap::with_capacity(term_len), + term_freqs: HashMap::with_capacity(term_len), } - false - } else if let Some(bool_query) = query.downcast_ref::() { - let mut matched_num = 0; - for (occur, sub_query) in bool_query.clauses() { - let matched = check_term_fsts_match( - sub_query.box_clone(), - fst_maps, - fuzziness, - matched_terms, - fuzziness_terms, - ); - if matched { - matched_num += 1; - } - match occur { - Occur::Should => {} - Occur::Must => { - if !matched { - return false; + } + + fn check_term_match( + fst_map: &tantivy_fst::Map, + field_id: u32, + term: &Term, + matched_terms: &mut HashMap, + ) -> bool { + if let Some(term_id) = fst_map.get(term.serialized_value_bytes()) { + matched_terms.insert(term.clone(), (field_id, term_id)); + true + } else { + false + } + } + + fn get_fst_map( + field_id: u32, + fst_maps: &HashMap>, + ) -> Result<&tantivy_fst::Map> { + let fst_map = fst_maps.get(&field_id).ok_or_else(|| { + ErrorCode::TantivyError(format!( + "inverted index fst field `{}` does not exist", + field_id + )) + })?; + + Ok(fst_map) + } + + // Check if fst contains terms in query. + // If not, we can skip read other parts of inverted index. + pub fn check_term_fsts_match( + query: Box, + fst_maps: &HashMap>, + fuzziness: &Option, + matched_terms: &mut HashMap, + prefix_terms: &mut HashMap>, + fuzziness_terms: &mut HashMap>, + ) -> Result { + if let Some(term_query) = query.downcast_ref::() { + let term = term_query.term(); + let field = term.field(); + let field_id = field.field_id(); + let fst_map = Self::get_fst_map(field_id, fst_maps)?; + let matched = Self::check_term_match(fst_map, field_id, term, matched_terms); + Ok(matched) + } else if let Some(bool_query) = query.downcast_ref::() { + let mut matched_any = false; + for (occur, sub_query) in bool_query.clauses() { + let matched = Self::check_term_fsts_match( + sub_query.box_clone(), + fst_maps, + fuzziness, + matched_terms, + prefix_terms, + fuzziness_terms, + )?; + match occur { + Occur::Should => { + if matched { + matched_any = true; + } + } + Occur::Must => { + if matched { + matched_any = true; + } else { + return Ok(false); + } + } + Occur::MustNot => { + // Matched means that the block contains the term, + // but we still need to filter out the `doc_ids`. + matched_any = true; } } - Occur::MustNot => {} } - } - matched_num > 0 - } else if let Some(phrase_query) = query.downcast_ref::() { - // PhraseQuery must match all terms. 
- let field = phrase_query.field(); - let field_id = field.field_id() as usize; - if let Some(fst_map) = fst_maps.get(&field_id) { + Ok(matched_any) + } else if let Some(phrase_query) = query.downcast_ref::() { + // PhraseQuery must match all terms. + let field = phrase_query.field(); + let field_id = field.field_id(); + let fst_map = Self::get_fst_map(field_id, fst_maps)?; + let mut matched_all = true; for term in phrase_query.phrase_terms() { - let matched = if let Some(idx) = fst_map.get(term.serialized_value_bytes()) { - matched_terms.insert(term.clone(), idx); - true - } else { - false - }; + let matched = Self::check_term_match(fst_map, field_id, &term, matched_terms); if !matched { matched_all = false; break; } } - matched_all - } else { - false - } - } else if let Some(fuzzy_term_query) = query.downcast_ref::() { - // FuzzyTermQuery match terms by levenshtein distance. - let fuzziness = fuzziness.unwrap(); - - let term = fuzzy_term_query.term(); - let field = term.field(); - let field_id = field.field_id() as usize; - if let Some(fst_map) = fst_maps.get(&field_id) { - // build levenshtein automaton + Ok(matched_all) + } else if let Some(phrase_prefix_query) = query.downcast_ref::() { + // PhrasePrefixQuery must match all terms. + let field = phrase_prefix_query.field(); + let field_id = field.field_id(); + let fst_map = Self::get_fst_map(field_id, fst_maps)?; + + let mut matched_all = true; + for term in phrase_prefix_query.phrase_terms() { + let matched = Self::check_term_match(fst_map, field_id, &term, matched_terms); + if !matched { + matched_all = false; + break; + } + } + if !matched_all { + return Ok(false); + } + + // using regex to check prefix term, get related term ids. + let (_, prefix_term) = phrase_prefix_query.prefix_term_with_offset(); + let term_str = String::from_utf8_lossy(prefix_term.serialized_value_bytes()); + let key = format!("{}.*", term_str); + let re = Regex::new(&key).map_err(|_| { + ErrorCode::TantivyError(format!("inverted index create regex `{}` failed", key)) + })?; + + let mut prefix_term_ids = vec![]; + let mut stream = fst_map.search(&re).into_stream(); + while let Some((key, term_id)) = stream.next() { + let key_str = unsafe { std::str::from_utf8_unchecked(key) }; + let prefix_term = Term::from_field_text(field, key_str); + matched_terms.insert(prefix_term.clone(), (field_id, term_id)); + prefix_term_ids.push(term_id); + } + let matched = !prefix_term_ids.is_empty(); + if matched { + prefix_terms.insert(prefix_term.clone(), prefix_term_ids); + } + Ok(matched) + } else if let Some(fuzzy_term_query) = query.downcast_ref::() { + // FuzzyTermQuery match terms by levenshtein distance. + let fuzziness = fuzziness.unwrap(); + let term = fuzzy_term_query.term(); + let field = term.field(); + let field_id = field.field_id(); + let fst_map = Self::get_fst_map(field_id, fst_maps)?; + + // build levenshtein automaton to check fuzziness term, get related term ids. 
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(fuzziness, true); let term_str = String::from_utf8_lossy(term.serialized_value_bytes()); let automaton = DfaWrapper(lev_automaton_builder.build_dfa(&term_str)); - let mut fuzz_term_values = vec![]; + let mut fuzz_term_ids = vec![]; let mut stream = fst_map.search(automaton).into_stream(); - while let Some((key, idx)) = stream.next() { + while let Some((key, term_id)) = stream.next() { let key_str = unsafe { std::str::from_utf8_unchecked(key) }; let fuzz_term = Term::from_field_text(field, key_str); - matched_terms.insert(fuzz_term.clone(), idx); - fuzz_term_values.push(fuzz_term); + matched_terms.insert(fuzz_term.clone(), (field_id, term_id)); + fuzz_term_ids.push(term_id); + } + let matched = !fuzz_term_ids.is_empty(); + if matched { + fuzziness_terms.insert(term.clone(), fuzz_term_ids); } - let matched = !fuzz_term_values.is_empty(); - fuzziness_terms.insert(term.clone(), fuzz_term_values); - matched + Ok(matched) + } else if let Some(boost_query) = query.downcast_ref::() { + Self::check_term_fsts_match( + boost_query.query(), + fst_maps, + fuzziness, + matched_terms, + prefix_terms, + fuzziness_terms, + ) + } else if let Some(const_query) = query.downcast_ref::() { + Self::check_term_fsts_match( + const_query.query(), + fst_maps, + fuzziness, + matched_terms, + prefix_terms, + fuzziness_terms, + ) + } else if let Some(_empty_query) = query.downcast_ref::() { + Ok(false) + } else if let Some(_all_query) = query.downcast_ref::() { + Ok(true) } else { - false + Err(ErrorCode::TantivyError(format!( + "inverted index unsupported query `{:?}`", + query + ))) } - } else { - // TODO: handle other Query types - let mut matched = false; - query.query_terms(&mut |term, _| { - let field = term.field(); - let field_id = field.field_id() as usize; - if let Some(fst_map) = fst_maps.get(&field_id) { - if let Some(idx) = fst_map.get(term.serialized_value_bytes()) { - matched_terms.insert(term.clone(), idx); - matched = true; - } - } - }); - - matched } -} -// collect matched rows by term value -pub fn collect_matched_rows( - query: Box, - row_count: u32, - fuzziness_terms: &HashMap>, - term_values: &mut HashMap, -) -> Vec { - if let Some(term_query) = query.downcast_ref::() { - let term = term_query.term(); - if let Some(term_value) = term_values.get(term) { - term_value.doc_ids.clone() - } else { - vec![] - } - } else if let Some(bool_query) = query.downcast_ref::() { - let mut should_doc_ids_opt = None; - let mut must_doc_ids_opt = None; - let mut must_not_doc_ids_opt = None; - for (occur, sub_query) in bool_query.clauses() { - let doc_ids = collect_matched_rows( - sub_query.box_clone(), - row_count, - fuzziness_terms, - term_values, - ); - let doc_id_set = HashSet::from_iter(doc_ids.into_iter()); - match occur { - Occur::Should => { - if should_doc_ids_opt.is_none() { - should_doc_ids_opt = Some(doc_id_set); - } else { - let should_doc_ids = should_doc_ids_opt.unwrap(); - should_doc_ids_opt = - Some(should_doc_ids.union(&doc_id_set).copied().collect()) - } - } - Occur::Must => { - if must_doc_ids_opt.is_none() { - must_doc_ids_opt = Some(doc_id_set); - } else { - let must_doc_ids = must_doc_ids_opt.unwrap(); - must_doc_ids_opt = - Some(must_doc_ids.intersection(&doc_id_set).copied().collect()) - } + // get `doc_ids` of a `term_id`, + fn get_doc_ids(&mut self, term_id: u64) -> Result<&RoaringTreemap> { + if let std::collections::hash_map::Entry::Vacant(doc_ids_entry) = + self.doc_ids.entry(term_id) + { + // `doc_ids` are lazy loaded when 
used. + let block_postings = self.block_postings_map.get_mut(&term_id).ok_or_else(|| { + ErrorCode::TantivyError(format!( + "inverted index block postings `{}` does not exist", + term_id + )) + })?; + + let term_freqs_len = if self.need_position { + let (_, term_info) = self.term_infos.get(&term_id).ok_or_else(|| { + ErrorCode::TantivyError(format!( + "inverted index term info `{}` does not exist", + term_id + )) + })?; + term_info.doc_freq as usize + } else { + 0 + }; + let mut doc_ids = RoaringTreemap::new(); + let mut term_freqs = Vec::with_capacity(term_freqs_len); + // `doc_ids` are stored in multiple blocks and need to be decode sequentially. + // TODO: We can skip some blocks by checking related `doc_ids`. + loop { + let block_doc_ids = block_postings.docs(); + if block_doc_ids.is_empty() { + break; } - Occur::MustNot => { - if must_not_doc_ids_opt.is_none() { - must_not_doc_ids_opt = Some(doc_id_set); - } else { - let must_not_doc_ids = must_not_doc_ids_opt.unwrap(); - must_not_doc_ids_opt = - Some(must_not_doc_ids.union(&doc_id_set).copied().collect()) - } + doc_ids + .append(block_doc_ids.iter().map(|id| *id as u64)) + .unwrap(); + + // `term_freqs` is only used if the query need position. + if self.need_position { + let block_term_freqs = block_postings.freqs(); + term_freqs.extend_from_slice(block_term_freqs); } + block_postings.advance(); } + doc_ids_entry.insert(doc_ids); + self.term_freqs.insert(term_id, term_freqs); } - let doc_ids = if let Some(mut should_doc_ids) = should_doc_ids_opt { - if let Some(must_doc_ids) = must_doc_ids_opt { - should_doc_ids = should_doc_ids - .intersection(&must_doc_ids) - .copied() - .collect() + let doc_ids = self.doc_ids.get(&term_id).unwrap(); + Ok(doc_ids) + } + + // get the position `offsets` and `term_freqs` of a `term_id` in each `docs`, + // which is used to read `positions`. + fn get_position_offsets( + &mut self, + term_id: u64, + all_doc_ids: &RoaringTreemap, + ) -> Result> { + let doc_ids = self.doc_ids.get(&term_id).unwrap(); + let term_freqs = self.term_freqs.get(&term_id).unwrap(); + + let mut doc_offset = 0; + let mut offset_and_term_freqs = HashMap::with_capacity(all_doc_ids.len() as usize); + for (doc_id, term_freq) in doc_ids.iter().zip(term_freqs.iter()) { + if all_doc_ids.len() as usize == offset_and_term_freqs.len() { + break; } - if let Some(must_not_doc_ids) = must_not_doc_ids_opt { - should_doc_ids = should_doc_ids - .difference(&must_not_doc_ids) - .copied() - .collect() + if !all_doc_ids.contains(doc_id) { + doc_offset += *term_freq as u64; + continue; } - should_doc_ids - } else if let Some(mut must_doc_ids) = must_doc_ids_opt { - if let Some(must_not_doc_ids) = must_not_doc_ids_opt { - must_doc_ids = must_doc_ids - .difference(&must_not_doc_ids) - .copied() - .collect() + + offset_and_term_freqs.insert(doc_id, (doc_offset, *term_freq)); + doc_offset += *term_freq as u64; + } + + Ok(offset_and_term_freqs) + } + + // get `positions` of a `term_id` in a `doc`. 
+    fn get_positions(
+        &mut self,
+        term_id: u64,
+        doc_offset: u64,
+        term_freq: u32,
+    ) -> Result<RoaringTreemap> {
+        let position_reader = self.position_reader_map.get_mut(&term_id).ok_or_else(|| {
+            ErrorCode::TantivyError(format!(
+                "inverted index position reader `{}` does not exist",
+                term_id
+            ))
+        })?;
+
+        let mut positions = vec![0; term_freq as usize];
+        position_reader.read(doc_offset, &mut positions[..]);
+        for i in 1..positions.len() {
+            positions[i] += positions[i - 1];
+        }
+        let term_poses =
+            RoaringTreemap::from_sorted_iter(positions.into_iter().map(|i| i as u64)).unwrap();
+        Ok(term_poses)
+    }
+
+    // The phrase query matches `doc_ids` as follows:
+    //
+    // 1. Collect the position for each term in the query.
+    // 2. Collect the `doc_ids` of each term and take
+    //    the intersection to get the candidate `doc_ids`.
+    // 3. Iterate over the candidate `doc_ids` to check whether
+    //    the positions of the terms match the positions of the terms in the query.
+    // 4. Each position in the first term is a possible query phrase beginning.
+    //    Verify that the beginning is valid by checking whether corresponding
+    //    positions in the other terms exist. If not, delete the possible position
+    //    in the first term. After traversing all terms, determine if there are
+    //    any positions left in the first term. If there are, then the `doc_id`
+    //    is matched.
+    //
+    // If the query is a prefix phrase query, also check if any prefix terms
+    // match the positions.
+    pub fn collect_phrase_matched_rows(
+        &mut self,
+        phrase_terms: Vec<(usize, Term)>,
+        prefix_term: Option<(usize, &Vec<u64>)>,
+    ) -> Result<Option<RoaringTreemap>> {
+        let mut query_term_poses = Vec::with_capacity(phrase_terms.len());
+        for (term_pos, term) in &phrase_terms {
+            // A term that does not exist means this phrase is not matched.
+            let Some(term_id) = self.term_map.get(term) else {
+                return Ok(None);
+            };
+            query_term_poses.push((*term_pos, *term_id));
+        }
+        if query_term_poses.is_empty() {
+            return Ok(None);
+        }
+
+        let first_term_pos = &query_term_poses[0].0;
+        let first_term_id = &query_term_poses[0].1;
+
+        let mut term_ids = HashSet::with_capacity(phrase_terms.len() + 1);
+        term_ids.insert(*first_term_id);
+
+        let first_doc_ids = self.get_doc_ids(*first_term_id)?;
+        let mut candidate_doc_ids = RoaringTreemap::new();
+        candidate_doc_ids.bitor_assign(first_doc_ids);
+
+        // Collect the `doc_ids` of the other terms in the query and take the
+        // intersection to obtain the candidate `doc_ids` containing all terms.
+ let mut query_term_offsets = Vec::with_capacity(query_term_poses.len() - 1); + for (term_pos, term_id) in query_term_poses.iter().skip(1) { + if !term_ids.contains(term_id) { + let doc_ids = self.get_doc_ids(*term_id)?; + + candidate_doc_ids.bitand_assign(doc_ids); + if candidate_doc_ids.is_empty() { + break; + } + term_ids.insert(*term_id); } - must_doc_ids - } else if let Some(must_not_doc_ids) = must_not_doc_ids_opt { - let all_doc_ids = HashSet::from_iter(0..row_count); - let doc_ids = all_doc_ids.difference(&must_not_doc_ids).copied().collect(); - doc_ids - } else { - HashSet::new() - }; - - let mut doc_ids = Vec::from_iter(doc_ids); - doc_ids.sort(); - doc_ids - } else if let Some(phrase_query) = query.downcast_ref::() { - let mut union_doc_ids = HashSet::new(); - let mut intersection_doc_ids_opt = None; - - for term in phrase_query.phrase_terms() { - if let Some(term_value) = term_values.get(&term) { - let doc_id_set = HashSet::from_iter(term_value.doc_ids.clone()); - union_doc_ids = union_doc_ids.union(&doc_id_set).copied().collect(); - if intersection_doc_ids_opt.is_none() { - intersection_doc_ids_opt = Some(doc_id_set); - } else { - let intersection_doc_ids = intersection_doc_ids_opt.unwrap(); - intersection_doc_ids_opt = Some( - intersection_doc_ids - .intersection(&doc_id_set) - .copied() - .collect(), - ); + let term_pos_offset = (term_pos - first_term_pos) as u64; + query_term_offsets.push((*term_id, term_pos_offset)); + } + // If the candidate `doc_ids` is empty, all docs are not matched. + if candidate_doc_ids.is_empty() { + return Ok(None); + } + + // If the query is a prefix phrase query, + // also need to collect `doc_ids` of prefix terms. + if let Some((_, prefix_term_ids)) = prefix_term { + let mut all_prefix_doc_ids = RoaringTreemap::new(); + for prefix_term_id in prefix_term_ids { + let prefix_doc_ids = self.get_doc_ids(*prefix_term_id)?; + // If the `doc_ids` does not intersect at all, this prefix can be ignored. + if candidate_doc_ids.is_disjoint(prefix_doc_ids) { + continue; } + + all_prefix_doc_ids.bitor_assign(prefix_doc_ids); + term_ids.insert(*prefix_term_id); } + // If there is no matched prefix `doc_ids`, the prefix term does not matched. + if all_prefix_doc_ids.is_empty() { + return Ok(None); + } + + // Get the intersection of phrase `doc_ids` and prefix `doc_ids` + candidate_doc_ids.bitand_assign(all_prefix_doc_ids); } - let intersection_doc_ids = intersection_doc_ids_opt.unwrap_or_default(); - if intersection_doc_ids.is_empty() { - return vec![]; + // Collect the position `offset` and `term_freqs` for each terms, + // which can be used to read positions. + let mut offset_and_term_freqs_map = HashMap::new(); + for term_id in term_ids.into_iter() { + let offset_and_term_freqs = self.get_position_offsets(term_id, &candidate_doc_ids)?; + offset_and_term_freqs_map.insert(term_id, offset_and_term_freqs); } - let mut union_doc_ids = Vec::from_iter(union_doc_ids); - union_doc_ids.sort(); - // check each docs - let mut matched_doc_ids = vec![]; - for doc_id in union_doc_ids { - if !intersection_doc_ids.contains(&doc_id) { - continue; + let first_offset_and_term_freqs = offset_and_term_freqs_map.get(first_term_id).unwrap(); + + // Check every candidate `doc_ids` if the position of each terms match the query. 
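+        // For example, for the phrase "quick brown" (the offset of "brown" is 1):
+        // if a doc has "quick" at positions {3, 9} and "brown" at {4, 20}, the
+        // expected "brown" positions are {4, 10}; 10 is missing, so beginning 9
+        // is removed and only beginning 3 survives, hence the doc is matched.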
+ let mut all_doc_ids = RoaringTreemap::new(); + let mut offset_poses = RoaringTreemap::new(); + let mut term_poses_map = HashMap::new(); + for (doc_id, (first_doc_offset, first_term_freq)) in first_offset_and_term_freqs.iter() { + let mut first_term_poses = + self.get_positions(*first_term_id, *first_doc_offset, *first_term_freq)?; + + term_poses_map.clear(); + term_poses_map.insert(first_term_id, first_term_poses.clone()); + + for (term_id, term_pos_offset) in &query_term_offsets { + if !term_poses_map.contains_key(term_id) { + let offset_and_term_freqs = offset_and_term_freqs_map.get(term_id).unwrap(); + let (doc_offset, term_freq) = offset_and_term_freqs.get(doc_id).unwrap(); + + let term_poses = self.get_positions(*term_id, *doc_offset, *term_freq)?; + term_poses_map.insert(term_id, term_poses); + } + let term_poses = term_poses_map.get(term_id).unwrap(); + + // Using the position of the first term and the offset of this term with the first term, + // calculate all possible positions for this term. + offset_poses.clear(); + offset_poses + .append(first_term_poses.iter().map(|pos| pos + term_pos_offset)) + .unwrap(); + + // Term possible positions subtract term actual positions, + // remaining positions are not matched and need to be removed in the first term. + offset_poses.sub_assign(term_poses); + for offset_pos in &offset_poses { + first_term_poses.remove(offset_pos - term_pos_offset); + } + if first_term_poses.is_empty() { + break; + } } - let mut term_pos_map = HashMap::new(); - for term in phrase_query.phrase_terms() { - let mut offset = 0; - let mut term_freq = 0; - if let Some(term_value) = term_values.get_mut(&term) { - for i in 0..term_value.doc_ids.len() { - if term_value.doc_ids[i] < doc_id { - offset += term_value.term_freqs[i] as u64; - } else { - term_freq = term_value.term_freqs[i] as usize; - break; + // If the query is a prefix phrase query, + // also need to check if any prefix term match. + if let Some((prefix_term_pos, prefix_term_ids)) = prefix_term { + if !first_term_poses.is_empty() { + let mut prefix_matched = false; + let prefix_term_pos_offset = (prefix_term_pos - first_term_pos) as u64; + for prefix_term_id in prefix_term_ids { + if !term_poses_map.contains_key(prefix_term_id) { + if let Some(offset_and_term_freqs) = + offset_and_term_freqs_map.get(prefix_term_id) + { + if let Some((doc_offset, term_freq)) = + offset_and_term_freqs.get(doc_id) + { + let term_poses = self.get_positions( + *prefix_term_id, + *doc_offset, + *term_freq, + )?; + term_poses_map.insert(prefix_term_id, term_poses); + } + } } - } - // collect positions in the docs - if let Some(position_reader) = term_value.position_reader.as_mut() { - let mut pos_output = vec![0; term_freq]; - position_reader.read(offset, &mut pos_output[..]); - for i in 1..pos_output.len() { - pos_output[i] += pos_output[i - 1]; + if let Some(term_poses) = term_poses_map.get(prefix_term_id) { + offset_poses.clear(); + offset_poses + .append( + first_term_poses + .iter() + .map(|pos| pos + prefix_term_pos_offset), + ) + .unwrap(); + + offset_poses.bitand_assign(term_poses); + // If any of the possible prefix term positions exist, + // the prefix phrase query is matched. 
+ if !offset_poses.is_empty() { + prefix_matched = true; + break; + } } - let positions = VecDeque::from_iter(pos_output); - term_pos_map.insert(term.clone(), positions); + } + if prefix_matched { + all_doc_ids.insert(*doc_id); } } + } else { + let matched = !first_term_poses.is_empty(); + if matched { + all_doc_ids.insert(*doc_id); + } } + } + + if !all_doc_ids.is_empty() { + Ok(Some(all_doc_ids)) + } else { + Ok(None) + } + } - let mut is_first = true; - let mut distance = 0; - let mut matched = true; - let mut last_position = 0; - for (query_position, term) in phrase_query.phrase_terms_with_offsets() { - if let Some(positions) = term_pos_map.get_mut(&term) { - let mut find_position = false; - while let Some(doc_position) = positions.pop_front() { - // skip previous positions. - if doc_position < last_position { + // collect matched rows by term value + pub fn collect_matched_rows( + &mut self, + query: Box, + prefix_terms: &HashMap>, + fuzziness_terms: &HashMap>, + ) -> Result> { + if let Some(term_query) = query.downcast_ref::() { + let term = term_query.term(); + if let Some(term_id) = self.term_map.get(term) { + let doc_ids = self.get_doc_ids(*term_id)?; + Ok(Some(doc_ids.clone())) + } else { + Ok(None) + } + } else if let Some(bool_query) = query.downcast_ref::() { + let mut should_doc_ids: Option = None; + let mut must_doc_ids: Option = None; + let mut must_not_doc_ids: Option = None; + for (occur, sub_query) in bool_query.clauses() { + let doc_ids = self.collect_matched_rows( + sub_query.box_clone(), + prefix_terms, + fuzziness_terms, + )?; + if doc_ids.is_none() { + match occur { + Occur::Should => { continue; } - last_position = doc_position; - let doc_distance = doc_position - (query_position as u32); - if is_first { - is_first = false; - distance = doc_distance; - } else { - // distance must same as first term. 
- if doc_distance != distance { - matched = false; + Occur::Must => { + if must_doc_ids.is_none() { + must_doc_ids = Some(RoaringTreemap::new()); + } + continue; + } + Occur::MustNot => { + if must_not_doc_ids.is_none() { + must_not_doc_ids = Some(RoaringTreemap::new()); } + continue; } - find_position = true; - break; } - if !find_position { - matched = false; + } + let doc_ids = doc_ids.unwrap(); + match occur { + Occur::Should => { + if let Some(ref mut should_doc_ids) = should_doc_ids { + should_doc_ids.bitor_assign(&doc_ids); + } else { + should_doc_ids = Some(doc_ids); + } + } + Occur::Must => { + if let Some(ref mut must_doc_ids) = must_doc_ids { + must_doc_ids.bitand_assign(&doc_ids); + } else { + must_doc_ids = Some(doc_ids); + } + } + Occur::MustNot => { + if let Some(ref mut must_not_doc_ids) = must_not_doc_ids { + must_not_doc_ids.bitor_assign(&doc_ids); + } else { + must_not_doc_ids = Some(doc_ids); + } } - } else { - matched = false; } - if !matched { - break; + } + + let all_doc_ids = match (should_doc_ids, must_doc_ids, must_not_doc_ids) { + // only should + (Some(should_doc_ids), None, None) => should_doc_ids, + // only must + (None, Some(must_doc_ids), None) => must_doc_ids, + // only must not + (None, None, Some(must_not_doc_ids)) => { + let mut all_doc_ids = RoaringTreemap::from_iter(0..self.row_count); + all_doc_ids.sub_assign(must_not_doc_ids); + all_doc_ids + } + // should and must + (Some(mut should_doc_ids), Some(must_doc_ids), None) => { + should_doc_ids.bitor_assign(must_doc_ids); + should_doc_ids + } + // should and must not + (Some(mut should_doc_ids), None, Some(must_not_doc_ids)) => { + should_doc_ids.sub_assign(must_not_doc_ids); + should_doc_ids + } + // must and must not + (None, Some(mut must_doc_ids), Some(must_not_doc_ids)) => { + must_doc_ids.sub_assign(must_not_doc_ids); + must_doc_ids + } + // should, must and must not + (Some(mut should_doc_ids), Some(must_doc_ids), Some(must_not_doc_ids)) => { + should_doc_ids.bitor_assign(must_doc_ids); + should_doc_ids.sub_assign(must_not_doc_ids); + should_doc_ids + } + (None, None, None) => { + return Ok(None); } + }; + + if !all_doc_ids.is_empty() { + Ok(Some(all_doc_ids)) + } else { + Ok(None) } - if matched { - matched_doc_ids.push(doc_id); + } else if let Some(phrase_query) = query.downcast_ref::() { + let phrase_terms = phrase_query.phrase_terms_with_offsets(); + self.collect_phrase_matched_rows(phrase_terms, None) + } else if let Some(phrase_prefix_query) = query.downcast_ref::() { + let phrase_terms = phrase_prefix_query.phrase_terms_with_offsets(); + let (prefix_term_pos, prefix_term) = phrase_prefix_query.prefix_term_with_offset(); + + let Some(prefix_term_ids) = prefix_terms.get(&prefix_term) else { + return Ok(None); + }; + let prefix_term = Some((prefix_term_pos, prefix_term_ids)); + + self.collect_phrase_matched_rows(phrase_terms, prefix_term) + } else if let Some(fuzzy_term_query) = query.downcast_ref::() { + let mut all_doc_ids = RoaringTreemap::new(); + let term = fuzzy_term_query.term(); + + let Some(fuzz_term_ids) = fuzziness_terms.get(term) else { + return Ok(None); + }; + // collect related terms of the original term. + for term_id in fuzz_term_ids { + let doc_ids = self.get_doc_ids(*term_id)?; + all_doc_ids.bitor_assign(doc_ids); } - } - matched_doc_ids - } else if let Some(fuzzy_term_query) = query.downcast_ref::() { - let mut fuzz_doc_ids = HashSet::new(); - let term = fuzzy_term_query.term(); - - // collect related terms of the original term. 
- if let Some(related_terms) = fuzziness_terms.get(term) { - for term in related_terms { - if let Some(term_value) = term_values.get(term) { - let doc_id_set: HashSet = HashSet::from_iter(term_value.doc_ids.clone()); - fuzz_doc_ids = fuzz_doc_ids.union(&doc_id_set).copied().collect(); - } + if !all_doc_ids.is_empty() { + Ok(Some(all_doc_ids)) + } else { + Ok(None) } - let mut doc_ids = Vec::from_iter(fuzz_doc_ids); - doc_ids.sort(); - doc_ids + } else if let Some(boost_query) = query.downcast_ref::() { + self.collect_matched_rows(boost_query.query(), prefix_terms, fuzziness_terms) + } else if let Some(const_query) = query.downcast_ref::() { + self.collect_matched_rows(const_query.query(), prefix_terms, fuzziness_terms) + } else if let Some(_empty_query) = query.downcast_ref::() { + Ok(None) + } else if let Some(_all_query) = query.downcast_ref::() { + let all_doc_ids = RoaringTreemap::from_iter(0..self.row_count); + Ok(Some(all_doc_ids)) } else { - vec![] + Err(ErrorCode::TantivyError(format!( + "inverted index unsupported query `{:?}`", + query + ))) } - } else { - let mut union_doc_ids = HashSet::new(); - query.query_terms(&mut |term, _| { - if let Some(term_value) = term_values.get(term) { - let doc_id_set: HashSet = HashSet::from_iter(term_value.doc_ids.clone()); - union_doc_ids = union_doc_ids.union(&doc_id_set).copied().collect(); - } - }); - - let mut doc_ids = Vec::from_iter(union_doc_ids); - doc_ids.sort(); - doc_ids } } @@ -798,7 +1029,7 @@ pub struct InvertedIndexDirectory { } impl InvertedIndexDirectory { - pub fn try_create(field_nums: usize, files: Vec>) -> Result { + pub fn try_create(files: Vec>) -> Result { let mut file_map = BTreeMap::::new(); for file in files.into_iter() { @@ -812,12 +1043,7 @@ impl InvertedIndexDirectory { let fast_data = file_map.remove("fast").unwrap(); let store_data = file_map.remove("store").unwrap(); let fieldnorm_data = file_map.remove("fieldnorm").unwrap(); - // If there are no phrase terms in the query, - // we can use empty position data instead. 
- let pos_data = match file_map.remove("pos") { - Some(pos_data) => pos_data, - None => build_empty_position_data(field_nums)?, - }; + let pos_data = file_map.remove("pos").unwrap(); let idx_data = file_map.remove("idx").unwrap(); let term_data = file_map.remove("term").unwrap(); let meta_data = file_map.remove("meta.json").unwrap(); diff --git a/src/query/storages/common/index/src/lib.rs b/src/query/storages/common/index/src/lib.rs index f60a1b16982e..48a30df64cae 100644 --- a/src/query/storages/common/index/src/lib.rs +++ b/src/query/storages/common/index/src/lib.rs @@ -26,15 +26,12 @@ pub use bloom_index::BloomIndex; pub use bloom_index::BloomIndexMeta; pub use bloom_index::FilterEvalResult; pub use index::Index; -pub use inverted_index::build_tantivy_footer; -pub use inverted_index::check_term_fsts_match; -pub use inverted_index::collect_matched_rows; pub use inverted_index::extract_component_fields; pub use inverted_index::extract_fsts; +pub use inverted_index::DocIdsCollector; pub use inverted_index::InvertedIndexDirectory; pub use inverted_index::InvertedIndexFile; pub use inverted_index::InvertedIndexMeta; -pub use inverted_index::TermValue; pub use page_index::PageIndex; pub use range_index::statistics_to_domain; pub use range_index::RangeIndex; diff --git a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs index 411dc316b98b..658a8bc38c95 100644 --- a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs +++ b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs @@ -131,7 +131,6 @@ pub(crate) async fn load_inverted_index_file<'a>( #[fastrace::trace] pub(crate) async fn load_inverted_index_directory<'a>( dal: Operator, - field_nums: usize, index_path: &'a str, ) -> Result { // load inverted index meta, contains the offsets of each files. 
@@ -154,7 +153,7 @@ pub(crate) async fn load_inverted_index_directory<'a>( let files: Vec<_> = try_join_all(futs).await?.into_iter().collect(); // use those files to create inverted index directory - let directory = InvertedIndexDirectory::try_create(field_nums, files)?; + let directory = InvertedIndexDirectory::try_create(files)?; Ok(directory) } diff --git a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs index e3baee6c0c0f..6441a0fccf97 100644 --- a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs +++ b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs @@ -17,12 +17,10 @@ use std::collections::HashSet; use std::sync::Arc; use std::time::Instant; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::F32; use databend_common_metrics::storage::metrics_inc_block_inverted_index_search_milliseconds; -use databend_storages_common_index::check_term_fsts_match; -use databend_storages_common_index::collect_matched_rows; -use databend_storages_common_index::TermValue; use databend_storages_common_table_meta::meta::SingleColumnMeta; use futures_util::future::try_join_all; use opendal::Operator; @@ -40,6 +38,7 @@ use tantivy::tokenizer::TokenizerManager; use tantivy::Index; use tantivy_fst::raw::Fst; +use crate::index::DocIdsCollector; use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_directory; use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_file; use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_meta; @@ -61,15 +60,14 @@ impl InvertedIndexReader { #[allow(clippy::too_many_arguments)] pub async fn do_filter( self, - field_nums: usize, need_position: bool, has_score: bool, query: Box, - field_ids: &HashSet, + field_ids: &HashSet, index_record: &IndexRecordOption, fuzziness: &Option, tokenizer_manager: TokenizerManager, - row_count: u32, + row_count: u64, index_loc: &str, ) -> Result)>>> { let start = Instant::now(); @@ -79,7 +77,6 @@ impl InvertedIndexReader { index_loc, query, field_ids, - field_nums, need_position, has_score, index_record, @@ -101,15 +98,19 @@ impl InvertedIndexReader { &self, index_path: &'a str, name: &str, - field_ids: &HashSet, + field_ids: &HashSet, inverted_index_meta_map: &HashMap, - ) -> Result> { - let mut col_metas = vec![]; - let mut col_field_map = HashMap::new(); + ) -> Result> { + let mut col_metas = Vec::with_capacity(field_ids.len()); + let mut col_field_map = HashMap::with_capacity(field_ids.len()); for field_id in field_ids { let col_name = format!("{}-{}", name, field_id); - let col_meta = inverted_index_meta_map.get(&col_name).unwrap(); - + let col_meta = inverted_index_meta_map.get(&col_name).ok_or_else(|| { + ErrorCode::TantivyError(format!( + "inverted index column `{}` does not exist", + col_name + )) + })?; col_metas.push((col_name.clone(), col_meta)); col_field_map.insert(col_name, *field_id); } @@ -131,18 +132,16 @@ impl InvertedIndexReader { Ok(col_files) } - // first version search function, using tantivy searcher. - async fn search_v0<'a>( + // legacy query search function, using tantivy searcher. 
+ async fn legacy_search<'a>( &self, index_path: &'a str, query: Box, - field_nums: usize, has_score: bool, tokenizer_manager: TokenizerManager, - row_count: u32, + row_count: u64, ) -> Result)>>> { - let directory = - load_inverted_index_directory(self.dal.clone(), field_nums, index_path).await?; + let directory = load_inverted_index_directory(self.dal.clone(), index_path).await?; let mut index = Index::open(directory)?; index.set_tokenizers(tokenizer_manager); @@ -179,34 +178,48 @@ impl InvertedIndexReader { } // Follow the process below to perform the query search: - // 1. Read the `fst` first, check if the term in the query matches, - // return if it doesn't matched. - // 2. Read the `term dict` to get the `postings_range` in `idx` - // and the `positions_range` in `pos` for each terms. - // 3. Read the `doc_ids` and `term_freqs` in `idx` for each terms - // using `postings_range`. - // 4. If it's a phrase query, read the `position` of each terms in - // `pos` using `positions_range`. - // 5. Collect matched doc ids using term-related information. + // + // 1. Read the `fst` first, check if the term in the query matches. + // If it matches, collect the terms that need to be checked + // for subsequent processing, otherwise, return directly + // and ignore the block. + // 2. Read the `term_info` for each terms from the `term_dict`, + // which contains three parts: + // `doc_freq` is the number of docs containing the term. + // `postings_range` is used to read posting list in the postings (`.idx`) file. + // `positions_range` is used to read positions in the positions (`.pos`) file. + // 3. Read `postings` data from postings (`.idx`) file by `postings_range` + // of each terms, and `positions` data from positions (`.pos`) file + // by `positions_range` of each terms. + // 4. Open `BlockSegmentPostings` using `postings` data for each terms, + // which can be used to read `doc_ids` and `term_freqs`. + // 5. If the query is a phrase query, Open `PositionReader` using + // `positions` data for each terms, which can be use to read + // term `positions` in each docs. + // 6. Collect matched `doc_ids` of the query. + // If the query is a term query, the `doc_ids` can read from `BlockSegmentPostings`. + // If the query is a phrase query, in addition to `doc_ids`, also need to + // use `PositionReader` to read the positions for each terms and check whether + // the position of terms in doc is the same as the position of terms in query. // // If the term does not match, only the `fst` file needs to be read. - // If the term matches, only the `idx` and `pos` data of the related terms - // need to be read instead of all the `idx` and `pos` data. + // If the term matches, the `term_dict` and `postings`, `positions` + // data of the related terms need to be read instead of all + // the `postings` and `positions` data. #[allow(clippy::too_many_arguments)] async fn search<'a>( &self, index_path: &'a str, query: Box, - field_ids: &HashSet, - field_nums: usize, + field_ids: &HashSet, need_position: bool, has_score: bool, index_record: &IndexRecordOption, fuzziness: &Option, tokenizer_manager: TokenizerManager, - row_count: u32, + row_count: u64, ) -> Result)>>> { - // 1. read index meta + // 1. read index meta. let inverted_index_meta = load_inverted_index_meta(self.dal.clone(), index_path).await?; let inverted_index_meta_map = inverted_index_meta @@ -220,18 +233,11 @@ impl InvertedIndexReader { // use compatible search function to read. 
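+        // The legacy format merged the complete tantivy directory (including the
+        // `meta.json` file) into a single index file, so the presence of a
+        // `meta.json` entry identifies it; the per-column format has no such entry.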
if inverted_index_meta_map.contains_key("meta.json") { return self - .search_v0( - index_path, - query, - field_nums, - has_score, - tokenizer_manager, - row_count, - ) + .legacy_search(index_path, query, has_score, tokenizer_manager, row_count) .await; } - // 2. read fst files + // 2. read fst files. let fst_files = self .read_column_data(index_path, "fst", field_ids, &inverted_index_meta_map) .await?; @@ -250,15 +256,18 @@ impl InvertedIndexReader { // 3. check whether query is matched in the fsts. let mut matched_terms = HashMap::new(); + let mut prefix_terms = HashMap::new(); let mut fuzziness_terms = HashMap::new(); - let matched = check_term_fsts_match( + let matched = DocIdsCollector::check_term_fsts_match( query.box_clone(), &fst_maps, fuzziness, &mut matched_terms, + &mut prefix_terms, &mut fuzziness_terms, - ); + )?; + // if not matched, return without further check if !matched { return Ok(None); } @@ -268,71 +277,66 @@ impl InvertedIndexReader { .read_column_data(index_path, "term", field_ids, &inverted_index_meta_map) .await?; - let mut term_dict_maps = HashMap::new(); + let mut term_infos = HashMap::with_capacity(matched_terms.len()); for (field_id, term_dict_data) in term_dict_files.into_iter() { let term_dict_file = FileSlice::new(Arc::new(term_dict_data)); let term_info_store = TermInfoStore::open(term_dict_file)?; - term_dict_maps.insert(field_id, term_info_store); - } - let mut term_values = HashMap::new(); - for (term, term_ord) in matched_terms.iter() { - let field = term.field(); - let field_id = field.field_id() as usize; - - let term_dict = term_dict_maps.get(&field_id).unwrap(); - let term_info = term_dict.get(*term_ord); - - let term_value = TermValue { - term_info, - doc_ids: vec![], - term_freqs: vec![], - position_reader: None, - }; - term_values.insert(term.clone(), term_value); + for (_, (term_field_id, term_id)) in matched_terms.iter() { + if field_id == *term_field_id { + let term_info = term_info_store.get(*term_id); + term_infos.insert(*term_id, (field_id, term_info)); + } + } } // 5. read postings and optional positions. - // collect doc ids, term frequencies and optional position readers. 
- let mut slice_metas = Vec::with_capacity(term_values.len()); - let mut name_map = HashMap::new(); - for (term, term_value) in term_values.iter() { - let field = term.field(); - let field_id = field.field_id() as usize; - + let term_slice_len = if need_position { + term_infos.len() * 2 + } else { + term_infos.len() + }; + let mut slice_metas = Vec::with_capacity(term_slice_len); + let mut slice_name_map = HashMap::with_capacity(term_slice_len); + for (term_id, (field_id, term_info)) in term_infos.iter() { let idx_name = format!("idx-{}", field_id); - let idx_meta = inverted_index_meta_map.get(&idx_name).unwrap(); - + let idx_meta = inverted_index_meta_map.get(&idx_name).ok_or_else(|| { + ErrorCode::TantivyError(format!( + "inverted index column `{}` does not exist", + idx_name + )) + })?; // ignore 8 bytes total_num_tokens_slice - let offset = idx_meta.offset + 8 + (term_value.term_info.postings_range.start as u64); - let len = term_value.term_info.postings_range.len() as u64; + let offset = idx_meta.offset + 8 + (term_info.postings_range.start as u64); + let len = term_info.postings_range.len() as u64; let idx_slice_meta = SingleColumnMeta { offset, len, num_values: 1, }; - let idx_slice_name = - format!("{}-{}", idx_name, term_value.term_info.postings_range.start); + let idx_slice_name = format!("{}-{}", idx_name, term_info.postings_range.start); slice_metas.push((idx_slice_name.clone(), idx_slice_meta)); - name_map.insert(idx_slice_name, term.clone()); + slice_name_map.insert(idx_slice_name, *term_id); if need_position { let pos_name = format!("pos-{}", field_id); - let pos_meta = inverted_index_meta_map.get(&pos_name).unwrap(); - let offset = pos_meta.offset + (term_value.term_info.positions_range.start as u64); - let len = term_value.term_info.positions_range.len() as u64; + let pos_meta = inverted_index_meta_map.get(&pos_name).ok_or_else(|| { + ErrorCode::TantivyError(format!( + "inverted index column `{}` does not exist", + pos_name + )) + })?; + let offset = pos_meta.offset + (term_info.positions_range.start as u64); + let len = term_info.positions_range.len() as u64; let pos_slice_meta = SingleColumnMeta { offset, len, num_values: 1, }; - let pos_slice_name = format!( - "{}-{}", - pos_name, term_value.term_info.positions_range.start - ); + let pos_slice_name = format!("{}-{}", pos_name, term_info.positions_range.start); slice_metas.push((pos_slice_name.clone(), pos_slice_meta)); - name_map.insert(pos_slice_name, term.clone()); + slice_name_map.insert(pos_slice_name, *term_id); } } @@ -347,53 +351,58 @@ impl InvertedIndexReader { .map(|f| (f.name.clone(), f.data.clone())) .collect::>(); + let mut block_postings_map = HashMap::with_capacity(term_infos.len()); + let mut position_reader_map = HashMap::with_capacity(term_infos.len()); for (slice_name, slice_data) in slice_files.into_iter() { - let term = name_map.get(&slice_name).unwrap(); - let term_value = term_values.get_mut(term).unwrap(); + let term_id = slice_name_map.remove(&slice_name).unwrap(); + let (_, term_info) = term_infos.get(&term_id).unwrap(); if slice_name.starts_with("idx") { let posting_file = FileSlice::new(Arc::new(slice_data)); - let postings = BlockSegmentPostings::open( - term_value.term_info.doc_freq, + let block_postings = BlockSegmentPostings::open( + term_info.doc_freq, posting_file, *index_record, *index_record, )?; - let doc_ids = postings.docs(); - let term_freqs = postings.freqs(); - term_value.doc_ids = doc_ids.to_vec(); - term_value.term_freqs = term_freqs.to_vec(); + 
block_postings_map.insert(term_id, block_postings); } else if slice_name.starts_with("pos") { let position_reader = PositionReader::open(slice_data)?; - term_value.position_reader = Some(position_reader); + position_reader_map.insert(term_id, position_reader); } } - // 6. collect matched rows by term values. - let matched_docs = collect_matched_rows( - query.box_clone(), + // 6. collect matched doc ids. + let mut collector = DocIdsCollector::create( row_count, - &fuzziness_terms, - &mut term_values, + need_position, + matched_terms, + term_infos, + block_postings_map, + position_reader_map, ); - - if !matched_docs.is_empty() { - let mut matched_rows = Vec::with_capacity(matched_docs.len()); - if has_score { - // TODO: add score - for doc_id in matched_docs { - matched_rows.push((doc_id as usize, Some(F32::from(1.0)))); - } - } else { - for doc_id in matched_docs { - matched_rows.push((doc_id as usize, None)) + let matched_doc_ids = + collector.collect_matched_rows(query.box_clone(), &prefix_terms, &fuzziness_terms)?; + + if let Some(matched_doc_ids) = matched_doc_ids { + if !matched_doc_ids.is_empty() { + let mut matched_rows = Vec::with_capacity(matched_doc_ids.len() as usize); + let doc_ids_iter = matched_doc_ids.iter(); + if has_score { + // TODO: add score + for doc_id in doc_ids_iter { + matched_rows.push((doc_id as usize, Some(F32::from(1.0)))); + } + } else { + for doc_id in doc_ids_iter { + matched_rows.push((doc_id as usize, None)); + } } + return Ok(Some(matched_rows)); } - Ok(Some(matched_rows)) - } else { - Ok(None) } + Ok(None) } // delegation of [InvertedIndexFileReader::cache_key_of_index_columns] diff --git a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs index 8935a3420e78..232ebe3ab773 100644 --- a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs +++ b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs @@ -14,8 +14,6 @@ use std::collections::BTreeMap; use std::collections::HashSet; -use std::io::Write; -use std::path::Path; use std::sync::Arc; use arrow_ipc::writer::write_message; @@ -52,16 +50,12 @@ use tantivy::tokenizer::Stemmer; use tantivy::tokenizer::StopWordFilter; use tantivy::tokenizer::TextAnalyzer; use tantivy::tokenizer::TokenizerManager; -use tantivy::Directory; -use tantivy::Index; use tantivy::IndexBuilder; use tantivy::IndexSettings; use tantivy::IndexWriter; use tantivy::SegmentComponent; use tantivy_jieba::JiebaTokenizer; -use crate::index::build_tantivy_footer; - pub struct InvertedIndexWriter { schema: DataSchemaRef, index_writer: IndexWriter, @@ -175,178 +169,6 @@ impl InvertedIndexWriter { Ok((inverted_index_schema, inverted_index_block)) } - - // The tantivy index data consists of eight files. 
- // `.managed.json` file stores the name of the file that the index contains, for example: - // [ - // "94bce521d5bc4eccbf3e7a0212093622.pos", - // "94bce521d5bc4eccbf3e7a0212093622.fieldnorm", - // "94bce521d5bc4eccbf3e7a0212093622.fast", - // "meta.json", - // "94bce521d5bc4eccbf3e7a0212093622.term", - // "94bce521d5bc4eccbf3e7a0212093622.store", - // "94bce521d5bc4eccbf3e7a0212093622.idx" - // ] - // - // `meta.json` file store the meta information associated with the index, for example: - // { - // "index_settings": { - // "docstore_compression": "lz4", - // "docstore_blocksize": 16384 - // }, - // "segments":[{ - // "segment_id": "94bce521-d5bc-4ecc-bf3e-7a0212093622", - // "max_doc": 6, - // "deletes": null - // }], - // "schema":[{ - // "name": "title", - // "type": "text", - // "options": { - // "indexing": { - // "record": "position", - // "fieldnorms": true, - // "tokenizer": "en" - // }, - // "stored": false, - // "fast": false - // } - // }, { - // "name": "content", - // "type": "text", - // "options": { - // "indexing": { - // "record": "position", - // "fieldnorms": true, - // "tokenizer": "en" - // }, - // "stored": false, - // "fast": false - // } - // }], - // "opstamp":0 - // } - // - // `terms` file stores the term dictionary, the value is - // an address into the `postings` file and the `positions` file. - // `postings` file stores the lists of document ids and freqs. - // `positions` file stores the positions of terms in each document. - // `field norms` file stores the sum of the length of the term. - // `fast fields` file stores column-oriented documents. - // `store` file stores row-oriented documents. - // - // More details can be seen here - // https://github.com/quickwit-oss/tantivy/blob/main/src/index/segment_component.rs#L8 - // - // We merge the data from these files into one file and - // record the offset to read each part of the data. 
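The layout described above (each tantivy component written back to back, followed by one little-endian u32 cumulative end offset per part and a final u32 holding the number of offsets) can be decoded on the read side roughly as follows. This is an illustrative, self-contained sketch with hypothetical names, not code from this patch:

fn decode_index_offsets(data: &[u8]) -> Option<Vec<u32>> {
    // The last 4 bytes hold the number of offsets, preceded by that many
    // little-endian u32 cumulative end offsets (one per merged part).
    if data.len() < 4 {
        return None;
    }
    let nums = u32::from_le_bytes(data[data.len() - 4..].try_into().ok()?) as usize;
    let table_len = nums.checked_mul(4)?.checked_add(4)?;
    if data.len() < table_len {
        return None;
    }
    let table = &data[data.len() - table_len..data.len() - 4];
    Some(
        table
            .chunks_exact(4)
            .map(|c| u32::from_le_bytes(c.try_into().unwrap()))
            .collect(),
    )
}

fn main() {
    // Two parts of 3 and 5 bytes: cumulative end offsets 3 and 8, then count 2.
    let mut buf = vec![0u8; 8];
    buf.extend_from_slice(&3u32.to_le_bytes());
    buf.extend_from_slice(&8u32.to_le_bytes());
    buf.extend_from_slice(&2u32.to_le_bytes());
    assert_eq!(decode_index_offsets(&buf), Some(vec![3, 8]));
}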
- #[async_backtrace::framed] - fn write_index(mut writer: &mut W, index: &Index) -> Result<()> { - let directory = index.directory(); - - let managed_filepath = Path::new(".managed.json"); - let managed_bytes = directory.atomic_read(managed_filepath)?; - - let meta_filepath = Path::new("meta.json"); - let meta_data = directory.atomic_read(meta_filepath)?; - - let meta_string = std::str::from_utf8(&meta_data)?; - let meta_val: serde_json::Value = serde_json::from_str(meta_string)?; - let meta_json: String = serde_json::to_string(&meta_val)?; - - let segments = index.searchable_segments()?; - let segment = &segments[0]; - - let fast_fields = segment.open_read(SegmentComponent::FastFields)?; - let fast_fields_bytes = fast_fields.read_bytes()?; - - let store = segment.open_read(SegmentComponent::Store)?; - let store_bytes = store.read_bytes()?; - - let field_norms = segment.open_read(SegmentComponent::FieldNorms)?; - let field_norms_bytes = field_norms.read_bytes()?; - - let positions = segment.open_read(SegmentComponent::Positions)?; - let positions_bytes = positions.read_bytes()?; - - let postings = segment.open_read(SegmentComponent::Postings)?; - let postings_bytes = postings.read_bytes()?; - - let terms = segment.open_read(SegmentComponent::Terms)?; - let terms_bytes = terms.read_bytes()?; - - // write each tantivy files as part of data - let mut fast_fields_length = writer.write(&fast_fields_bytes)?; - let footer_length = Self::build_footer(&mut writer, &fast_fields_bytes)?; - fast_fields_length += footer_length; - - let mut store_length = writer.write(&store_bytes)?; - let footer_length = Self::build_footer(&mut writer, &store_bytes)?; - store_length += footer_length; - - let mut field_norms_length = writer.write(&field_norms_bytes)?; - let footer_length = Self::build_footer(&mut writer, &field_norms_bytes)?; - field_norms_length += footer_length; - - let mut positions_length = writer.write(&positions_bytes)?; - let footer_length = Self::build_footer(&mut writer, &positions_bytes)?; - positions_length += footer_length; - - let mut postings_length = writer.write(&postings_bytes)?; - let footer_length = Self::build_footer(&mut writer, &postings_bytes)?; - postings_length += footer_length; - - let mut terms_length = writer.write(&terms_bytes)?; - let footer_length = Self::build_footer(&mut writer, &terms_bytes)?; - terms_length += footer_length; - - let meta_length = writer.write(meta_json.as_bytes())?; - let managed_length = writer.write(&managed_bytes)?; - - // write offsets of each parts - let mut offset: u32 = 0; - let mut offsets = Vec::with_capacity(8); - offset += fast_fields_length as u32; - offsets.push(offset); - - offset += store_length as u32; - offsets.push(offset); - - offset += field_norms_length as u32; - offsets.push(offset); - - offset += positions_length as u32; - offsets.push(offset); - - offset += postings_length as u32; - offsets.push(offset); - - offset += terms_length as u32; - offsets.push(offset); - - offset += meta_length as u32; - offsets.push(offset); - - offset += managed_length as u32; - offsets.push(offset); - - // the number of offsets, used for multi index segments in one file - let nums = offsets.len() as u32; - for offset in offsets { - writer.write_all(&offset.to_le_bytes())?; - } - writer.write_all(&nums.to_le_bytes())?; - - writer.flush()?; - - Ok(()) - } - - fn build_footer(writer: &mut W, bytes: &[u8]) -> Result { - let buf = build_tantivy_footer(bytes)?; - let len = writer.write(&buf)?; - Ok(len) - } } // inverted index block include 5 types of 
data, diff --git a/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs b/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs index 9f58e4233edf..a5f585e15dd5 100644 --- a/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs +++ b/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs @@ -53,13 +53,12 @@ use crate::io::TableMetaLocationGenerator; pub struct InvertedIndexPruner { dal: Operator, has_score: bool, - field_nums: usize, index_name: String, index_version: String, need_position: bool, tokenizer_manager: TokenizerManager, query: Box, - field_ids: HashSet, + field_ids: HashSet, index_record: IndexRecordOption, fuzziness: Option, } @@ -84,7 +83,7 @@ impl InvertedIndexPruner { let mut field_ids = HashSet::new(); query.query_terms(&mut |term, pos| { let field = term.field(); - let field_id = field.field_id() as usize; + let field_id = field.field_id(); field_ids.insert(field_id); if pos { need_position = true; @@ -93,14 +92,12 @@ impl InvertedIndexPruner { // whether need to generate score internl column let has_score = inverted_index_info.has_score; - let field_nums = inverted_index_info.index_schema.num_fields(); let index_name = inverted_index_info.index_name.clone(); let index_version = inverted_index_info.index_version.clone(); return Ok(Some(Arc::new(InvertedIndexPruner { dal, has_score, - field_nums, index_name, index_version, need_position, @@ -130,7 +127,6 @@ impl InvertedIndexPruner { let matched_rows = inverted_index_reader .do_filter( - self.field_nums, self.need_position, self.has_score, self.query.box_clone(), @@ -138,7 +134,7 @@ impl InvertedIndexPruner { &self.index_record, &self.fuzziness, self.tokenizer_manager.clone(), - row_count as u32, + row_count, &index_loc, ) .await?; diff --git a/src/query/storages/stream/src/stream_table.rs b/src/query/storages/stream/src/stream_table.rs index cf5599b468d0..e593ab106acd 100644 --- a/src/query/storages/stream/src/stream_table.rs +++ b/src/query/storages/stream/src/stream_table.rs @@ -264,10 +264,9 @@ impl StreamTable { .get_table_name_by_id(source_table_id) .await .and_then(|opt| { - opt.ok_or(ErrorCode::UnknownTable(format!( - "Unknown table id: '{}'", - source_table_id - ))) + opt.ok_or_else(|| { + ErrorCode::UnknownTable(format!("Unknown table id: '{}'", source_table_id)) + }) }) } @@ -286,7 +285,7 @@ impl StreamTable { let source_table_meta = catalog .get_table_meta_by_id(source_table_id) .await? 
- .ok_or(ErrorCode::Internal("source database id must be set"))?; + .ok_or_else(|| ErrorCode::Internal("source database id must be set"))?; source_table_meta .data .options From c304e3c96b4823182b92777ac0f1a2504fe85c41 Mon Sep 17 00:00:00 2001 From: zhya Date: Mon, 30 Sep 2024 14:57:47 +0800 Subject: [PATCH 13/14] chore(storage): refactor compact source (#16527) refactor compact source --- .../src/pipelines/builders/builder_compact.rs | 141 +++++++++- src/query/storages/fuse/src/fuse_table.rs | 4 + .../storages/fuse/src/operations/compact.rs | 134 ---------- .../fuse/src/operations/mutation/meta/mod.rs | 1 + .../operations/mutation/meta/mutation_meta.rs | 44 ++++ .../mutation/processors/compact_source.rs | 243 ++++++++---------- .../src/operations/mutation/processors/mod.rs | 1 + 7 files changed, 289 insertions(+), 279 deletions(-) diff --git a/src/query/service/src/pipelines/builders/builder_compact.rs b/src/query/service/src/pipelines/builders/builder_compact.rs index 0888f5571715..03fadf872e31 100644 --- a/src/query/service/src/pipelines/builders/builder_compact.rs +++ b/src/query/service/src/pipelines/builders/builder_compact.rs @@ -12,15 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. +use databend_common_base::runtime::Runtime; +use databend_common_catalog::plan::PartInfoType; +use databend_common_catalog::plan::Partitions; +use databend_common_catalog::plan::PartitionsShuffleKind; +use databend_common_catalog::plan::Projection; +use databend_common_catalog::table::Table; +use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; use databend_common_pipeline_sources::EmptySource; -use databend_common_sql::executor::physical_plans::CompactSource; +use databend_common_pipeline_sources::PrefetchAsyncSourcer; +use databend_common_pipeline_transforms::processors::TransformPipelineHelper; +use databend_common_sql::executor::physical_plans::CompactSource as PhysicalCompactSource; +use databend_common_sql::executor::physical_plans::MutationKind; +use databend_common_sql::StreamContext; +use databend_common_storages_fuse::operations::BlockCompactMutator; +use databend_common_storages_fuse::operations::CompactLazyPartInfo; +use databend_common_storages_fuse::operations::CompactSource; +use databend_common_storages_fuse::operations::CompactTransform; +use databend_common_storages_fuse::operations::TableMutationAggregator; +use databend_common_storages_fuse::operations::TransformSerializeBlock; use databend_common_storages_fuse::FuseTable; use crate::pipelines::PipelineBuilder; impl PipelineBuilder { - pub(crate) fn build_compact_source(&mut self, compact_block: &CompactSource) -> Result<()> { + pub(crate) fn build_compact_source( + &mut self, + compact_block: &PhysicalCompactSource, + ) -> Result<()> { let table = self .ctx .build_table_by_table_info(&compact_block.table_info, None)?; @@ -30,11 +50,120 @@ impl PipelineBuilder { return self.main_pipeline.add_source(EmptySource::create, 1); } - table.build_compact_source( + let is_lazy = compact_block.parts.partitions_type() == PartInfoType::LazyLevel; + let thresholds = table.get_block_thresholds(); + let cluster_key_id = table.cluster_key_id(); + let mut max_threads = self.ctx.get_settings().get_max_threads()? 
as usize; + + if is_lazy { + let query_ctx = self.ctx.clone(); + + let lazy_parts = compact_block + .parts + .partitions + .iter() + .map(|v| { + v.as_any() + .downcast_ref::() + .unwrap() + .clone() + }) + .collect::>(); + + let column_ids = compact_block.column_ids.clone(); + self.main_pipeline.set_on_init(move || { + let ctx = query_ctx.clone(); + let partitions = Runtime::with_worker_threads(2, None)?.block_on(async move { + let partitions = BlockCompactMutator::build_compact_tasks( + ctx.clone(), + column_ids.clone(), + cluster_key_id, + thresholds, + lazy_parts, + ) + .await?; + + Result::<_>::Ok(partitions) + })?; + + let partitions = Partitions::create(PartitionsShuffleKind::Mod, partitions); + query_ctx.set_partitions(partitions)?; + Ok(()) + }); + } else { + max_threads = max_threads.min(compact_block.parts.len()).max(1); + self.ctx.set_partitions(compact_block.parts.clone())?; + } + + let block_reader = table.create_block_reader( + self.ctx.clone(), + Projection::Columns(table.all_column_indices()), + false, + table.change_tracking_enabled(), + false, + )?; + let stream_ctx = if table.change_tracking_enabled() { + Some(StreamContext::try_create( + self.ctx.get_function_context()?, + table.schema_with_stream(), + table.get_table_info().ident.seq, + false, + false, + )?) + } else { + None + }; + // Add source pipe. + self.main_pipeline.add_source( + |output| { + let source = CompactSource::create(self.ctx.clone(), block_reader.clone(), 1); + PrefetchAsyncSourcer::create(self.ctx.clone(), output, source) + }, + max_threads, + )?; + let storage_format = table.get_storage_format(); + self.main_pipeline.add_block_meta_transformer(|| { + CompactTransform::create( + self.ctx.clone(), + block_reader.clone(), + storage_format, + stream_ctx.clone(), + ) + }); + + // sort + let cluster_stats_gen = table.cluster_gen_for_append( self.ctx.clone(), - compact_block.parts.clone(), - compact_block.column_ids.clone(), &mut self.main_pipeline, - ) + thresholds, + None, + )?; + self.main_pipeline.add_transform(|input, output| { + let proc = TransformSerializeBlock::try_create( + self.ctx.clone(), + input, + output, + table, + cluster_stats_gen.clone(), + MutationKind::Compact, + )?; + proc.into_processor() + })?; + + if is_lazy { + self.main_pipeline.try_resize(1)?; + self.main_pipeline.add_async_accumulating_transformer(|| { + TableMutationAggregator::create( + table, + self.ctx.clone(), + vec![], + vec![], + vec![], + Default::default(), + MutationKind::Compact, + ) + }); + } + Ok(()) } } diff --git a/src/query/storages/fuse/src/fuse_table.rs b/src/query/storages/fuse/src/fuse_table.rs index 1895b6ac8bb0..71ef0b77de75 100644 --- a/src/query/storages/fuse/src/fuse_table.rs +++ b/src/query/storages/fuse/src/fuse_table.rs @@ -461,6 +461,10 @@ impl FuseTable { }; Ok(retention_period) } + + pub fn get_storage_format(&self) -> FuseStorageFormat { + self.storage_format + } } #[async_trait::async_trait] diff --git a/src/query/storages/fuse/src/operations/compact.rs b/src/query/storages/fuse/src/operations/compact.rs index 491c089c382a..d17fe2f72cb6 100644 --- a/src/query/storages/fuse/src/operations/compact.rs +++ b/src/query/storages/fuse/src/operations/compact.rs @@ -12,31 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
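As a side note on the `max_threads.min(parts.len()).max(1)` clamp used above when the partitions are not lazy: it bounds source parallelism by the number of partitions while never dropping to zero workers. A standalone illustration, not part of the patch:

fn effective_threads(max_threads: usize, num_parts: usize) -> usize {
    // Bounded by the partition count, but always at least one worker so the
    // pipeline can still be built when the partition list is empty.
    max_threads.min(num_parts).max(1)
}

fn main() {
    assert_eq!(effective_threads(8, 3), 3);
    assert_eq!(effective_threads(8, 0), 1);
    assert_eq!(effective_threads(2, 10), 2);
}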
-use std::collections::HashSet; use std::sync::Arc; -use databend_common_base::runtime::Runtime; -use databend_common_catalog::plan::PartInfoType; use databend_common_catalog::plan::Partitions; -use databend_common_catalog::plan::PartitionsShuffleKind; -use databend_common_catalog::plan::Projection; use databend_common_catalog::table::CompactionLimits; use databend_common_exception::Result; -use databend_common_expression::ColumnId; use databend_common_expression::ComputedExpr; use databend_common_expression::FieldIndex; -use databend_common_pipeline_core::Pipeline; -use databend_common_pipeline_transforms::processors::TransformPipelineHelper; -use databend_common_sql::executor::physical_plans::MutationKind; -use databend_common_sql::StreamContext; -use databend_storages_common_table_meta::meta::Statistics; use databend_storages_common_table_meta::meta::TableSnapshot; -use crate::operations::common::TableMutationAggregator; -use crate::operations::common::TransformSerializeBlock; use crate::operations::mutation::BlockCompactMutator; -use crate::operations::mutation::CompactLazyPartInfo; -use crate::operations::mutation::CompactSource; use crate::operations::mutation::SegmentCompactMutator; use crate::FuseTable; use crate::Table; @@ -119,125 +104,6 @@ impl FuseTable { ))) } - pub fn build_compact_source( - &self, - ctx: Arc, - parts: Partitions, - column_ids: HashSet, - pipeline: &mut Pipeline, - ) -> Result<()> { - let is_lazy = parts.partitions_type() == PartInfoType::LazyLevel; - let thresholds = self.get_block_thresholds(); - let cluster_key_id = self.cluster_key_id(); - let mut max_threads = ctx.get_settings().get_max_threads()? as usize; - - if is_lazy { - let query_ctx = ctx.clone(); - - let lazy_parts = parts - .partitions - .into_iter() - .map(|v| { - v.as_any() - .downcast_ref::() - .unwrap() - .clone() - }) - .collect::>(); - - pipeline.set_on_init(move || { - let ctx = query_ctx.clone(); - let column_ids = column_ids.clone(); - let partitions = Runtime::with_worker_threads(2, None)?.block_on(async move { - let partitions = BlockCompactMutator::build_compact_tasks( - ctx.clone(), - column_ids, - cluster_key_id, - thresholds, - lazy_parts, - ) - .await?; - - Result::<_>::Ok(partitions) - })?; - - let partitions = Partitions::create(PartitionsShuffleKind::Mod, partitions); - query_ctx.set_partitions(partitions)?; - Ok(()) - }); - } else { - max_threads = max_threads.min(parts.len()).max(1); - ctx.set_partitions(parts)?; - } - - let all_column_indices = self.all_column_indices(); - let projection = Projection::Columns(all_column_indices); - let block_reader = self.create_block_reader( - ctx.clone(), - projection, - false, - self.change_tracking_enabled(), - false, - )?; - let stream_ctx = if self.change_tracking_enabled() { - Some(StreamContext::try_create( - ctx.get_function_context()?, - self.schema_with_stream(), - self.get_table_info().ident.seq, - false, - false, - )?) - } else { - None - }; - // Add source pipe. 
- pipeline.add_source( - |output| { - CompactSource::try_create( - ctx.clone(), - self.storage_format, - block_reader.clone(), - stream_ctx.clone(), - output, - ) - }, - max_threads, - )?; - - // sort - let cluster_stats_gen = - self.cluster_gen_for_append(ctx.clone(), pipeline, thresholds, None)?; - pipeline.add_transform( - |input: Arc, output| { - let proc = TransformSerializeBlock::try_create( - ctx.clone(), - input, - output, - self, - cluster_stats_gen.clone(), - MutationKind::Compact, - )?; - proc.into_processor() - }, - )?; - - if is_lazy { - pipeline.try_resize(1)?; - pipeline.add_async_accumulating_transformer(|| { - TableMutationAggregator::create( - self, - ctx.clone(), - vec![], - vec![], - vec![], - Statistics::default(), - MutationKind::Compact, - ) - }); - } - Ok(()) - } - async fn compact_options_with_segment_limit( &self, num_segment_limit: Option, diff --git a/src/query/storages/fuse/src/operations/mutation/meta/mod.rs b/src/query/storages/fuse/src/operations/mutation/meta/mod.rs index 776b7d48979b..0a14e6b7ad28 100644 --- a/src/query/storages/fuse/src/operations/mutation/meta/mod.rs +++ b/src/query/storages/fuse/src/operations/mutation/meta/mod.rs @@ -21,6 +21,7 @@ pub use compact_part::CompactExtraInfo; pub use compact_part::CompactLazyPartInfo; pub use compact_part::CompactTaskInfo; pub use mutation_meta::ClusterStatsGenType; +pub use mutation_meta::CompactSourceMeta; pub use mutation_meta::SerializeBlock; pub use mutation_meta::SerializeDataMeta; pub use mutation_part::DeletedSegmentInfo; diff --git a/src/query/storages/fuse/src/operations/mutation/meta/mutation_meta.rs b/src/query/storages/fuse/src/operations/mutation/meta/mutation_meta.rs index b98943f6821e..913462add22c 100644 --- a/src/query/storages/fuse/src/operations/mutation/meta/mutation_meta.rs +++ b/src/query/storages/fuse/src/operations/mutation/meta/mutation_meta.rs @@ -12,13 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
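The `CompactSourceMeta` added below only travels between operators inside a single process (it carries raw merge-IO read results and Arc'd block metas), which is presumably why its clone and serde hooks are left as `unimplemented!`. A minimal standalone sketch of that pattern, using placeholder names rather than the real `BlockMetaInfo` trait:

use std::any::Any;
use std::fmt;

// Placeholder trait standing in for a block-meta API that normally supports cloning.
trait BlockMeta: fmt::Debug + Send + Sync + Any {
    fn clone_self(&self) -> Box<dyn BlockMeta>;
}

#[derive(Debug)]
struct InProcessOnlyMeta {
    // Owned buffers that are cheap to move but pointless to serialize.
    payload: Vec<u8>,
}

impl BlockMeta for InProcessOnlyMeta {
    fn clone_self(&self) -> Box<dyn BlockMeta> {
        // This meta is handed from one pipeline operator to the next in memory,
        // so cloning (like serialization) is deliberately unsupported.
        unimplemented!("InProcessOnlyMeta is not meant to be cloned")
    }
}

fn main() {
    let meta = InProcessOnlyMeta { payload: vec![1, 2, 3] };
    println!("{:?}", meta);
}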
+use std::sync::Arc; + use databend_common_expression::BlockMetaInfo; use databend_common_expression::BlockMetaInfoDowncast; +use databend_storages_common_table_meta::meta::BlockMeta; use databend_storages_common_table_meta::meta::ClusterStatistics; use crate::operations::common::BlockMetaIndex; use crate::operations::mutation::CompactExtraInfo; use crate::operations::mutation::DeletedSegmentInfo; +use crate::MergeIOReadResult; #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)] pub enum SerializeDataMeta { @@ -55,3 +59,43 @@ impl SerializeBlock { SerializeBlock { index, stats_type } } } + +pub enum CompactSourceMeta { + Concat { + read_res: Vec, + metas: Vec>, + index: BlockMetaIndex, + }, + Extras(CompactExtraInfo), +} + +#[typetag::serde(name = "compact_data_source")] +impl BlockMetaInfo for CompactSourceMeta { + fn equals(&self, _: &Box) -> bool { + unimplemented!("Unimplemented equals CompactSourceMeta") + } + + fn clone_self(&self) -> Box { + unimplemented!("Unimplemented clone CompactSourceMeta") + } +} + +impl std::fmt::Debug for CompactSourceMeta { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("CompactSourceMeta").finish() + } +} + +impl serde::Serialize for CompactSourceMeta { + fn serialize(&self, _: S) -> std::result::Result + where S: serde::Serializer { + unimplemented!("Unimplemented serialize DataSourceMeta") + } +} + +impl<'de> serde::Deserialize<'de> for CompactSourceMeta { + fn deserialize(_: D) -> std::result::Result + where D: serde::Deserializer<'de> { + unimplemented!("Unimplemented deserialize DataSourceMeta") + } +} diff --git a/src/query/storages/fuse/src/operations/mutation/processors/compact_source.rs b/src/query/storages/fuse/src/operations/mutation/processors/compact_source.rs index 203a6e73ed3f..0256651a7a3b 100644 --- a/src/query/storages/fuse/src/operations/mutation/processors/compact_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/processors/compact_source.rs @@ -12,125 +12,148 @@ // See the License for the specific language governing permissions and // limitations under the License. 
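The reworked source below reads the columns of each block in parallel by spawning one task per block and joining them all. The same pattern in a self-contained form, assuming a tokio runtime and the futures crate, with a placeholder read function standing in for the real merge-IO read:

use futures::future::try_join_all;

async fn read_block(id: u64) -> std::io::Result<Vec<u8>> {
    // Placeholder for the real column read; returns the block id as bytes.
    Ok(id.to_le_bytes().to_vec())
}

async fn read_all_blocks(ids: Vec<u64>) -> std::io::Result<Vec<Vec<u8>>> {
    // One task per block so the reads overlap; try_join_all fails fast
    // if any single read returns an error.
    let handles = ids
        .into_iter()
        .map(|id| tokio::spawn(read_block(id)))
        .collect::<Vec<_>>();
    let joined = try_join_all(handles).await.expect("read task panicked");
    joined.into_iter().collect()
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let blocks = read_all_blocks(vec![1, 2, 3]).await?;
    assert_eq!(blocks.len(), 3);
    Ok(())
}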
-use std::any::Any; use std::sync::Arc; use std::time::Instant; use databend_common_base::base::ProgressValues; use databend_common_catalog::plan::gen_mutation_stream_meta; -use databend_common_catalog::plan::PartInfoPtr; use databend_common_catalog::table_context::TableContext; -use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::DataBlock; use databend_common_metrics::storage::*; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_sources::PrefetchAsyncSource; +use databend_common_pipeline_transforms::processors::BlockMetaTransform; +use databend_common_pipeline_transforms::processors::UnknownMode; use databend_common_sql::StreamContext; -use databend_storages_common_table_meta::meta::BlockMeta; use crate::io::BlockReader; use crate::io::ReadSettings; -use crate::operations::mutation::ClusterStatsGenType; -use crate::operations::mutation::CompactBlockPartInfo; -use crate::operations::mutation::SerializeBlock; -use crate::operations::mutation::SerializeDataMeta; -use crate::operations::BlockMetaIndex; +use crate::operations::ClusterStatsGenType; +use crate::operations::CompactBlockPartInfo; +use crate::operations::CompactSourceMeta; +use crate::operations::SerializeBlock; +use crate::operations::SerializeDataMeta; use crate::FuseStorageFormat; -use crate::MergeIOReadResult; - -enum State { - ReadData(Option), - Concat { - read_res: Vec, - metas: Vec>, - index: BlockMetaIndex, - }, - Output(Option, DataBlock), - Finish, -} pub struct CompactSource { - state: State, ctx: Arc, block_reader: Arc, - storage_format: FuseStorageFormat, - output: Arc, - stream_ctx: Option, + prefetch_num: usize, } impl CompactSource { - pub fn try_create( + pub fn create( ctx: Arc, - storage_format: FuseStorageFormat, block_reader: Arc, - stream_ctx: Option, - output: Arc, - ) -> Result { - Ok(ProcessorPtr::create(Box::new(CompactSource { - state: State::ReadData(None), + prefetch_num: usize, + ) -> Self { + Self { ctx, block_reader, - storage_format, - output, - stream_ctx, - }))) + prefetch_num, + } } } #[async_trait::async_trait] -impl Processor for CompactSource { - fn name(&self) -> String { - "CompactSource".to_string() - } +impl PrefetchAsyncSource for CompactSource { + const NAME: &'static str = "CompactSource"; - fn as_any(&mut self) -> &mut dyn Any { - self + const SKIP_EMPTY_DATA_BLOCK: bool = false; + + fn is_full(&self, prefetched: &[DataBlock]) -> bool { + prefetched.len() >= self.prefetch_num } - fn event(&mut self) -> Result { - if matches!(self.state, State::ReadData(None)) { - self.state = self - .ctx - .get_partition() - .map_or(State::Finish, |part| State::ReadData(Some(part))); - } + async fn generate(&mut self) -> Result> { + let part = match self.ctx.get_partition() { + Some(part) => part, + None => return Ok(None), + }; - if self.output.is_finished() { - return Ok(Event::Finished); - } + let part = CompactBlockPartInfo::from_part(&part)?; + let meta = match part { + CompactBlockPartInfo::CompactTaskInfo(task) => { + let mut task_futures = Vec::new(); + for block in &task.blocks { + let settings = ReadSettings::from_ctx(&self.ctx)?; + let block_reader = self.block_reader.clone(); + let block = block.clone(); + // read block in parallel. 
+ task_futures.push(async move { + databend_common_base::runtime::spawn(async move { + // Perf + { + metrics_inc_compact_block_read_nums(1); + metrics_inc_compact_block_read_bytes(block.block_size); + } + + block_reader + .read_columns_data_by_merge_io( + &settings, + &block.location.0, + &block.col_metas, + &None, + ) + .await + }) + .await + .unwrap() + }); + } - if !self.output.can_push() { - return Ok(Event::NeedConsume); - } + let start = Instant::now(); - match self.state { - State::ReadData(_) => Ok(Event::Async), - State::Concat { .. } => Ok(Event::Sync), - State::Output(_, _) => { - if let State::Output(part, data_block) = - std::mem::replace(&mut self.state, State::Finish) + let read_res = futures::future::try_join_all(task_futures).await?; + // Perf. { - self.state = part.map_or(State::Finish, |part| State::ReadData(Some(part))); - - self.output.push_data(Ok(data_block)); - Ok(Event::NeedConsume) - } else { - Err(ErrorCode::Internal("It's a bug.")) + metrics_inc_compact_block_read_milliseconds(start.elapsed().as_millis() as u64); } + Box::new(CompactSourceMeta::Concat { + read_res, + metas: task.blocks.clone(), + index: task.index.clone(), + }) } - State::Finish => { - self.output.finish(); - Ok(Event::Finished) + CompactBlockPartInfo::CompactExtraInfo(extra) => { + Box::new(CompactSourceMeta::Extras(extra.clone())) } + }; + Ok(Some(DataBlock::empty_with_meta(meta))) + } +} + +pub struct CompactTransform { + ctx: Arc, + block_reader: Arc, + storage_format: FuseStorageFormat, + stream_ctx: Option, +} + +impl CompactTransform { + pub fn create( + ctx: Arc, + block_reader: Arc, + storage_format: FuseStorageFormat, + stream_ctx: Option, + ) -> Self { + Self { + ctx, + block_reader, + storage_format, + stream_ctx, } } +} + +#[async_trait::async_trait] +impl BlockMetaTransform for CompactTransform { + const UNKNOWN_MODE: UnknownMode = UnknownMode::Pass; + const NAME: &'static str = "CompactTransform"; - fn process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Finish) { - State::Concat { + fn transform(&mut self, meta: CompactSourceMeta) -> Result> { + match meta { + CompactSourceMeta::Concat { read_res, metas, index, @@ -171,70 +194,12 @@ impl Processor for CompactSource { bytes: new_block.memory_size(), }; self.ctx.get_write_progress().incr(&progress_values); - - self.state = State::Output(self.ctx.get_partition(), new_block); + Ok(vec![new_block]) } - _ => return Err(ErrorCode::Internal("It's a bug.")), - } - Ok(()) - } - - #[async_backtrace::framed] - async fn async_process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Finish) { - State::ReadData(Some(part)) => { - let block_reader = self.block_reader.as_ref(); - - // block read tasks. - let mut task_futures = Vec::new(); - let part = CompactBlockPartInfo::from_part(&part)?; - match part { - CompactBlockPartInfo::CompactExtraInfo(extra) => { - let meta = Box::new(SerializeDataMeta::CompactExtras(extra.clone())); - let block = DataBlock::empty_with_meta(meta); - self.state = State::Output(self.ctx.get_partition(), block); - } - CompactBlockPartInfo::CompactTaskInfo(task) => { - for block in &task.blocks { - let settings = ReadSettings::from_ctx(&self.ctx)?; - // read block in parallel. 
- task_futures.push(async move { - // Perf - { - metrics_inc_compact_block_read_nums(1); - metrics_inc_compact_block_read_bytes(block.block_size); - } - - block_reader - .read_columns_data_by_merge_io( - &settings, - &block.location.0, - &block.col_metas, - &None, - ) - .await - }); - } - - let start = Instant::now(); - - let read_res = futures::future::try_join_all(task_futures).await?; - // Perf. - { - metrics_inc_compact_block_read_milliseconds( - start.elapsed().as_millis() as u64, - ); - } - self.state = State::Concat { - read_res, - metas: task.blocks.clone(), - index: task.index.clone(), - }; - } - } - Ok(()) + CompactSourceMeta::Extras(extra) => { + let meta = Box::new(SerializeDataMeta::CompactExtras(extra)); + Ok(vec![DataBlock::empty_with_meta(meta)]) } - _ => Err(ErrorCode::Internal("It's a bug.")), } } } diff --git a/src/query/storages/fuse/src/operations/mutation/processors/mod.rs b/src/query/storages/fuse/src/operations/mutation/processors/mod.rs index f86df532bad5..ae312d9f438e 100644 --- a/src/query/storages/fuse/src/operations/mutation/processors/mod.rs +++ b/src/query/storages/fuse/src/operations/mutation/processors/mod.rs @@ -16,5 +16,6 @@ mod compact_source; mod mutation_source; pub use compact_source::CompactSource; +pub use compact_source::CompactTransform; pub use mutation_source::MutationAction; pub use mutation_source::MutationSource; From a1497e26094a81caefa9c5543c6b1c0223ad3cda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=82=8E=E6=B3=BC?= Date: Mon, 30 Sep 2024 16:28:04 +0800 Subject: [PATCH 14/14] fix: move TableInfo building out of SchemaApi (#16548) `SchemaApi` can not provide enough information such as `CatalogInfo` and `DatabaseType`. Therefore `SchemaApi` should not build a `TableInfo`, leave such task to its caller. --- src/meta/api/src/schema_api_impl.rs | 38 ++++++--------- src/meta/api/src/schema_api_test_suite.rs | 47 ++++++++++++------- src/meta/app/src/schema/table.rs | 25 +++++++++- .../src/catalogs/default/mutable_catalog.rs | 14 ++++-- 4 files changed, 80 insertions(+), 44 deletions(-) diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index ffdf39541597..6e4da9fddac3 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -2658,13 +2658,13 @@ impl + ?Sized> SchemaApi for KV { ) .await?; - let mut vacuum_table_infos = vec![]; + let mut vacuum_tables = vec![]; let mut vacuum_ids = vec![]; for db_info in db_infos { - if vacuum_table_infos.len() >= the_limit { + if vacuum_tables.len() >= the_limit { return Ok(ListDroppedTableResp { - drop_table_infos: vacuum_table_infos, + vacuum_tables, drop_ids: vacuum_ids, }); } @@ -2679,7 +2679,7 @@ impl + ?Sized> SchemaApi for KV { drop_time_range.clone() }; - let capacity = the_limit - vacuum_table_infos.len(); + let capacity = the_limit - vacuum_tables.len(); let table_nivs = get_history_tables_for_gc( self, table_drop_time_range, @@ -2692,26 +2692,22 @@ impl + ?Sized> SchemaApi for KV { vacuum_ids.push(DroppedId::from(table_niv.clone())); } + let db_name = db_info.name_ident.database_name().to_string(); + let db_name_ident = db_info.name_ident.clone(); + // A DB can be removed only when all its tables are removed. 
if vacuum_db && capacity > table_nivs.len() { vacuum_ids.push(DroppedId::Db { db_id: db_info.database_id.db_id, - db_name: db_info.name_ident.database_name().to_string(), + db_name: db_name.clone(), }); } - vacuum_table_infos.extend(table_nivs.iter().take(capacity).map(|niv| { - Arc::new(TableInfo::new( - db_info.name_ident.database_name(), - &niv.name().table_name, - TableIdent::new(niv.id().table_id, niv.value().seq), - niv.value().data.clone(), - )) - })); + vacuum_tables.extend(std::iter::repeat(db_name_ident).zip(table_nivs)); } return Ok(ListDroppedTableResp { - drop_table_infos: vacuum_table_infos, + vacuum_tables, drop_ids: vacuum_ids, }); } @@ -2748,21 +2744,15 @@ impl + ?Sized> SchemaApi for KV { .await?; let mut drop_ids = vec![]; - let mut drop_table_infos = vec![]; + let mut vacuum_tables = vec![]; - for niv in table_nivs.iter() { + for niv in table_nivs { drop_ids.push(DroppedId::from(niv.clone())); - - drop_table_infos.push(Arc::new(TableInfo::new( - db_info.name_ident.database_name(), - &niv.name().table_name, - TableIdent::new(niv.id().table_id, niv.value().seq), - niv.value().data.clone(), - ))); + vacuum_tables.push((tenant_dbname.clone(), niv)); } Ok(ListDroppedTableResp { - drop_table_infos, + vacuum_tables, drop_ids, }) } diff --git a/src/meta/api/src/schema_api_test_suite.rs b/src/meta/api/src/schema_api_test_suite.rs index 9f6e253cd309..f9028f9fa012 100644 --- a/src/meta/api/src/schema_api_test_suite.rs +++ b/src/meta/api/src/schema_api_test_suite.rs @@ -4077,6 +4077,7 @@ impl SchemaApiTestSuite { // first create a database drop within filter time info!("--- create db1"); + let db1_id; { let db_name = DatabaseNameIdent::new(&tenant, "db1"); let req = CreateDatabaseReq { @@ -4086,7 +4087,7 @@ impl SchemaApiTestSuite { }; let res = mt.create_database(req).await?; - let db1_id = res.db_id.db_id; + db1_id = res.db_id; let req = CreateTableReq { create_option: CreateOption::Create, @@ -4103,21 +4104,22 @@ impl SchemaApiTestSuite { }) .await?; - drop_ids_boundary.push(DroppedId::new_table(db1_id, db1_tb1_id, "tb1")); + drop_ids_boundary.push(DroppedId::new_table(*db1_id, db1_tb1_id, "tb1")); drop_ids_boundary.push(DroppedId::Db { - db_id: db1_id, + db_id: *db1_id, db_name: db_name.database_name().to_string(), }); - drop_ids_no_boundary.push(DroppedId::new_table(db1_id, db1_tb1_id, "tb1")); + drop_ids_no_boundary.push(DroppedId::new_table(*db1_id, db1_tb1_id, "tb1")); drop_ids_no_boundary.push(DroppedId::Db { - db_id: db1_id, + db_id: *db1_id, db_name: db_name.database_name().to_string(), }); } // second create a database drop outof filter time, but has a table drop within filter time info!("--- create db2"); + let db2_id; { let create_db_req = CreateDatabaseReq { create_option: CreateOption::Create, @@ -4126,7 +4128,7 @@ impl SchemaApiTestSuite { }; let res = mt.create_database(create_db_req.clone()).await?; - let db2_id = res.db_id; + db2_id = res.db_id; drop_ids_no_boundary.push(DroppedId::Db { db_id: *db2_id, db_name: "db2".to_string(), @@ -4231,6 +4233,7 @@ impl SchemaApiTestSuite { } // third create a database not dropped, but has a table drop within filter time + let db3_id; { let create_db_req = CreateDatabaseReq { create_option: CreateOption::Create, @@ -4239,7 +4242,7 @@ impl SchemaApiTestSuite { }; let res = mt.create_database(create_db_req.clone()).await?; - let db_id = res.db_id; + db3_id = res.db_id; info!("--- create and drop db3.tb1"); { @@ -4250,12 +4253,12 @@ impl SchemaApiTestSuite { as_dropped: false, }; let resp = mt.create_table(req.clone()).await?; - 
drop_ids_boundary.push(DroppedId::new_table(*db_id, resp.table_id, "tb1")); - drop_ids_no_boundary.push(DroppedId::new_table(*db_id, resp.table_id, "tb1")); + drop_ids_boundary.push(DroppedId::new_table(*db3_id, resp.table_id, "tb1")); + drop_ids_no_boundary.push(DroppedId::new_table(*db3_id, resp.table_id, "tb1")); mt.drop_table_by_id(DropTableByIdReq { if_exists: false, tenant: req.name_ident.tenant.clone(), - db_id: *db_id, + db_id: *db3_id, table_name: req.name_ident.table_name.clone(), tb_id: resp.table_id, engine: "FUSE".to_string(), @@ -4274,11 +4277,11 @@ impl SchemaApiTestSuite { as_dropped: false, }; let resp = mt.create_table(req.clone()).await?; - drop_ids_no_boundary.push(DroppedId::new_table(*db_id, resp.table_id, "tb2")); + drop_ids_no_boundary.push(DroppedId::new_table(*db3_id, resp.table_id, "tb2")); mt.drop_table_by_id(DropTableByIdReq { if_exists: false, tenant: req.name_ident.tenant.clone(), - db_id: *db_id, + db_id: *db3_id, table_name: req.name_ident.table_name.clone(), tb_id: resp.table_id, engine: "FUSE".to_string(), @@ -4331,9 +4334,15 @@ impl SchemaApiTestSuite { .into_iter() .collect(); let actual: BTreeSet = resp - .drop_table_infos + .vacuum_tables .iter() - .map(|table_info| table_info.desc.clone()) + .map(|(db_name_ident, table_niv)| { + format!( + "'{}'.'{}'", + db_name_ident.database_name(), + &table_niv.name().table_name + ) + }) .collect(); assert_eq!(expected, actual); } @@ -4368,9 +4377,15 @@ impl SchemaApiTestSuite { .cloned() .collect(); let actual: BTreeSet = resp - .drop_table_infos + .vacuum_tables .iter() - .map(|table_info| table_info.desc.clone()) + .map(|(db_name_ident, table_niv)| { + format!( + "'{}'.'{}'", + db_name_ident.database_name(), + &table_niv.name().table_name + ) + }) .collect(); assert_eq!(expected, actual); } diff --git a/src/meta/app/src/schema/table.rs b/src/meta/app/src/schema/table.rs index a7a6e48fc8d1..6da8183345f8 100644 --- a/src/meta/app/src/schema/table.rs +++ b/src/meta/app/src/schema/table.rs @@ -130,6 +130,9 @@ impl DBIdTableName { table_name: table_name.to_string(), } } + pub fn display(&self) -> impl Display { + format!("{}.'{}'", self.db_id, self.table_name) + } } impl Display for DBIdTableName { @@ -342,6 +345,7 @@ impl TableInfo { } } + /// Deprecated: use `new_full()`. This method sets default values for some fields. pub fn new(db_name: &str, table_name: &str, ident: TableIdent, meta: TableMeta) -> TableInfo { TableInfo { ident, @@ -352,6 +356,24 @@ impl TableInfo { } } + pub fn new_full( + db_name: &str, + table_name: &str, + ident: TableIdent, + meta: TableMeta, + catalog_info: Arc, + db_type: DatabaseType, + ) -> TableInfo { + TableInfo { + ident, + desc: format!("'{}'.'{}'", db_name, table_name), + name: table_name.to_string(), + meta, + catalog_info, + db_type, + } + } + pub fn schema(&self) -> Arc { self.meta.schema.clone() } @@ -1021,7 +1043,8 @@ impl DroppedId { #[derive(Clone, Debug, PartialEq, Eq)] pub struct ListDroppedTableResp { - pub drop_table_infos: Vec>, + /// The **database_name, (name, id, value)** of a table to vacuum. 
+ pub vacuum_tables: Vec<(DatabaseNameIdent, TableNIV)>, pub drop_ids: Vec, } diff --git a/src/query/service/src/catalogs/default/mutable_catalog.rs b/src/query/service/src/catalogs/default/mutable_catalog.rs index 8c559f34915c..0e89fe51887a 100644 --- a/src/query/service/src/catalogs/default/mutable_catalog.rs +++ b/src/query/service/src/catalogs/default/mutable_catalog.rs @@ -92,6 +92,7 @@ use databend_common_meta_app::schema::RenameTableReply; use databend_common_meta_app::schema::RenameTableReq; use databend_common_meta_app::schema::SetTableColumnMaskPolicyReply; use databend_common_meta_app::schema::SetTableColumnMaskPolicyReq; +use databend_common_meta_app::schema::TableIdent; use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::schema::TableMeta; use databend_common_meta_app::schema::TruncateTableReply; @@ -524,13 +525,20 @@ impl Catalog for MutableCatalog { let resp = ctx.meta.get_drop_table_infos(req).await?; let drop_ids = resp.drop_ids.clone(); - let drop_table_infos = resp.drop_table_infos; let storage = ctx.storage_factory; let mut tables = vec![]; - for table_info in drop_table_infos { - tables.push(storage.get_table(table_info.as_ref())?); + for (db_name_ident, niv) in resp.vacuum_tables { + let table_info = TableInfo::new_full( + db_name_ident.database_name(), + &niv.name().table_name, + TableIdent::new(niv.id().table_id, niv.value().seq), + niv.value().data.clone(), + self.info(), + DatabaseType::NormalDB, + ); + tables.push(storage.get_table(&table_info)?); } Ok((tables, drop_ids)) }
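For reference, the `'db'.'tb'` descriptor strings that the updated tests build from the `(DatabaseNameIdent, TableNIV)` pairs match the quoting produced by `TableInfo::new_full`. A trivial standalone helper showing the exact format, for illustration only:

fn table_desc(db_name: &str, table_name: &str) -> String {
    // Same quoting convention as TableInfo::desc: 'db'.'table'.
    format!("'{}'.'{}'", db_name, table_name)
}

fn main() {
    assert_eq!(table_desc("db1", "tb1"), "'db1'.'tb1'");
}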