From 96f52745281cc71c2e7e727970a5986d6a291781 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Thu, 20 Nov 2025 08:57:53 +0100 Subject: [PATCH 1/2] stream sitemap xml to the client --- ...51003d90122f83429313966add5b224f5f6c.json} | 4 +- src/db/mimes.rs | 1 + src/web/sitemap.rs | 144 ++++++++++++------ templates/core/sitemap.xml | 15 -- templates/core/sitemap/_footer.xml | 1 + templates/core/sitemap/_header.xml | 2 + templates/core/sitemap/_item.xml | 10 ++ .../{sitemapindex.xml => sitemap/index.xml} | 0 8 files changed, 114 insertions(+), 63 deletions(-) rename .sqlx/{query-65b0ead56880b369931c3a5ec324910dde51096de4ee2ad868cc5025161ab466.json => query-df1c002b7c4f5e2567eeefff56ee51003d90122f83429313966add5b224f5f6c.json} (50%) delete mode 100644 templates/core/sitemap.xml create mode 100644 templates/core/sitemap/_footer.xml create mode 100644 templates/core/sitemap/_header.xml create mode 100644 templates/core/sitemap/_item.xml rename templates/core/{sitemapindex.xml => sitemap/index.xml} (100%) diff --git a/.sqlx/query-65b0ead56880b369931c3a5ec324910dde51096de4ee2ad868cc5025161ab466.json b/.sqlx/query-df1c002b7c4f5e2567eeefff56ee51003d90122f83429313966add5b224f5f6c.json similarity index 50% rename from .sqlx/query-65b0ead56880b369931c3a5ec324910dde51096de4ee2ad868cc5025161ab466.json rename to .sqlx/query-df1c002b7c4f5e2567eeefff56ee51003d90122f83429313966add5b224f5f6c.json index 095192d9a..ce7e21cd4 100644 --- a/.sqlx/query-65b0ead56880b369931c3a5ec324910dde51096de4ee2ad868cc5025161ab466.json +++ b/.sqlx/query-df1c002b7c4f5e2567eeefff56ee51003d90122f83429313966add5b224f5f6c.json @@ -1,6 +1,6 @@ { "db_name": "PostgreSQL", - "query": "SELECT crates.name,\n releases.target_name,\n MAX(releases.release_time) as \"release_time!\"\n FROM crates\n INNER JOIN releases ON releases.crate_id = crates.id\n WHERE\n rustdoc_status = true AND\n crates.name ILIKE $1\n GROUP BY crates.name, releases.target_name\n ", + "query": "SELECT crates.name,\n releases.target_name,\n MAX(releases.release_time) as \"release_time!\"\n FROM crates\n INNER JOIN releases ON releases.crate_id = crates.id\n WHERE\n rustdoc_status = true AND\n crates.name ILIKE $1\n GROUP BY crates.name, releases.target_name\n ", "describe": { "columns": [ { @@ -30,5 +30,5 @@ null ] }, - "hash": "65b0ead56880b369931c3a5ec324910dde51096de4ee2ad868cc5025161ab466" + "hash": "df1c002b7c4f5e2567eeefff56ee51003d90122f83429313966add5b224f5f6c" } diff --git a/src/db/mimes.rs b/src/db/mimes.rs index c965bcb3b..5917a5f0e 100644 --- a/src/db/mimes.rs +++ b/src/db/mimes.rs @@ -10,6 +10,7 @@ macro_rules! mime { mime!(APPLICATION_ZIP, "application/zip"); mime!(APPLICATION_ZSTD, "application/zstd"); mime!(APPLICATION_GZIP, "application/gzip"); +mime!(APPLICATION_XML, "application/xml"); mime!(TEXT_MARKDOWN, "text/markdown"); mime!(TEXT_RUST, "text/rust"); mime!(TEXT_TOML, "text/toml"); diff --git a/src/web/sitemap.rs b/src/web/sitemap.rs index 6be1ffe18..ec6651464 100644 --- a/src/web/sitemap.rs +++ b/src/web/sitemap.rs @@ -1,8 +1,9 @@ use crate::{ Config, + db::mimes, docbuilder::Limits, impl_axum_webpage, - utils::{ConfigName, get_config}, + utils::{ConfigName, get_config, report_error}, web::{ AxumErrorPage, error::{AxumNope, AxumResult}, @@ -10,15 +11,25 @@ use crate::{ page::templates::{RenderBrands, RenderSolid, filters}, }, }; +use anyhow::Context as _; use askama::Template; -use axum::{extract::Extension, http::StatusCode, response::IntoResponse}; +use async_stream::stream; +use axum::{ + body::{Body, Bytes}, + extract::Extension, + http::StatusCode, + response::IntoResponse, +}; +use axum_extra::{TypedHeader, headers::ContentType}; use chrono::{TimeZone, Utc}; -use futures_util::stream::TryStreamExt; +use futures_util::{StreamExt as _, pin_mut}; use std::sync::Arc; +use tracing::{Span, error}; +use tracing_futures::Instrument as _; /// sitemap index #[derive(Template)] -#[template(path = "core/sitemapindex.xml")] +#[template(path = "core/sitemap/index.xml")] #[derive(Debug, Clone, PartialEq, Eq)] struct SitemapIndexXml { sitemaps: Vec, @@ -35,25 +46,17 @@ pub(crate) async fn sitemapindex_handler() -> impl IntoResponse { SitemapIndexXml { sitemaps } } +#[derive(Template)] +#[template(path = "core/sitemap/_item.xml")] #[derive(Debug, Clone, PartialEq, Eq)] -struct SitemapRow { +struct SitemapItemXml { crate_name: String, last_modified: String, target_name: String, } -/// The sitemap -#[derive(Template)] -#[template(path = "core/sitemap.xml")] -#[derive(Debug, Clone, PartialEq, Eq)] -struct SitemapXml { - releases: Vec, -} - -impl_axum_webpage! { - SitemapXml, - content_type = "application/xml", -} +const SITEMAP_HEADER: &[u8] = include_bytes!("./../../templates/core/sitemap/_header.xml"); +const SITEMAP_FOOTER: &[u8] = include_bytes!("./../../templates/core/sitemap/_footer.xml"); pub(crate) async fn sitemap_handler( Path(letter): Path, @@ -67,37 +70,86 @@ pub(crate) async fn sitemap_handler( return Err(AxumNope::ResourceNotFound); } - let releases: Vec<_> = sqlx::query!( - r#"SELECT crates.name, - releases.target_name, - MAX(releases.release_time) as "release_time!" - FROM crates - INNER JOIN releases ON releases.crate_id = crates.id - WHERE - rustdoc_status = true AND - crates.name ILIKE $1 - GROUP BY crates.name, releases.target_name - "#, - format!("{letter}%"), - ) - .fetch(&mut *conn) - .map_ok(|row| SitemapRow { - crate_name: row.name, - target_name: row - .target_name - .expect("when we have rustdoc_status=true, this field is filled"), - last_modified: row - .release_time - // On Aug 27 2022 we added `` to all pages, - // so they should all get recrawled if they haven't been since then. - .max(Utc.with_ymd_and_hms(2022, 8, 28, 0, 0, 0).unwrap()) - .format("%+") - .to_string(), + let stream_span = Span::current(); + + let stream = stream!({ + let mut items: usize = 0; + let mut streamed_bytes: usize = SITEMAP_HEADER.len(); + + yield Ok(Bytes::from_static(SITEMAP_HEADER)); + + let result = sqlx::query!( + r#"SELECT crates.name, + releases.target_name, + MAX(releases.release_time) as "release_time!" + FROM crates + INNER JOIN releases ON releases.crate_id = crates.id + WHERE + rustdoc_status = true AND + crates.name ILIKE $1 + GROUP BY crates.name, releases.target_name + "#, + format!("{letter}%"), + ) + .fetch(&mut *conn); + + pin_mut!(result); + while let Some(row) = result.next().await { + let row = match row.context("error fetching row from database") { + Ok(row) => row, + Err(err) => { + report_error(&err); + yield Err(AxumNope::InternalError(err)); + break; + } + }; + + match (SitemapItemXml { + crate_name: row.name, + target_name: row + .target_name + .expect("when we have rustdoc_status=true, this field is filled"), + last_modified: row + .release_time + // On Aug 27 2022 we added `` to all pages, + // so they should all get recrawled if they haven't been since then. + .max(Utc.with_ymd_and_hms(2022, 8, 28, 0, 0, 0).unwrap()) + .format("%+") + .to_string(), + } + .render() + .context("error when rendering sitemap item xml")) + { + Ok(item) => { + let bytes = Bytes::from(item); + items += 1; + streamed_bytes += bytes.len(); + yield Ok(bytes); + } + Err(err) => { + report_error(&err); + yield Err(AxumNope::InternalError(err)); + break; + } + }; + } + + streamed_bytes += SITEMAP_FOOTER.len(); + yield Ok(Bytes::from_static(SITEMAP_FOOTER)); + + if items > 50_000 || streamed_bytes > 50 * 1024 * 1024 { + // alert when sitemap limits are reached + // https://developers.google.com/search/docs/crawling-indexing/sitemaps/build-sitemap#general-guidelines + error!(items, streamed_bytes, letter, "sitemap limits exceeded") + } }) - .try_collect() - .await?; + .instrument(stream_span); - Ok(SitemapXml { releases }) + Ok(( + StatusCode::OK, + TypedHeader(ContentType::from(mimes::APPLICATION_XML.clone())), + Body::from_stream(stream), + )) } #[derive(Template)] diff --git a/templates/core/sitemap.xml b/templates/core/sitemap.xml deleted file mode 100644 index 65c9ce571..000000000 --- a/templates/core/sitemap.xml +++ /dev/null @@ -1,15 +0,0 @@ - - - {% for release in releases -%} - - https://docs.rs/{{ release.crate_name }}/latest/{{ release.target_name }}/ - {{ release.last_modified|escape_xml }} - 1.0 - - - https://docs.rs/{{ release.crate_name }}/latest/{{ release.target_name }}/all.html - {{ release.last_modified|escape_xml }} - 0.8 - - {%- endfor %} - diff --git a/templates/core/sitemap/_footer.xml b/templates/core/sitemap/_footer.xml new file mode 100644 index 000000000..d8521b562 --- /dev/null +++ b/templates/core/sitemap/_footer.xml @@ -0,0 +1 @@ + diff --git a/templates/core/sitemap/_header.xml b/templates/core/sitemap/_header.xml new file mode 100644 index 000000000..669269cb3 --- /dev/null +++ b/templates/core/sitemap/_header.xml @@ -0,0 +1,2 @@ + + diff --git a/templates/core/sitemap/_item.xml b/templates/core/sitemap/_item.xml new file mode 100644 index 000000000..d3e56e9a3 --- /dev/null +++ b/templates/core/sitemap/_item.xml @@ -0,0 +1,10 @@ + + https://docs.rs/{{ crate_name }}/latest/{{ target_name }}/ + {{ last_modified|escape_xml }} + 1.0 + + + https://docs.rs/{{ crate_name }}/latest/{{ target_name }}/all.html + {{ last_modified|escape_xml }} + 0.8 + diff --git a/templates/core/sitemapindex.xml b/templates/core/sitemap/index.xml similarity index 100% rename from templates/core/sitemapindex.xml rename to templates/core/sitemap/index.xml From a5c9972a00e25275f784a988c2c59ea4ca3ee46d Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Fri, 21 Nov 2025 11:30:57 +0100 Subject: [PATCH 2/2] sitemaps: use inline constants instead of `include_bytes!` --- src/web/sitemap.rs | 6 ++++-- templates/core/sitemap/_footer.xml | 1 - templates/core/sitemap/_header.xml | 2 -- 3 files changed, 4 insertions(+), 5 deletions(-) delete mode 100644 templates/core/sitemap/_footer.xml delete mode 100644 templates/core/sitemap/_header.xml diff --git a/src/web/sitemap.rs b/src/web/sitemap.rs index ec6651464..3c26f8e13 100644 --- a/src/web/sitemap.rs +++ b/src/web/sitemap.rs @@ -55,8 +55,10 @@ struct SitemapItemXml { target_name: String, } -const SITEMAP_HEADER: &[u8] = include_bytes!("./../../templates/core/sitemap/_header.xml"); -const SITEMAP_FOOTER: &[u8] = include_bytes!("./../../templates/core/sitemap/_footer.xml"); +const SITEMAP_HEADER: &[u8] = br#" +\n"#; + +const SITEMAP_FOOTER: &[u8] = b"\n"; pub(crate) async fn sitemap_handler( Path(letter): Path, diff --git a/templates/core/sitemap/_footer.xml b/templates/core/sitemap/_footer.xml deleted file mode 100644 index d8521b562..000000000 --- a/templates/core/sitemap/_footer.xml +++ /dev/null @@ -1 +0,0 @@ - diff --git a/templates/core/sitemap/_header.xml b/templates/core/sitemap/_header.xml deleted file mode 100644 index 669269cb3..000000000 --- a/templates/core/sitemap/_header.xml +++ /dev/null @@ -1,2 +0,0 @@ - -