6 changes: 2 additions & 4 deletions Cargo.toml
@@ -7,9 +7,7 @@ authors = ["WalletConnect Team"]
license = "MIT"

[workspace]
members = [
"crates/*",
]
members = ["crates/*"]

[features]
default = []
@@ -23,7 +21,7 @@ full = [
"geoip",
"metrics",
"rate_limit",
"websocket"
"websocket",
]
alloc = ["dep:alloc"]
analytics = ["dep:analytics"]
34 changes: 28 additions & 6 deletions crates/analytics/Cargo.toml
@@ -3,20 +3,42 @@ name = "analytics"
version = "0.1.0"
edition = "2021"

[features]
default = []
full = ["parquet-native", "parquet-serde"]
parquet-native = ["parquet"]
parquet-serde = ["parquet", "dep:serde", "dep:serde_arrow"]
parquet = ["dep:parquet", "dep:arrow"]

[dependencies]
future = { path = "../future" }

async-trait = "0.1"
tokio = { version = "1", default-features = false, features = ["rt", "rt-multi-thread", "sync", "time", "macros"] }
tokio = { version = "1", default-features = false, features = [
"rt",
"rt-multi-thread",
"sync",
"time",
"macros",
] }
tracing = "0.1"

# Parquet serialization
parquet = { optional = true, version = "57.1", default-features = false, features = [
"arrow",
"flate2-rust_backened",
] }
arrow = { optional = true, version = "57.1" }
serde_arrow = { optional = true, version = "0.13", features = ["arrow-57"] }
serde = { optional = true, version = "1.0", features = ["derive", "rc"] }

# Misc
thiserror = "1.0"
anyhow = "1"
tap = "1.0"

chrono = { version = "0.4" }
aws-sdk-s3.workspace = true
bytes = "1.5"
parquet = { git = "https://github.com/WalletConnect/arrow-rs.git", rev = "e862f32", default-features = false, features = ["flate2-rust_backened"] }
parquet_derive = { git = "https://github.com/WalletConnect/arrow-rs.git", rev = "e862f32" }
bytes = "1.11"

[dev-dependencies]
analytics = { path = ".", features = ["full"] }
parquet_derive = "57.1"
2 changes: 1 addition & 1 deletion crates/analytics/src/lib.rs
@@ -9,7 +9,7 @@ use {
pub use {
collectors::{BatchCollector, CollectionError, CollectorConfig},
exporters::{AwsConfig, AwsError, AwsExporter, NoopExporter},
serializers::{NoopBatchFactory, ParquetBatchFactory, ParquetConfig, ParquetError},
serializers::*,
};

mod collectors;
109 changes: 5 additions & 104 deletions crates/analytics/src/serializers.rs
@@ -1,14 +1,11 @@
pub use parquet::errors::ParquetError;
use {
crate::{AnalyticsEvent, Batch, BatchFactory},
parquet::{
basic::Compression,
file::{properties::WriterProperties, writer::SerializedFileWriter},
record::RecordWriter,
},
std::{convert::Infallible, sync::Arc},
crate::{Batch, BatchFactory},
std::convert::Infallible,
};

#[cfg(feature = "parquet")]
pub mod parquet;

pub struct NoopBatchFactory;

impl<T> BatchFactory<T> for NoopBatchFactory {
@@ -41,99 +38,3 @@ impl<T> Batch<T> for NoopBatch {
Ok(Vec::new())
}
}

#[derive(Debug, Clone)]
pub struct ParquetConfig {
/// The maximum number of records the batch can hold. Pushing more records
/// will trigger export.
pub batch_capacity: usize,

/// The data buffer initially allocated for serialization. Specifying a low
/// value would cause memory reallocation potentially affecting performance.
pub alloc_buffer_size: usize,
}

impl Default for ParquetConfig {
fn default() -> Self {
Self {
batch_capacity: 1024 * 128,
alloc_buffer_size: 1024 * 1024 * 130,
}
}
}

pub struct ParquetBatchFactory {
config: ParquetConfig,
}

impl ParquetBatchFactory {
pub fn new(config: ParquetConfig) -> Self {
Self { config }
}
}

impl<T> BatchFactory<T> for ParquetBatchFactory
where
T: AnalyticsEvent,
for<'a> &'a [T]: RecordWriter<T>,
{
type Batch = ParquetBatch<T>;
type Error = ParquetError;

fn create(&self) -> Result<Self::Batch, Self::Error> {
let props = WriterProperties::builder()
.set_compression(Compression::GZIP(Default::default()))
.build();
let props = Arc::new(props);
let schema = (&[] as &[T]).schema()?;

Ok(ParquetBatch {
capacity: self.config.batch_capacity,
data: Vec::with_capacity(self.config.batch_capacity),
writer: SerializedFileWriter::new(
Vec::with_capacity(self.config.alloc_buffer_size),
schema,
props,
)?,
})
}
}

pub struct ParquetBatch<T> {
capacity: usize,
data: Vec<T>,
writer: SerializedFileWriter<Vec<u8>>,
}

impl<T> Batch<T> for ParquetBatch<T>
where
T: AnalyticsEvent,
for<'a> &'a [T]: RecordWriter<T>,
{
type Error = ParquetError;

fn push(&mut self, data: T) -> Result<(), Self::Error> {
self.data.push(data);
Ok(())
}

fn is_full(&self) -> bool {
self.data.len() >= self.capacity
}

fn is_empty(&self) -> bool {
self.data.is_empty()
}

fn serialize(mut self) -> Result<Vec<u8>, Self::Error> {
let mut row_group_writer = self.writer.next_row_group()?;

self.data
.as_slice()
.write_to_row_group(&mut row_group_writer)?;

row_group_writer.close()?;

self.writer.into_inner()
}
}
14 changes: 14 additions & 0 deletions crates/analytics/src/serializers/parquet.rs
@@ -0,0 +1,14 @@
use {
parquet::schema::{parser::parse_message_type, types::TypePtr},
std::sync::Arc,
};

#[cfg(feature = "parquet-native")]
pub mod native;
#[cfg(feature = "parquet-serde")]
pub mod serde;

/// Returns `parquet` schema from parsing the string.
pub fn schema_from_str(schema: &str) -> anyhow::Result<TypePtr> {
Ok(Arc::new(parse_message_type(schema)?))
}
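
For illustration, a caller could build a schema from a textual Parquet message type. This is a minimal sketch, assuming the glob re-export in lib.rs exposes the module as analytics::parquet and that the "parquet" feature is enabled; the schema string and field names are hypothetical:

use analytics::parquet::schema_from_str;

// Hypothetical event schema; fields are illustrative only.
let schema = schema_from_str(
    "message example_event {
        REQUIRED BYTE_ARRAY name (UTF8);
        REQUIRED INT64 timestamp;
    }",
)?;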
140 changes: 140 additions & 0 deletions crates/analytics/src/serializers/parquet/native.rs
@@ -0,0 +1,140 @@
pub use parquet::{self, errors::ParquetError as Error};
use {
crate::AnalyticsEvent,
parquet::{
basic::Compression,
file::{properties::WriterProperties, writer::SerializedFileWriter},
record::RecordWriter,
schema::types::TypePtr,
},
std::{marker::PhantomData, sync::Arc},
};

/// Returns `parquet` schema based on the [`RecordWriter<T>`] implementation.
pub fn schema<T>() -> Result<TypePtr, Error>
where
for<'a> &'a [T]: RecordWriter<T>,
{
(&[] as &[T]).schema()
}

#[derive(Debug, Clone)]
pub struct Config {
/// The maximum number of records the batch can hold. Pushing more records
/// will trigger export.
///
/// Default value: `1024 * 128`.
pub batch_capacity: usize,

/// The data buffer initially allocated for serialization. Specifying a low
/// value would cause memory reallocation potentially affecting performance.
///
/// Default value: `1024 * 1024 * 130`.
pub alloc_buffer_size: usize,

/// Native [`parquet`] [`WriterProperties`]. Configures parquet export
/// parameters.
///
/// Default value: `WriterProperties::default()` with enabled GZIP
/// compression.
pub writer_properties: WriterProperties,
}

impl Default for Config {
fn default() -> Self {
Self {
batch_capacity: 1024 * 128,
alloc_buffer_size: 1024 * 1024 * 130,
writer_properties: WriterProperties::builder()
.set_compression(Compression::GZIP(Default::default()))
.build(),
}
}
}

pub struct BatchFactory<T> {
batch_capacity: usize,
alloc_buffer_size: usize,
writer_properties: Arc<WriterProperties>,
schema: TypePtr,
_marker: PhantomData<T>,
}

impl<T> BatchFactory<T>
where
T: AnalyticsEvent,
for<'a> &'a [T]: RecordWriter<T>,
{
pub fn new(config: Config) -> Result<Self, Error> {
Ok(Self {
batch_capacity: config.batch_capacity,
alloc_buffer_size: config.alloc_buffer_size,
writer_properties: Arc::new(config.writer_properties),
schema: (&[] as &[T]).schema()?,
_marker: PhantomData,
})
}
}

impl<T> crate::BatchFactory<T> for BatchFactory<T>
where
T: AnalyticsEvent,
for<'a> &'a [T]: RecordWriter<T>,
{
type Batch = Batch<T>;
type Error = Error;

fn create(&self) -> Result<Self::Batch, Self::Error> {
let writer = SerializedFileWriter::new(
Vec::with_capacity(self.alloc_buffer_size),
self.schema.clone(),
self.writer_properties.clone(),
)?;

Ok(Batch {
capacity: self.batch_capacity,
data: Vec::with_capacity(self.batch_capacity),
writer,
})
}
}

pub struct Batch<T> {
capacity: usize,
data: Vec<T>,
writer: SerializedFileWriter<Vec<u8>>,
}

impl<T> crate::Batch<T> for Batch<T>
where
T: AnalyticsEvent,
for<'a> &'a [T]: RecordWriter<T>,
{
type Error = Error;

fn push(&mut self, data: T) -> Result<(), Self::Error> {
self.data.push(data);
Ok(())
}

fn is_full(&self) -> bool {
self.data.len() >= self.capacity
}

fn is_empty(&self) -> bool {
self.data.is_empty()
}

fn serialize(mut self) -> Result<Vec<u8>, Self::Error> {
let mut row_group_writer = self.writer.next_row_group()?;

self.data
.as_slice()
.write_to_row_group(&mut row_group_writer)?;

row_group_writer.close()?;

self.writer.flush()?;
self.writer.into_inner()
}
}
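
Putting the pieces together, a minimal end-to-end sketch of the native backend could look like the following. The event type, its derive, and the module path are assumptions for illustration: parquet_derive (listed in dev-dependencies above) supplies the RecordWriter impl, the example event is assumed to satisfy the crate's AnalyticsEvent bound, and analytics::parquet::native relies on the glob re-export in lib.rs:

use {
    analytics::{parquet::native, Batch as _, BatchFactory as _},
    parquet_derive::ParquetRecordWriter,
};

// Hypothetical event type; the derive implements `RecordWriter` for `&[ExampleEvent]`.
// Assumed to also satisfy the crate's `AnalyticsEvent` bound (not shown in this diff).
#[derive(ParquetRecordWriter)]
struct ExampleEvent {
    name: String,
    timestamp: i64,
}

fn example() -> anyhow::Result<()> {
    // Defaults: 128K-record batches, a 130 MiB write buffer, GZIP compression.
    let factory = native::BatchFactory::<ExampleEvent>::new(native::Config::default())?;

    let mut batch = factory.create()?;
    batch.push(ExampleEvent {
        name: "example".to_owned(),
        timestamp: 0,
    })?;

    // Consumes the batch and yields a GZIP-compressed Parquet file in memory.
    let bytes = batch.serialize()?;
    assert!(!bytes.is_empty());

    Ok(())
}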