Skip to main content

Analytics Workflow Overview

Learning Focus

Use this lesson to understand the complete DuckDB analytics workflow from raw data to actionable insights.

The DuckDB Analytics Loop

Step 1: Load and Explore

-- Preview raw data — no import needed
SELECT * FROM 'raw_sales.csv' LIMIT 10;

-- Get column types and nullability
DESCRIBE SELECT * FROM 'raw_sales.csv';

-- Statistical summary of every column in one command
SUMMARIZE SELECT * FROM 'raw_sales.csv';

SUMMARIZE returns: count, null count, min, max, avg, std, and distinct count for each column — a one-shot data quality check.

Step 2: Clean and Transform

-- Create a clean version of the dataset
CREATE TABLE clean_sales AS
SELECT
id,
UPPER(TRIM(product_name)) AS product_name,
LOWER(TRIM(region)) AS region,
COALESCE(revenue, 0) AS revenue,
strptime(sale_date, '%d/%m/%Y')::DATE AS sale_date,
revenue / NULLIF(units_sold, 0) AS avg_unit_price
FROM 'raw_sales.csv'
WHERE id IS NOT NULL
AND revenue > 0
AND sale_date IS NOT NULL;

-- Verify the clean result
SELECT COUNT(*) AS clean_rows FROM clean_sales;
SUMMARIZE clean_sales;

Step 3: Analyze

-- Revenue by region and month
SELECT
region,
date_trunc('month', sale_date) AS month,
COUNT(*) AS transactions,
SUM(revenue) AS total_revenue,
ROUND(AVG(revenue), 2) AS avg_order_value
FROM clean_sales
GROUP BY region, month
ORDER BY region, month;

-- Top 10 products overall
SELECT product_name, SUM(revenue) AS total_revenue
FROM clean_sales
GROUP BY product_name
ORDER BY total_revenue DESC
LIMIT 10;

-- Month-over-month growth using window functions
WITH monthly AS (
SELECT
date_trunc('month', sale_date) AS month,
SUM(revenue) AS revenue
FROM clean_sales
GROUP BY month
)
SELECT
month,
revenue,
lag(revenue) OVER (ORDER BY month) AS prev_month,
ROUND((revenue - lag(revenue) OVER (ORDER BY month))
/ lag(revenue) OVER (ORDER BY month) * 100, 1) AS mom_growth_pct
FROM monthly
ORDER BY month;

Step 4: Export Results

-- Export summary to Parquet for BI tool
COPY (
SELECT region, date_trunc('month', sale_date) AS month, SUM(revenue) AS revenue
FROM clean_sales
GROUP BY region, month
) TO 'output/monthly_revenue.parquet' (FORMAT PARQUET, COMPRESSION ZSTD);

-- Export top products to CSV for stakeholders
COPY (
SELECT product_name, SUM(revenue) AS total_revenue
FROM clean_sales
GROUP BY product_name
ORDER BY total_revenue DESC
LIMIT 50
) TO 'output/top_products.csv' (FORMAT CSV, HEADER true);

Step 5: Automate with a SQL Script

-- daily_analytics.sql — run with: duckdb analytics.db < daily_analytics.sql
LOAD httpfs;

-- Fetch latest data from S3
CREATE OR REPLACE TABLE raw_sales AS
SELECT * FROM read_parquet('s3://my-bucket/sales/daily/*.parquet');

-- Clean
CREATE OR REPLACE TABLE clean_sales AS
SELECT *
FROM raw_sales
WHERE revenue > 0 AND sale_date IS NOT NULL;

-- Export
COPY (
SELECT region, SUM(revenue) AS revenue
FROM clean_sales
WHERE sale_date >= current_date - INTERVAL '30 days'
GROUP BY region
) TO '/reports/region_30d.csv' (FORMAT CSV, HEADER true);

What's Next