
Commit e24f8a9

Add Meta Conversions API data source
1 parent 964ec16 commit e24f8a9

5 files changed (+596 −0 lines changed)

design.md

Lines changed: 78 additions & 0 deletions
# Meta Conversions API Data Source Design

## Goal

Implement a PySpark Custom Data Source to write event data to the Meta Conversions API (CAPI). This enables Databricks users to send server-side events directly to Meta for ad optimization and measurement.

## Architecture

The implementation will follow PySpark's Python Data Source API (subclassing `pyspark.sql.datasource.DataSource`), implementing a write-only data source.
### Components

1. **MetaCapiDataSource**: The entry point, responsible for defining the name (`meta_capi`) and creating the writer.
2. **MetaCapiWriter**: Handles the execution of write operations.
   - Validates configuration (Access Token, Pixel ID).
   - Batches records (Meta CAPI supports up to 1000 events per request).
   - Transforms Spark Rows into CAPI-compliant JSON payloads.
   - Sends POST requests to the Graph API.
   - Handles responses and errors.
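A minimal sketch of how these two components could fit together under the Python Data Source API (Spark 4.0+). The class names mirror the components above; the method bodies are illustrative only, and the helpers are stubbed here and sketched in later sections. The streaming path used in the example notebook would additionally need a `streamWriter` / `DataSourceStreamWriter`, omitted for brevity.

```python
from pyspark.sql.datasource import DataSource, DataSourceWriter, WriterCommitMessage


class MetaCapiDataSource(DataSource):
    """Entry point: names the source and creates the writer."""

    @classmethod
    def name(cls):
        return "meta_capi"

    def writer(self, schema, overwrite):
        return MetaCapiWriter(self.options, schema)


class MetaCapiWriter(DataSourceWriter):
    """Executes the write: validate, batch, transform, POST."""

    def __init__(self, options, schema):
        self.access_token = options.get("access_token")
        self.pixel_id = options.get("pixel_id")
        if not self.access_token or not self.pixel_id:
            raise ValueError("access_token and pixel_id are required")
        self.api_version = options.get("api_version", "v19.0")
        # CAPI accepts at most 1000 events per request.
        self.batch_size = min(int(options.get("batch_size", "1000")), 1000)
        self.schema = schema

    def write(self, iterator):
        # Runs once per partition on the executors.
        batch = []
        for row in iterator:
            batch.append(self._transform_row_to_event(row))
            if len(batch) >= self.batch_size:
                self._send_batch(batch)
                batch = []
        if batch:
            self._send_batch(batch)
        return WriterCommitMessage()

    def _transform_row_to_event(self, row):
        ...  # sketched under "Schema & Data Mapping"

    def _send_batch(self, events):
        ...  # sketched under "API Details"
```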
## Configuration Options

The data source will support the following options via `.option()`:

- `access_token` (Required): Meta System User Access Token.
- `pixel_id` (Required): The Meta Pixel ID (Dataset ID).
- `api_version` (Optional): Graph API version (default: `v19.0`).
- `batch_size` (Optional): Number of events per API request (default: `1000`, which is also the CAPI maximum).
## Schema & Data Mapping
26+
The data source expects the input DataFrame to contain columns that map to the [Meta CAPI Event structure](https://developers.facebook.com/docs/marketing-api/conversions-api/parameters).
27+
28+
To improve usability, the writer will support two modes:
29+
1. **Structured Mode**: Users provide columns matching the API structure (e.g., a `user_data` struct column, `custom_data` struct column).
30+
2. **Flat Mode** (optional/auto-detected): If `user_data` struct is missing, the writer looks for flat columns with specific prefixes or names and constructs the nested structure.
31+
- `email` -> `user_data.em` (will apply SHA256 if not already hashed - *nice to have*)
32+
- `phone` -> `user_data.ph`
33+
- `client_ip_address` -> `user_data.client_ip_address`
34+
- `event_name` -> `event_name`
35+
- `event_time` -> `event_time` (converts timestamp to Unix integer)
36+
- `value` -> `custom_data.value`
37+
- `currency` -> `custom_data.currency`
38+
39+
*Decision*: For the initial implementation, we will prioritize **Structured Mode** correctness but add basic **Flat Mode** mapping for common fields (`email`, `event_name`, `event_time`, `value`, `currency`) to simplify the "Notebook" experience mentioned in the PRD.
40+
41+
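For Flat Mode, the `_transform_row_to_event` helper might look roughly like the sketch below (shown as a standalone function for readability). The "already hashed" heuristic and the treatment of missing fields are assumptions, since the design leaves them open.

```python
import hashlib
from datetime import datetime


def _transform_row_to_event(row):
    """Map a flat Spark Row to a CAPI event dict (Flat Mode sketch)."""
    d = row.asDict()

    # event_time may arrive as a timestamp; CAPI expects Unix seconds.
    event_time = d.get("event_time")
    if isinstance(event_time, datetime):
        event_time = int(event_time.timestamp())

    user_data = {}
    if d.get("email"):
        email = d["email"].strip().lower()
        # Hash only if the value does not already look like a SHA-256 hex digest.
        if len(email) != 64 or not all(c in "0123456789abcdef" for c in email):
            email = hashlib.sha256(email.encode("utf-8")).hexdigest()
        user_data["em"] = [email]
    if d.get("phone"):
        user_data["ph"] = [hashlib.sha256(d["phone"].encode("utf-8")).hexdigest()]
    if d.get("client_ip_address"):
        user_data["client_ip_address"] = d["client_ip_address"]

    return {
        "event_name": d.get("event_name"),
        "event_time": event_time,
        "action_source": d.get("action_source"),
        "user_data": user_data,
        "custom_data": {
            "currency": d.get("currency"),
            "value": d.get("value"),
        },
    }
```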
## API Details

- **Endpoint**: `https://graph.facebook.com/{api_version}/{pixel_id}/events`
- **Method**: `POST`
- **Headers**: `Content-Type: application/json`
- **Payload**:

```json
{
  "access_token": "...",
  "data": [
    {
      "event_name": "Purchase",
      "event_time": 1698765432,
      "action_source": "website",
      "user_data": {
        "em": ["7b..."],
        "ph": ["..."]
      },
      "custom_data": {
        "currency": "USD",
        "value": 100.0
      }
    }
  ]
}
```

*Note: `access_token` can be passed as a query parameter or in the request body. We will send it in the request body, as in the payload above.*
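A hedged sketch of the `_send_batch` helper from the plan below, shown as a standalone function with explicit arguments. It illustrates the POST call plus the intended retry-on-5xx / log-on-4xx behavior; the retry count and backoff are placeholder values, not decisions made in this design.

```python
import logging
import time

import requests

logger = logging.getLogger(__name__)


def _send_batch(events, access_token, pixel_id, api_version="v19.0", max_retries=3):
    """POST one batch of events to the Graph API (sketch)."""
    url = f"https://graph.facebook.com/{api_version}/{pixel_id}/events"
    payload = {"access_token": access_token, "data": events}

    for attempt in range(max_retries):
        resp = requests.post(url, json=payload, timeout=30)
        if resp.status_code < 400:
            return resp.json()  # e.g. {"events_received": ..., "fbtrace_id": ...}
        if resp.status_code < 500:
            # Client errors are not retried; log and fail the task.
            logger.error("Meta CAPI rejected batch: %s %s", resp.status_code, resp.text)
            resp.raise_for_status()
        # 5xx: back off and retry.
        time.sleep(2 ** attempt)
    raise RuntimeError(f"Meta CAPI request failed after {max_retries} retries")
```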
## Implementation Plan

1. Create `pyspark_datasources/meta_capi.py`.
2. Implement the `MetaCapiDataSource` class.
3. Implement the `MetaCapiWriter` class.
4. Implement a helper method `_transform_row_to_event(row)` to handle schema mapping and type conversion (e.g., datetime to Unix timestamp).
5. Implement `_send_batch(events)` using the `requests` library.
6. Add error handling (retries for 5xx, logging for 4xx).

## Dependencies

- `requests`: For HTTP calls.

examples/meta_capi_example.py

Lines changed: 156 additions & 0 deletions
```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Meta Conversions API (CAPI) Datasource Example

This example demonstrates how to use the MetaCapiDataSource as a streaming datasource
to write event data to Meta for ad optimization.

Requirements:
- PySpark
- requests
- Valid Meta System User Access Token and Pixel ID

Setup:
    pip install pyspark requests

Environment Variables:
    export META_ACCESS_TOKEN="your-access-token"
    export META_PIXEL_ID="your-pixel-id"
"""

import os
import tempfile
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, current_timestamp, unix_timestamp
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType


def check_credentials():
    """Check if Meta credentials are available"""
    token = os.getenv("META_ACCESS_TOKEN")
    pixel_id = os.getenv("META_PIXEL_ID")

    if not all([token, pixel_id]):
        print("❌ Missing Meta credentials!")
        print("Please set the following environment variables:")
        print("  export META_ACCESS_TOKEN='your-access-token'")
        print("  export META_PIXEL_ID='your-pixel-id'")
        return False, None, None

    print(f"✅ Using Pixel ID: {pixel_id}")
    return True, token, pixel_id


def example_1_rate_source_to_capi():
    """Example 1: Stream simulated purchases to Meta CAPI"""
    print("\n" + "=" * 60)
    print("EXAMPLE 1: Simulated Purchases → Meta CAPI (Streaming)")
    print("=" * 60)

    has_creds, token, pixel_id = check_credentials()
    if not has_creds:
        return

    spark = SparkSession.builder.appName("MetaCapiExample1").getOrCreate()

    try:
        from pyspark_datasources.meta_capi import MetaCapiDataSource
        spark.dataSource.register(MetaCapiDataSource)
        print("✅ Meta CAPI datasource registered")

        # Create streaming data (simulating 1 purchase per second)
        streaming_df = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

        # Transform to CAPI format (Flat Mode)
        # We simulate user data. In production, this comes from your tables.
        events_df = streaming_df.select(
            lit("Purchase").alias("event_name"),
            col("timestamp").alias("event_time"),
            lit("test@example.com").alias("email"),  # Will be auto-hashed
            lit("website").alias("action_source"),
            (col("value") * 10.0 + 5.0).alias("value"),
            lit("USD").alias("currency"),
            lit("TEST12345").alias("test_event_code"),  # For testing in Events Manager
        )

        print("📊 Starting streaming write to Meta CAPI...")
        print("   Check your Events Manager 'Test Events' tab!")

        # Write to Meta CAPI
        query = (
            events_df.writeStream.format("meta_capi")
            .option("access_token", token)
            .option("pixel_id", pixel_id)
            .option("test_event_code", "TEST12345")  # Optional: direct test code option
            .option("batch_size", "10")
            .option("checkpointLocation", "/tmp/meta_capi_example1_checkpoint")
            .trigger(processingTime="10 seconds")
            .start()
        )

        # Run for 30 seconds then stop
        time.sleep(30)
        query.stop()
        print("✅ Streaming stopped")

    except Exception as e:
        print(f"❌ Error: {e}")
    finally:
        spark.stop()


def example_2_batch_dataframe_to_capi():
    """Example 2: Batch write a static DataFrame to Meta CAPI"""
    print("\n" + "=" * 60)
    print("EXAMPLE 2: Static DataFrame → Meta CAPI (Batch)")
    print("=" * 60)

    has_creds, token, pixel_id = check_credentials()
    if not has_creds:
        return

    spark = SparkSession.builder.appName("MetaCapiExample2").getOrCreate()

    try:
        from pyspark_datasources.meta_capi import MetaCapiDataSource
        spark.dataSource.register(MetaCapiDataSource)
        print("✅ Meta CAPI datasource registered")

        # Create sample data
        data = [
            ("Purchase", 1700000001, "user1@example.com", 120.50, "USD"),
            ("Purchase", 1700000002, "user2@example.com", 85.00, "USD"),
            ("AddToCart", 1700000003, "user3@example.com", 25.99, "USD"),
        ]
        columns = ["event_name", "event_time", "email", "value", "currency"]
        df = spark.createDataFrame(data, columns)

        # Add optional fields
        df = df.withColumn("action_source", lit("website")) \
               .withColumn("test_event_code", lit("TEST12345"))

        print(f"📊 Writing {df.count()} records to Meta CAPI in batch mode...")
        print("   Check your Events Manager 'Test Events' tab!")

        # Write to Meta CAPI (Batch)
        df.write.format("meta_capi") \
            .option("access_token", token) \
            .option("pixel_id", pixel_id) \
            .option("test_event_code", "TEST12345") \
            .option("batch_size", "50") \
            .save()

        print("✅ Batch write completed")

    except Exception as e:
        print(f"❌ Error: {e}")
    finally:
        spark.stop()


def main():
    print("🚀 Meta CAPI Datasource Example")
    example_1_rate_source_to_capi()
    example_2_batch_dataframe_to_capi()


if __name__ == "__main__":
    main()
```

pyspark_datasources/__init__.py

Lines changed: 1 addition & 0 deletions
```python
from .simplejson import SimpleJsonDataSource
from .stock import StockDataSource
from .jsonplaceholder import JSONPlaceholderDataSource
from .meta_capi import MetaCapiDataSource
```
