# Example Adapter Implementation

This document provides a complete, working example of a BioCypher adapter that demonstrates the patterns and best practices described in llms-adapters.txt.

## Complete GEO Adapter Example

```python
#!/usr/bin/env python3
"""
Example GEO Adapter for BioCypher

This adapter demonstrates the complete pattern for creating a BioCypher adapter
that transforms NCBI GEO data into BioCypher's canonical format.
"""

import GEOparse
from typing import Iterator, Tuple, Dict, Any
from biocypher._get import FileDownload


class GEOAdapter:
    """
    BioCypher adapter for NCBI GEO data.

    Transforms the series, sample and platform metadata of one GEO series
    into node and edge tuples whose labels match the ``input_label`` entries
    of the schema configuration.

    Node tuples are ``(node_id, node_label, attributes_dict)``; edge tuples
    are ``(edge_id, source_id, target_id, edge_label, attributes_dict)``.
    Every attribute dict carries ``source`` and ``version`` for provenance.

    Note:
        Metadata values are passed through exactly as GEOparse provides
        them. GEOparse is understood to store each metadata value as a list
        of strings (confirm against the GEOparse documentation), so
        attribute values may be lists rather than plain strings.
    """

    def __init__(self, gse_id: str, schema_config: Dict[str, Any] = None):
        """
        Initialize the GEO adapter and load the series immediately.

        Args:
            gse_id: GEO Series ID (e.g., "GSE12345")
            schema_config: Schema configuration for validation. When it is
                ``None`` or empty, ``validate()`` performs no checks.

        Raises:
            RuntimeError: If the series cannot be loaded.
        """
        self.gse_id = gse_id
        self.schema_config = schema_config
        self.series = None
        self._load_data()

    def _load_data(self):
        """
        Load the GEO series with GEOparse and store it on ``self.series``.

        Raises:
            RuntimeError: If loading fails for any reason. The underlying
                exception is chained as ``__cause__`` so the original
                traceback is preserved.
        """
        try:
            self.series = GEOparse.get_GEO(self.gse_id)
            print(f"Loaded GEO series: {self.series.name}")
        except Exception as e:
            raise RuntimeError(f"Failed to load GEO series {self.gse_id}: {e}") from e

    @staticmethod
    def _schema_properties(entries):
        """Map each schema entry's ``input_label`` to the property names it lists."""
        return {
            entry["input_label"]: tuple(entry.get("properties") or ())
            for entry in entries
        }

    @staticmethod
    def _check_properties(kind, item_id, label, attributes, required):
        """Raise ``ValueError`` if any required property key is absent from attributes."""
        missing = [name for name in required if name not in attributes]
        if missing:
            raise ValueError(
                f"{kind} '{item_id}' with label '{label}' is missing properties: "
                f"{', '.join(map(str, missing))}"
            )

    def _validate_schema_compliance(self):
        """
        Validate that adapter outputs match schema requirements.

        Checks that every emitted node/edge label appears as an
        ``input_label`` in the schema, and that every property listed for
        that label is present as a key in the emitted attribute dict.
        A property whose value is ``None`` still counts as present.

        Raises:
            ValueError: On an unknown label or a missing property.
        """
        if not self.schema_config:
            return

        node_schema = self._schema_properties(self.schema_config.get("nodes", []))
        edge_schema = self._schema_properties(self.schema_config.get("edges", []))

        # Validate node labels and properties
        for node_id, node_label, attributes in self.get_nodes():
            if node_label not in node_schema:
                raise ValueError(f"Node label '{node_label}' not in schema")
            self._check_properties(
                "Node", node_id, node_label, attributes, node_schema[node_label]
            )

        # Validate edge labels and properties
        for edge_id, _, _, edge_label, attributes in self.get_edges():
            if edge_label not in edge_schema:
                raise ValueError(f"Edge label '{edge_label}' not in schema")
            self._check_properties(
                "Edge", edge_id, edge_label, attributes, edge_schema[edge_label]
            )

    def _safe_extract(self, metadata: Dict[str, Any], key: str, default: Any = None) -> Any:
        """
        Safely extract metadata with fallback.

        Args:
            metadata: Metadata dictionary
            key: Key to extract
            default: Default value if key not found

        Returns:
            Extracted value, or ``default`` when the key is absent or
            ``metadata`` does not support ``.get`` (e.g. it is ``None``).
        """
        try:
            return metadata.get(key, default)
        except (AttributeError, KeyError):
            return default

    def _provenance(self) -> Dict[str, Any]:
        """Return the ``source``/``version`` attributes shared by every node and edge."""
        return {
            "source": "GEO",
            "version": self._safe_extract(self.series.metadata, "submission_date"),
        }

    def _platform_ids(self, sample) -> Iterator[Any]:
        """
        Yield the non-empty platform IDs referenced by a sample.

        Accepts ``platform_id`` either as a single value or as a list/tuple
        of values, so a list-valued entry is never used directly as a
        dictionary key (which would raise ``TypeError``).
        """
        raw = self._safe_extract(sample.metadata, "platform_id")
        if not raw:
            return
        candidates = raw if isinstance(raw, (list, tuple)) else [raw]
        for candidate in candidates:
            if candidate:
                yield candidate

    def get_nodes(self) -> Iterator[Tuple[str, str, Dict[str, Any]]]:
        """
        Generate nodes from GEO data.

        Emits one series node, then one node per sample, then one node per
        platform attached to the series.

        Yields:
            Tuple of (node_id, node_label, attributes_dict)
        """
        series_meta = self.series.metadata

        # Series node
        yield (
            f"GEO:{self.series.name}",    # node_id
            "geo_series",                 # node_label (matches schema input_label)
            {
                "title": self._safe_extract(series_meta, "title"),
                "summary": self._safe_extract(series_meta, "summary"),
                "submission_date": self._safe_extract(series_meta, "submission_date"),
                "last_update_date": self._safe_extract(series_meta, "last_update_date"),
                **self._provenance(),
            },
        )

        # Sample nodes
        for sample_name, sample in self.series.gsms.items():
            yield (
                f"GEO:{sample_name}",     # node_id
                "geo_sample",             # node_label (matches schema input_label)
                {
                    "disease": self._safe_extract(sample.metadata, "disease_state"),
                    "organism": self._safe_extract(sample.metadata, "organism_ch1"),
                    "tissue": self._safe_extract(sample.metadata, "tissue_ch1"),
                    "cell_type": self._safe_extract(sample.metadata, "cell_type_ch1"),
                    "treatment": self._safe_extract(sample.metadata, "treatment_protocol_ch1"),
                    **self._provenance(),
                },
            )

        # Platform nodes (if available)
        for platform_name, platform in self.series.gpls.items():
            yield (
                f"GEO:{platform_name}",   # node_id
                "geo_platform",           # node_label (matches schema input_label)
                {
                    "title": self._safe_extract(platform.metadata, "title"),
                    "technology": self._safe_extract(platform.metadata, "technology"),
                    **self._provenance(),
                },
            )

    def get_edges(self) -> Iterator[Tuple[str, str, str, str, Dict[str, Any]]]:
        """
        Generate edges from GEO data.

        Emits all series-to-sample edges first, then the sample-to-platform
        edges. A sample-to-platform edge is only emitted when the referenced
        platform is part of the loaded series, so no edge points at a node
        that ``get_nodes`` does not produce.

        Yields:
            Tuple of (edge_id, source_id, target_id, edge_label, attributes_dict)
        """
        series_name = self.series.name

        # Series to Sample relationships
        for sample_name in self.series.gsms:
            yield (
                f"series_sample_{series_name}_{sample_name}",  # edge_id
                f"GEO:{series_name}",                          # source_id (series)
                f"GEO:{sample_name}",                          # target_id (sample)
                "HAS_SAMPLE",                                  # edge_label (matches schema input_label)
                self._provenance(),
            )

        # Sample to Platform relationships
        for sample_name, sample in self.series.gsms.items():
            for platform_name in self._platform_ids(sample):
                if platform_name not in self.series.gpls:
                    continue
                yield (
                    f"sample_platform_{sample_name}_{platform_name}",  # edge_id
                    f"GEO:{sample_name}",                               # source_id (sample)
                    f"GEO:{platform_name}",                             # target_id (platform)
                    "USES_PLATFORM",                                    # edge_label (matches schema input_label)
                    self._provenance(),
                )

    def validate(self) -> bool:
        """
        Validate the adapter output against schema.

        Returns:
            True if validation passes, raises exception otherwise

        Raises:
            ValueError: If a label is not in the schema or a listed
                property is missing from an emitted node or edge.
        """
        self._validate_schema_compliance()
        return True


# Usage Example
def main():
    """
    Example usage of the GEO adapter.

    Builds a schema configuration, loads the example series, validates the
    adapter output against the schema, and prints a summary together with
    the first generated node and edge (when any exist).
    """

    # Example schema configuration
    schema_config = {
        "nodes": [
            {
                "input_label": "geo_series",
                "properties": ["title", "summary", "source", "version"]
            },
            {
                "input_label": "geo_sample",
                "properties": ["disease", "organism", "source", "version"]
            },
            {
                "input_label": "geo_platform",
                "properties": ["title", "technology", "source", "version"]
            }
        ],
        "edges": [
            {
                "input_label": "HAS_SAMPLE",
                "source": "geo_series",
                "target": "geo_sample",
                "properties": ["source", "version"]
            },
            {
                "input_label": "USES_PLATFORM",
                "source": "geo_sample",
                "target": "geo_platform",
                "properties": ["source", "version"]
            }
        ]
    }

    # Create adapter
    adapter = GEOAdapter("GSE12345", schema_config)

    # Validate
    adapter.validate()

    # Generate nodes and edges (materialized here only so they can be counted)
    nodes = list(adapter.get_nodes())
    edges = list(adapter.get_edges())

    print(f"Generated {len(nodes)} nodes and {len(edges)} edges")

    # Example outputs; a series without samples yields no edges, so guard
    # against indexing into an empty list.
    for heading, items in (("node", nodes), ("edge", edges)):
        print(f"\nExample {heading}:")
        print(items[0] if items else f"(no {heading}s generated)")


if __name__ == "__main__":
    main()
```

## Key Features Demonstrated

### 1. Schema Compliance
- Validates node and edge labels against schema configuration
- Ensures all required properties are present
- Raises clear error messages for violations

### 2. Error Handling
- Safe metadata extraction with fallbacks
- Graceful handling of missing or malformed data
- Comprehensive exception handling

### 3. Data Transformation
- Converts GEO metadata to BioCypher canonical format
- Handles multiple entity types (series, samples, platforms)
- Creates meaningful relationships between entities

### 4. Provenance Tracking
- Includes source and version information
- Maintains data lineage
- Supports strict mode requirements

### 5. Memory Efficiency
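- Uses generators so nodes and edges are yielded one at a time
- Avoids building full node and edge lists inside the adapter (note that the parsed GEO series itself is still loaded into memory by GEOparse)
- Supports incremental processing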

## Integration with BioCypher

```python
from biocypher import BioCypher

# NOTE: GEOAdapter is the adapter class from the complete example above;
# define or import it before running this snippet.

# Initialize BioCypher (no arguments are passed here)
bc = BioCypher()

# Create adapter; no schema_config is passed, so adapter.validate() would
# return True without checking anything
adapter = GEOAdapter("GSE12345")

# Add nodes and edges to BioCypher; both are handed over as generators
bc.add_nodes(adapter.get_nodes())
bc.add_edges(adapter.get_edges())

# Write to database
# NOTE(review): confirm that `write()` exists in the installed BioCypher
# version; its output methods may instead be `write_nodes()` /
# `write_edges()` — TODO verify against the BioCypher API reference.
bc.write()
```

This example demonstrates all the patterns and best practices described in llms-adapters.txt and can serve as a template for creating new adapters.

## Related Files

- **llms-adapters.txt** - Adapter creation guide
- **llms.txt** - Functionality index and reference