API Documentation

Complete API reference for GraphForge.

Main API

graphforge.api

High-level API for GraphForge.

This module provides the main public interface for GraphForge.

QueryInput

Bases: BaseModel

Validates openCypher query input.

Source code in src/graphforge/api.py
class QueryInput(BaseModel):
    """Validates openCypher query input."""

    query: str = Field(..., min_length=1, description="openCypher query string")

    @field_validator("query")
    @classmethod
    def validate_query(cls, v: str) -> str:
        """Validate query is not just whitespace."""
        if not v.strip():
            raise ValueError("Query cannot be empty or whitespace only")
        return v

    model_config = {"frozen": True}

validate_query(v) classmethod

Validate query is not just whitespace.

Source code in src/graphforge/api.py
@field_validator("query")
@classmethod
def validate_query(cls, v: str) -> str:
    """Validate query is not just whitespace."""
    if not v.strip():
        raise ValueError("Query cannot be empty or whitespace only")
    return v
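
For illustration, here is a minimal sketch of using QueryInput directly to pre-validate a query string before execution. It assumes only documented behavior: the model lives in graphforge.api, and pydantic wraps the ValueError raised inside the validator in a ValidationError.

from pydantic import ValidationError

from graphforge.api import QueryInput

# A non-empty query passes validation; the model is frozen after creation.
ok = QueryInput(query="MATCH (n) RETURN n")
print(ok.query)

# A whitespace-only query is rejected by validate_query.
try:
    QueryInput(query="   ")
except ValidationError as exc:
    print(exc.errors()[0]["msg"])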

NodeInput

Bases: BaseModel

Validates node creation input.

Source code in src/graphforge/api.py
class NodeInput(BaseModel):
    """Validates node creation input."""

    labels: list[str] = Field(default_factory=list, description="Node labels")

    @field_validator("labels")
    @classmethod
    def validate_labels(cls, v: list[str]) -> list[str]:
        """Validate label names."""
        for label in v:
            if not label:
                raise ValueError("Label cannot be empty string")
            if not label[0].isalpha():
                raise ValueError(f"Label must start with a letter: {label}")
            if not label.replace("_", "").isalnum():
                raise ValueError(f"Label must contain only alphanumeric and underscore: {label}")
        return v

    model_config = {"frozen": True}

validate_labels(v) classmethod

Validate label names.

Source code in src/graphforge/api.py
@field_validator("labels")
@classmethod
def validate_labels(cls, v: list[str]) -> list[str]:
    """Validate label names."""
    for label in v:
        if not label:
            raise ValueError("Label cannot be empty string")
        if not label[0].isalpha():
            raise ValueError(f"Label must start with a letter: {label}")
        if not label.replace("_", "").isalnum():
            raise ValueError(f"Label must contain only alphanumeric and underscore: {label}")
    return v
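
As a small sketch of the label rules above (labels must start with a letter and contain only alphanumerics and underscores), NodeInput can be exercised on its own; the import path assumes the model is exported from graphforge.api as shown in the source.

from pydantic import ValidationError

from graphforge.api import NodeInput

# Valid labels: start with a letter, alphanumeric/underscore only.
NodeInput(labels=["Person", "Tech_Employee"])

# Invalid label: starts with a digit, so validate_labels raises.
try:
    NodeInput(labels=["1Person"])
except ValidationError as exc:
    print(exc.errors()[0]["msg"])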

RelationshipInput

Bases: BaseModel

Validates relationship creation input.

Source code in src/graphforge/api.py
class RelationshipInput(BaseModel):
    """Validates relationship creation input."""

    rel_type: str = Field(..., min_length=1, description="Relationship type")

    @field_validator("rel_type")
    @classmethod
    def validate_rel_type(cls, v: str) -> str:
        """Validate relationship type name."""
        if not v[0].isalpha() and v[0] != "_":
            raise ValueError(f"Relationship type must start with letter or underscore: {v}")
        if not v.replace("_", "").isalnum():
            raise ValueError(
                f"Relationship type must contain only alphanumeric and underscore: {v}"
            )
        return v

    model_config = {"frozen": True}

validate_rel_type(v) classmethod

Validate relationship type name.

Source code in src/graphforge/api.py
@field_validator("rel_type")
@classmethod
def validate_rel_type(cls, v: str) -> str:
    """Validate relationship type name."""
    if not v[0].isalpha() and v[0] != "_":
        raise ValueError(f"Relationship type must start with letter or underscore: {v}")
    if not v.replace("_", "").isalnum():
        raise ValueError(
            f"Relationship type must contain only alphanumeric and underscore: {v}"
        )
    return v

DatasetNameInput

Bases: BaseModel

Validates dataset name input.

Source code in src/graphforge/api.py
class DatasetNameInput(BaseModel):
    """Validates dataset name input."""

    name: str = Field(..., min_length=1, description="Dataset name")

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Validate dataset name is not just whitespace."""
        if not v.strip():
            raise ValueError("Dataset name cannot be empty or whitespace only")
        return v

    model_config = {"frozen": True}

validate_name(v) classmethod

Validate dataset name is not just whitespace.

Source code in src/graphforge/api.py
@field_validator("name")
@classmethod
def validate_name(cls, v: str) -> str:
    """Validate dataset name is not just whitespace."""
    if not v.strip():
        raise ValueError("Dataset name cannot be empty or whitespace only")
    return v

GraphForge

Main GraphForge interface for graph operations.

GraphForge provides an embedded graph database with openCypher query support.

Examples:

>>> gf = GraphForge()
>>> # Create nodes with Python API
>>> alice = gf.create_node(['Person'], name='Alice', age=30)
>>> bob = gf.create_node(['Person'], name='Bob', age=25)
>>> # Create relationships
>>> knows = gf.create_relationship(alice, bob, 'KNOWS', since=2020)
>>> # Query with openCypher
>>> results = gf.execute("MATCH (p:Person) WHERE p.age > 25 RETURN p.name")
Source code in src/graphforge/api.py
class GraphForge:
    """Main GraphForge interface for graph operations.

    GraphForge provides an embedded graph database with openCypher query support.

    Examples:
        >>> gf = GraphForge()
        >>> # Create nodes with Python API
        >>> alice = gf.create_node(['Person'], name='Alice', age=30)
        >>> bob = gf.create_node(['Person'], name='Bob', age=25)
        >>> # Create relationships
        >>> knows = gf.create_relationship(alice, bob, 'KNOWS', since=2020)
        >>> # Query with openCypher
        >>> results = gf.execute("MATCH (p:Person) WHERE p.age > 25 RETURN p.name")
    """

    def __init__(self, path: str | Path | None = None, enable_optimizer: bool = True):
        """Initialize GraphForge.

        Args:
            path: Optional path to persistent storage (SQLite database file)
                  If None, uses in-memory storage.
                  If provided, loads existing graph or creates new database.
            enable_optimizer: Enable query optimization (default: True).
                  When enabled, applies filter pushdown and predicate reordering
                  for better performance.

        Raises:
            ValueError: If path is empty string or whitespace only

        Examples:
            >>> # In-memory graph (lost on exit)
            >>> gf = GraphForge()

            >>> # Persistent graph (saved to disk)
            >>> gf = GraphForge("my-graph.db")
            >>> # ... create nodes ...
            >>> gf.close()  # Save to disk

            >>> # Later, load the graph
            >>> gf = GraphForge("my-graph.db")  # Graph is still there

            >>> # Disable optimizer for debugging
            >>> gf = GraphForge(enable_optimizer=False)
        """
        # Validate path if provided
        if path is not None:
            if isinstance(path, str) and not path.strip():
                raise ValueError("Path cannot be empty or whitespace only")

        # Initialize storage backend
        self.backend: SQLiteBackend | None
        if path:
            # Use SQLite for persistence
            self.backend = SQLiteBackend(Path(path))
            self.graph = self._load_graph_from_backend()
            # Set next IDs based on existing data
            self._next_node_id = self.backend.get_next_node_id()
            self._next_edge_id = self.backend.get_next_edge_id()
        else:
            # Use in-memory storage
            self.backend = None
            self.graph = Graph()
            self._next_node_id = 1
            self._next_edge_id = 1

        # Track if database has been closed
        self._closed = False

        # Transaction state
        self._in_transaction = False
        self._transaction_snapshot = None

        # Initialize query execution components
        self.parser = CypherParser()
        self.planner = QueryPlanner()
        self.optimizer = QueryOptimizer() if enable_optimizer else None
        self.executor = QueryExecutor(self.graph, graphforge=self, planner=self.planner)

    @classmethod
    def from_dataset(cls, name: str, path: str | Path | None = None) -> "GraphForge":
        """Create a new GraphForge instance and load a dataset into it.

        This is a convenience method that combines instance creation with dataset loading.

        Args:
            name: Dataset name (e.g., "snap-ego-facebook", "neo4j-movie-graph")
            path: Optional path for persistent storage

        Returns:
            GraphForge instance with dataset loaded

        Raises:
            ValueError: If dataset name is empty or whitespace only
            pydantic.ValidationError: If dataset name fails validation

        Examples:
            >>> # Load dataset into in-memory graph
            >>> gf = GraphForge.from_dataset("snap-ego-facebook")
            >>>
            >>> # Load dataset into persistent storage
            >>> gf = GraphForge.from_dataset("neo4j-movie-graph", "movies.db")
        """
        from graphforge.datasets import load_dataset

        # Validate dataset name
        DatasetNameInput(name=name)

        instance = cls(path)
        load_dataset(instance, name)  # nosec B615 - Not Hugging Face, our own dataset loader
        return instance

    def register_function(self, name: str, func: Any) -> None:
        """Register a custom function for testing/extension purposes.

        Args:
            name: Function name (case-insensitive, will be uppercased)
            func: Python callable that takes (args, ctx, executor) and returns CypherValue

        Examples:
            >>> gf = GraphForge()
            >>> def my_func(args, ctx, executor):
            ...     return CypherInt(42)
            >>> gf.register_function("MYFUNC", my_func)
            >>> result = gf.execute("RETURN MYFUNC() AS value")
        """
        self.executor.custom_functions[name.upper()] = func

    def execute(self, query: str) -> list[dict]:
        """Execute an openCypher query.

        Args:
            query: openCypher query string

        Returns:
            List of result rows as dictionaries

        Raises:
            ValueError: If query is empty or whitespace only
            pydantic.ValidationError: If query fails validation

        Examples:
            >>> gf = GraphForge()
            >>> results = gf.execute("MATCH (n) RETURN n LIMIT 10")
        """
        # Validate query input
        QueryInput(query=query)

        # Parse query
        ast = self.parser.parse(query)

        # Check if this is a UNION query
        from graphforge.ast.query import UnionQuery

        if isinstance(ast, UnionQuery):
            # Handle UNION query: plan and optimize each branch separately
            branch_operators = []
            # Update optimizer statistics for cost-based optimization
            if self.optimizer:
                self.optimizer.update_statistics(self.graph.get_statistics())
            for branch_ast in ast.branches:
                branch_ops = self.planner.plan(branch_ast)
                # Optimize each branch independently
                if self.optimizer:
                    branch_ops = self.optimizer.optimize(branch_ops)
                branch_operators.append(branch_ops)

            # Create Union operator
            from graphforge.planner.operators import Union

            union_op = Union(branches=branch_operators, all=ast.all)
            operators = [union_op]
        else:
            # Regular query
            operators = self.planner.plan(ast)

            # Optimize query plan with current graph statistics
            if self.optimizer:
                self.optimizer.update_statistics(self.graph.get_statistics())
                operators = self.optimizer.optimize(operators)

        # Execute
        results = self.executor.execute(operators)

        return results

    def create_node(self, labels: list[str] | None = None, **properties: Any) -> NodeRef:
        """Create a node with labels and properties.

        Automatically assigns a unique node ID and converts Python values
        to CypherValue types.

        Args:
            labels: List of label strings (e.g., ['Person', 'Employee'])
            **properties: Property key-value pairs as Python types.
                Values are converted to CypherValue types:
                - int → CypherInt
                - float → CypherFloat
                - str → CypherString
                - bool → CypherBool
                - None → CypherNull
                - dict with {x, y} or {latitude, longitude} → CypherPoint
                - dict (other) → CypherMap
                - list → CypherList
                - date → CypherDate
                - datetime → CypherDateTime
                - time → CypherTime
                - timedelta → CypherDuration

        Returns:
            NodeRef for the created node

        Raises:
            ValueError: If labels are invalid (empty, don't start with letter, etc.)
            pydantic.ValidationError: If labels fail validation
            TypeError: If property values are unsupported types

        Examples:
            >>> gf = GraphForge()
            >>> alice = gf.create_node(['Person'], name='Alice', age=30)
            >>> bob = gf.create_node(['Person', 'Employee'], name='Bob', salary=50000)
            >>> # Create node with spatial property (Cartesian coordinates)
            >>> office = gf.create_node(['Place'], name='Office', location={"x": 1.0, "y": 2.0})
            >>> # Create node with geographic coordinates
            >>> sf = gf.create_node(
            ...     ['City'], name='SF', location={"latitude": 37.7749, "longitude": -122.4194}
            ... )
            >>> # Query the created nodes
            >>> results = gf.execute("MATCH (p:Person) RETURN p.name")
        """
        # Validate labels
        NodeInput(labels=labels or [])

        # Convert properties to CypherValues
        cypher_properties = {key: self._to_cypher_value(value) for key, value in properties.items()}

        # Create node with auto-generated ID
        node = NodeRef(
            id=self._next_node_id,
            labels=frozenset(labels or []),
            properties=cypher_properties,
        )

        # Add to graph
        self.graph.add_node(node)

        # Increment ID for next node
        self._next_node_id += 1

        return node

    def create_relationship(
        self, src: NodeRef, dst: NodeRef, rel_type: str, **properties: Any
    ) -> EdgeRef:
        """Create a relationship between two nodes.

        Automatically assigns a unique edge ID and converts Python values
        to CypherValue types.

        Args:
            src: Source node (NodeRef)
            dst: Destination node (NodeRef)
            rel_type: Relationship type (e.g., 'KNOWS', 'WORKS_AT')
            **properties: Property key-value pairs as Python types.
                Values are converted to CypherValue types (same as create_node).
                Supports Point coordinates: {"x": 1.0, "y": 2.0} or
                {"latitude": 37.7, "longitude": -122.4}

        Returns:
            EdgeRef for the created relationship

        Raises:
            ValueError: If rel_type is invalid (empty, doesn't start with letter/underscore, etc.)
            TypeError: If src or dst are not NodeRef instances
            pydantic.ValidationError: If rel_type fails validation

        Examples:
            >>> gf = GraphForge()
            >>> alice = gf.create_node(['Person'], name='Alice')
            >>> bob = gf.create_node(['Person'], name='Bob')
            >>> knows = gf.create_relationship(alice, bob, 'KNOWS', since=2020)
            >>> # Relationship with spatial property
            >>> travels = gf.create_relationship(
            ...     alice, bob, 'TRAVELS_TO', distance_from={"x": 0.0, "y": 0.0}
            ... )
            >>> # Query relationships
            >>> results = gf.execute("MATCH (a)-[r:KNOWS]->(b) RETURN a.name, b.name")
        """
        # Validate inputs
        if not isinstance(src, NodeRef):
            raise TypeError(f"src must be a NodeRef, got {type(src).__name__}")
        if not isinstance(dst, NodeRef):
            raise TypeError(f"dst must be a NodeRef, got {type(dst).__name__}")

        RelationshipInput(rel_type=rel_type)

        # Convert properties to CypherValues
        cypher_properties = {key: self._to_cypher_value(value) for key, value in properties.items()}

        # Create edge with auto-generated ID
        edge = EdgeRef(
            id=self._next_edge_id,
            type=rel_type,
            src=src,
            dst=dst,
            properties=cypher_properties,
        )

        # Add to graph
        self.graph.add_edge(edge)

        # Increment ID for next edge
        self._next_edge_id += 1

        return edge

    def _to_cypher_value(self, value):
        """Convert Python value to CypherValue type.

        Args:
            value: Python value (str, int, float, bool, None, list, dict,
                   date, datetime, time, timedelta)

        Returns:
            Corresponding CypherValue instance

        Raises:
            TypeError: If value type is not supported
        """
        # Handle None
        if value is None:
            return CypherNull()

        # Handle bool (must check before int since bool is subclass of int)
        if isinstance(value, bool):
            return CypherBool(value)

        # Handle int
        if isinstance(value, int):
            return CypherInt(value)

        # Handle float
        if isinstance(value, float):
            return CypherFloat(value)

        # Handle str
        if isinstance(value, str):
            return CypherString(value)

        # Handle temporal types (check datetime before date since datetime is subclass of date)
        if isinstance(value, datetime.datetime):
            return CypherDateTime(value)
        if isinstance(value, datetime.date):
            return CypherDate(value)
        if isinstance(value, datetime.time):
            return CypherTime(value)
        if isinstance(value, datetime.timedelta):
            return CypherDuration(value)

        # Handle isodate.Duration (used for ISO 8601 durations with years/months)
        try:
            import isodate  # type: ignore[import-untyped]

            if isinstance(value, isodate.Duration):
                return CypherDuration(value)
        except ImportError:
            pass

        # Handle list (recursively convert elements)
        if isinstance(value, list):
            return CypherList([self._to_cypher_value(item) for item in value])

        # Handle dict - check for Point coordinates before CypherMap
        if isinstance(value, dict):
            keys = set(value.keys())

            # Detect Cartesian coordinates: {x, y} or {x, y, z}, optionally with crs
            # Valid: {"x", "y"}, {"x", "y", "crs"}, {"x", "y", "z"}, {"x", "y", "z", "crs"}
            cartesian_2d = {"x", "y"}
            cartesian_2d_crs = {"x", "y", "crs"}
            cartesian_3d = {"x", "y", "z"}
            cartesian_3d_crs = {"x", "y", "z", "crs"}

            if keys in (cartesian_2d, cartesian_2d_crs, cartesian_3d, cartesian_3d_crs):
                try:
                    return CypherPoint(value)
                except (ValueError, TypeError):
                    # Invalid coordinates (out of range or non-numeric), fall through to CypherMap
                    pass

            # Detect Geographic coordinates: {latitude, longitude}, optionally with crs
            # Valid: {"latitude", "longitude"}, {"latitude", "longitude", "crs"}
            geographic = {"latitude", "longitude"}
            geographic_crs = {"latitude", "longitude", "crs"}

            if keys in (geographic, geographic_crs):
                try:
                    return CypherPoint(value)
                except (ValueError, TypeError):
                    # Invalid coordinates (out of range or non-numeric), fall through to CypherMap
                    pass

            # Default to CypherMap (recursively convert values)
            return CypherMap({key: self._to_cypher_value(val) for key, val in value.items()})

        # Unsupported type
        raise TypeError(
            f"Unsupported property value type: {type(value).__name__}. "
            "Supported types: str, int, float, bool, None, list, dict, "
            "date, datetime, time, timedelta"
        )

    def begin(self):
        """Begin an explicit transaction.

        Starts a new transaction by taking a snapshot of the current graph state.
        Changes made after begin() can be committed or rolled back.

        Raises:
            RuntimeError: If already in a transaction

        Examples:
            >>> gf = GraphForge("my-graph.db")
            >>> gf.begin()
            >>> alice = gf.create_node(['Person'], name='Alice')
            >>> gf.commit()  # Changes are saved

            >>> gf.begin()
            >>> bob = gf.create_node(['Person'], name='Bob')
            >>> gf.rollback()  # Bob is removed
        """
        if self._in_transaction:
            raise RuntimeError("Already in a transaction. Commit or rollback first.")

        # Take snapshot of current state
        self._transaction_snapshot = self.graph.snapshot()  # type: ignore[assignment]
        self._in_transaction = True

    def commit(self):
        """Commit the current transaction.

        Saves all changes made since begin() to the database (if using persistence).
        Clears the transaction snapshot.

        Raises:
            RuntimeError: If not in a transaction

        Examples:
            >>> gf = GraphForge("my-graph.db")
            >>> gf.begin()
            >>> gf.create_node(['Person'], name='Alice')
            >>> gf.commit()  # Changes are now permanent
        """
        if not self._in_transaction:
            raise RuntimeError("Not in a transaction. Call begin() first.")

        # Save to backend if persistence is enabled
        if self.backend:
            self._save_graph_to_backend()

        # Clear transaction state
        self._in_transaction = False
        self._transaction_snapshot = None

    def rollback(self):
        """Roll back the current transaction.

        Reverts all changes made since begin() by restoring the snapshot.
        Works for both in-memory and persistent graphs.

        Raises:
            RuntimeError: If not in a transaction

        Examples:
            >>> gf = GraphForge("my-graph.db")
            >>> gf.begin()
            >>> gf.create_node(['Person'], name='Alice')
            >>> results = gf.execute("MATCH (p:Person) RETURN count(*)")
            >>> # count is 1
            >>> gf.rollback()  # Alice is gone
            >>> results = gf.execute("MATCH (p:Person) RETURN count(*)")
            >>> # count is 0
        """
        if not self._in_transaction:
            raise RuntimeError("Not in a transaction. Call begin() first.")

        # Restore graph from snapshot
        self.graph.restore(self._transaction_snapshot)  # type: ignore[arg-type]

        # Rollback SQLite transaction if using persistence
        if self.backend:
            self.backend.rollback()

        # Clear transaction state
        self._in_transaction = False
        self._transaction_snapshot = None

    def close(self):
        """Save graph and close database.

        If using SQLite backend, saves all nodes and edges to disk and
        commits the transaction. Safe to call multiple times.

        If in an active transaction, the transaction is committed before closing.

        Examples:
            >>> gf = GraphForge("my-graph.db")
            >>> # ... create nodes and edges ...
            >>> gf.close()  # Save to disk
        """
        if self.backend and not self._closed:
            # Auto-commit any pending transaction
            if self._in_transaction:
                self.commit()
            else:
                # Save changes if not in explicit transaction
                self._save_graph_to_backend()

            self.backend.close()
            self._closed = True

    def clear(self) -> None:
        """Clear all graph data, resetting to an empty state.

        Resets the graph, internal ID counters, and transaction state without
        recreating the parser, planner, optimizer, or executor. This allows
        reusing a GraphForge instance for a new workload with zero parsing
        overhead.

        Raises:
            RuntimeError: If the instance has been closed

        Examples:
            >>> gf = GraphForge()
            >>> gf.execute("CREATE (:Person {name: 'Alice'})")
            >>> results = gf.execute("MATCH (n) RETURN count(n) AS c")
            >>> results[0]['c'].value
            1
            >>> gf.clear()
            >>> results = gf.execute("MATCH (n) RETURN count(n) AS c")
            >>> results[0]['c'].value
            0
        """
        if self._closed:
            raise RuntimeError("GraphForge instance has been closed")

        if self.backend is not None:
            if self._in_transaction:
                self.backend.rollback()
            raise RuntimeError(
                "Cannot clear a GraphForge instance with persistent storage. "
                "Use in-memory instances only (GraphForge() without path)."
            )

        # Reset graph data
        self.graph.clear()

        # Reset ID counters
        self._next_node_id = 1
        self._next_edge_id = 1

        # Reset transaction state
        self._in_transaction = False
        self._transaction_snapshot = None

        # Clear any custom functions registered on the executor
        self.executor.custom_functions.clear()

    def clone(self) -> "GraphForge":
        """Create a deep copy of this GraphForge instance.

        Creates a new GraphForge instance with a deep copy of graph state
        (nodes, edges, properties, indexes, ID counters) and fresh
        CypherParser, QueryPlanner, QueryOptimizer, and QueryExecutor
        instances.  Only the compiled Lark grammar is shared, via the
        module-level ``@lru_cache`` on ``_get_lark_parser``.

        Returns:
            GraphForge: A new instance with copied graph state

        Raises:
            RuntimeError: If the instance has been closed or uses persistent storage

        Examples:
            >>> gf = GraphForge()
            >>> gf.execute("CREATE (:Person {name: 'Alice'})")
            >>> clone = gf.clone()
            >>> clone.execute("CREATE (:Person {name: 'Bob'})")
            >>> # Original has 1 node, clone has 2 nodes
            >>> gf.execute("MATCH (n) RETURN count(n) AS c")[0]['c'].value
            1
            >>> clone.execute("MATCH (n) RETURN count(n) AS c")[0]['c'].value
            2
        """
        if self._closed:
            raise RuntimeError("Cannot clone a closed GraphForge instance")

        if self.backend is not None:
            raise RuntimeError(
                "Cannot clone GraphForge instances with persistent storage. "
                "Use in-memory instances only (GraphForge() without path)."
            )

        # Create new instance with same configuration
        cloned = GraphForge(
            enable_optimizer=self.optimizer is not None,
        )

        # Manually copy graph state (deepcopy doesn't work well with defaultdicts)
        cloned.graph._nodes = copy.deepcopy(self.graph._nodes)
        cloned.graph._edges = copy.deepcopy(self.graph._edges)

        # Copy adjacency lists
        cloned.graph._outgoing = defaultdict(list)
        for node_id, edges in self.graph._outgoing.items():
            cloned.graph._outgoing[node_id] = copy.deepcopy(edges)

        cloned.graph._incoming = defaultdict(list)
        for node_id, edges in self.graph._incoming.items():
            cloned.graph._incoming[node_id] = copy.deepcopy(edges)

        # Copy indexes
        cloned.graph._label_index = defaultdict(set)
        for label, node_ids in self.graph._label_index.items():
            cloned.graph._label_index[label] = copy.copy(node_ids)

        cloned.graph._type_index = defaultdict(set)
        for edge_type, edge_ids in self.graph._type_index.items():
            cloned.graph._type_index[edge_type] = copy.copy(edge_ids)

        # Copy statistics
        cloned.graph._statistics = copy.deepcopy(self.graph._statistics)

        # Copy ID counters
        cloned._next_node_id = self._next_node_id
        cloned._next_edge_id = self._next_edge_id

        # Copy transaction state (should be False/None in typical usage)
        cloned._in_transaction = self._in_transaction
        cloned._transaction_snapshot = (
            copy.deepcopy(self._transaction_snapshot) if self._transaction_snapshot else None
        )

        # Note: Custom functions are intentionally NOT copied. Each clone gets
        # its own executor instance; custom functions must be re-registered on
        # the clone if needed.

        return cloned

    def _load_graph_from_backend(self) -> Graph:
        """Load graph from SQLite backend.

        Returns:
            Graph instance populated with nodes and edges from database
        """
        assert self.backend is not None
        graph = Graph()

        # Load all nodes
        nodes = self.backend.load_all_nodes()
        node_map = {}  # Map node_id to NodeRef

        for node in nodes:
            graph.add_node(node)
            node_map[node.id] = node

        # Load all edges (returns dict of edge data)
        edges_data = self.backend.load_all_edges()

        # Reconstruct EdgeRef instances with actual NodeRef objects
        for edge_id, (edge_type, src_id, dst_id, properties) in edges_data.items():
            src_node = node_map[src_id]
            dst_node = node_map[dst_id]

            edge = EdgeRef(
                id=edge_id,
                type=edge_type,
                src=src_node,
                dst=dst_node,
                properties=properties,
            )

            graph.add_edge(edge)

        # Load statistics
        loaded_stats = self.backend.load_statistics()
        if loaded_stats is not None:
            graph._statistics = loaded_stats

        return graph

    def _save_graph_to_backend(self):
        """Save graph to SQLite backend."""
        assert self.backend is not None
        # Save all nodes
        for node in self.graph.get_all_nodes():
            self.backend.save_node(node)

        # Save all edges
        for edge in self.graph.get_all_edges():
            self.backend.save_edge(edge)

        # Save statistics
        stats = self.graph.get_statistics()
        self.backend.save_statistics(stats)

        # Commit transaction
        self.backend.commit()

__init__(path=None, enable_optimizer=True)

Initialize GraphForge.

Parameters:

path (str | Path | None, default: None): Optional path to persistent storage (SQLite database file). If None, uses in-memory storage. If provided, loads existing graph or creates new database.

enable_optimizer (bool, default: True): Enable query optimization. When enabled, applies filter pushdown and predicate reordering for better performance.

Raises:

ValueError: If path is empty string or whitespace only.

Examples:

>>> # In-memory graph (lost on exit)
>>> gf = GraphForge()
>>> # Persistent graph (saved to disk)
>>> gf = GraphForge("my-graph.db")
>>> # ... create nodes ...
>>> gf.close()  # Save to disk
>>> # Later, load the graph
>>> gf = GraphForge("my-graph.db")  # Graph is still there
>>> # Disable optimizer for debugging
>>> gf = GraphForge(enable_optimizer=False)
Source code in src/graphforge/api.py
def __init__(self, path: str | Path | None = None, enable_optimizer: bool = True):
    """Initialize GraphForge.

    Args:
        path: Optional path to persistent storage (SQLite database file)
              If None, uses in-memory storage.
              If provided, loads existing graph or creates new database.
        enable_optimizer: Enable query optimization (default: True).
              When enabled, applies filter pushdown and predicate reordering
              for better performance.

    Raises:
        ValueError: If path is empty string or whitespace only

    Examples:
        >>> # In-memory graph (lost on exit)
        >>> gf = GraphForge()

        >>> # Persistent graph (saved to disk)
        >>> gf = GraphForge("my-graph.db")
        >>> # ... create nodes ...
        >>> gf.close()  # Save to disk

        >>> # Later, load the graph
        >>> gf = GraphForge("my-graph.db")  # Graph is still there

        >>> # Disable optimizer for debugging
        >>> gf = GraphForge(enable_optimizer=False)
    """
    # Validate path if provided
    if path is not None:
        if isinstance(path, str) and not path.strip():
            raise ValueError("Path cannot be empty or whitespace only")

    # Initialize storage backend
    self.backend: SQLiteBackend | None
    if path:
        # Use SQLite for persistence
        self.backend = SQLiteBackend(Path(path))
        self.graph = self._load_graph_from_backend()
        # Set next IDs based on existing data
        self._next_node_id = self.backend.get_next_node_id()
        self._next_edge_id = self.backend.get_next_edge_id()
    else:
        # Use in-memory storage
        self.backend = None
        self.graph = Graph()
        self._next_node_id = 1
        self._next_edge_id = 1

    # Track if database has been closed
    self._closed = False

    # Transaction state
    self._in_transaction = False
    self._transaction_snapshot = None

    # Initialize query execution components
    self.parser = CypherParser()
    self.planner = QueryPlanner()
    self.optimizer = QueryOptimizer() if enable_optimizer else None
    self.executor = QueryExecutor(self.graph, graphforge=self, planner=self.planner)

from_dataset(name, path=None) classmethod

Create a new GraphForge instance and load a dataset into it.

This is a convenience method that combines instance creation with dataset loading.

Parameters:

name (str, required): Dataset name (e.g., "snap-ego-facebook", "neo4j-movie-graph").

path (str | Path | None, default: None): Optional path for persistent storage.

Returns:

GraphForge: GraphForge instance with dataset loaded.

Raises:

ValueError: If dataset name is empty or whitespace only.

pydantic.ValidationError: If dataset name fails validation.

Examples:

>>> # Load dataset into in-memory graph
>>> gf = GraphForge.from_dataset("snap-ego-facebook")
>>>
>>> # Load dataset into persistent storage
>>> gf = GraphForge.from_dataset("neo4j-movie-graph", "movies.db")
Source code in src/graphforge/api.py
@classmethod
def from_dataset(cls, name: str, path: str | Path | None = None) -> "GraphForge":
    """Create a new GraphForge instance and load a dataset into it.

    This is a convenience method that combines instance creation with dataset loading.

    Args:
        name: Dataset name (e.g., "snap-ego-facebook", "neo4j-movie-graph")
        path: Optional path for persistent storage

    Returns:
        GraphForge instance with dataset loaded

    Raises:
        ValueError: If dataset name is empty or whitespace only
        pydantic.ValidationError: If dataset name fails validation

    Examples:
        >>> # Load dataset into in-memory graph
        >>> gf = GraphForge.from_dataset("snap-ego-facebook")
        >>>
        >>> # Load dataset into persistent storage
        >>> gf = GraphForge.from_dataset("neo4j-movie-graph", "movies.db")
    """
    from graphforge.datasets import load_dataset

    # Validate dataset name
    DatasetNameInput(name=name)

    instance = cls(path)
    load_dataset(instance, name)  # nosec B615 - Not Hugging Face, our own dataset loader
    return instance

register_function(name, func)

Register a custom function for testing/extension purposes.

Parameters:

name (str, required): Function name (case-insensitive, will be uppercased).

func (Any, required): Python callable that takes (args, ctx, executor) and returns CypherValue.

Examples:

>>> gf = GraphForge()
>>> def my_func(args, ctx, executor):
...     return CypherInt(42)
>>> gf.register_function("MYFUNC", my_func)
>>> result = gf.execute("RETURN MYFUNC() AS value")
Source code in src/graphforge/api.py
def register_function(self, name: str, func: Any) -> None:
    """Register a custom function for testing/extension purposes.

    Args:
        name: Function name (case-insensitive, will be uppercased)
        func: Python callable that takes (args, ctx, executor) and returns CypherValue

    Examples:
        >>> gf = GraphForge()
        >>> def my_func(args, ctx, executor):
        ...     return CypherInt(42)
        >>> gf.register_function("MYFUNC", my_func)
        >>> result = gf.execute("RETURN MYFUNC() AS value")
    """
    self.executor.custom_functions[name.upper()] = func
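
Building on the signature documented above, the sketch below registers a custom function that uses its arguments. It assumes the arguments arrive as CypherValue instances whose Python value is exposed via a .value attribute (as the clear() example's results[0]['c'].value suggests) and that CypherInt is importable from the package; both are assumptions rather than guaranteed API, and the import path for CypherInt is not shown in these docs.

from graphforge.api import GraphForge
from graphforge import CypherInt  # assumed export; exact import path not shown in these docs


def double(args, ctx, executor):
    # args are assumed to be CypherValue instances; .value unwraps the Python int.
    return CypherInt(args[0].value * 2)


gf = GraphForge()
gf.register_function("DOUBLE", double)
print(gf.execute("RETURN DOUBLE(21) AS answer"))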

execute(query)

Execute an openCypher query.

Parameters:

query (str, required): openCypher query string.

Returns:

list[dict]: List of result rows as dictionaries.

Raises:

ValueError: If query is empty or whitespace only.

pydantic.ValidationError: If query fails validation.

Examples:

>>> gf = GraphForge()
>>> results = gf.execute("MATCH (n) RETURN n LIMIT 10")
Source code in src/graphforge/api.py
def execute(self, query: str) -> list[dict]:
    """Execute an openCypher query.

    Args:
        query: openCypher query string

    Returns:
        List of result rows as dictionaries

    Raises:
        ValueError: If query is empty or whitespace only
        pydantic.ValidationError: If query fails validation

    Examples:
        >>> gf = GraphForge()
        >>> results = gf.execute("MATCH (n) RETURN n LIMIT 10")
    """
    # Validate query input
    QueryInput(query=query)

    # Parse query
    ast = self.parser.parse(query)

    # Check if this is a UNION query
    from graphforge.ast.query import UnionQuery

    if isinstance(ast, UnionQuery):
        # Handle UNION query: plan and optimize each branch separately
        branch_operators = []
        # Update optimizer statistics for cost-based optimization
        if self.optimizer:
            self.optimizer.update_statistics(self.graph.get_statistics())
        for branch_ast in ast.branches:
            branch_ops = self.planner.plan(branch_ast)
            # Optimize each branch independently
            if self.optimizer:
                branch_ops = self.optimizer.optimize(branch_ops)
            branch_operators.append(branch_ops)

        # Create Union operator
        from graphforge.planner.operators import Union

        union_op = Union(branches=branch_operators, all=ast.all)
        operators = [union_op]
    else:
        # Regular query
        operators = self.planner.plan(ast)

        # Optimize query plan with current graph statistics
        if self.optimizer:
            self.optimizer.update_statistics(self.graph.get_statistics())
            operators = self.optimizer.optimize(operators)

    # Execute
    results = self.executor.execute(operators)

    return results
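
Because execute() plans and optimizes each UNION branch separately (see the UnionQuery handling above) and then combines them with the Union operator, a UNION query runs end to end like any other. The sketch below uses illustrative data and column names; per standard openCypher semantics, UNION without ALL also de-duplicates rows.

from graphforge.api import GraphForge

gf = GraphForge()
gf.create_node(['Person'], name='Alice')
gf.create_node(['City'], name='Berlin')

# Each branch is planned and optimized independently, then merged by Union.
rows = gf.execute(
    "MATCH (p:Person) RETURN p.name AS name "
    "UNION "
    "MATCH (c:City) RETURN c.name AS name"
)
print(rows)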

create_node(labels=None, **properties)

Create a node with labels and properties.

Automatically assigns a unique node ID and converts Python values to CypherValue types.

Parameters:

labels (list[str] | None, default: None): List of label strings (e.g., ['Person', 'Employee']).

**properties (Any, default: {}): Property key-value pairs as Python types. Values are converted to CypherValue types:

- int → CypherInt
- float → CypherFloat
- str → CypherString
- bool → CypherBool
- None → CypherNull
- dict with {x, y} or {latitude, longitude} → CypherPoint
- dict (other) → CypherMap
- list → CypherList
- date → CypherDate
- datetime → CypherDateTime
- time → CypherTime
- timedelta → CypherDuration

Returns:

NodeRef: NodeRef for the created node.

Raises:

ValueError: If labels are invalid (empty, don't start with letter, etc.).

pydantic.ValidationError: If labels fail validation.

TypeError: If property values are unsupported types.

Examples:

>>> gf = GraphForge()
>>> alice = gf.create_node(['Person'], name='Alice', age=30)
>>> bob = gf.create_node(['Person', 'Employee'], name='Bob', salary=50000)
>>> # Create node with spatial property (Cartesian coordinates)
>>> office = gf.create_node(['Place'], name='Office', location={"x": 1.0, "y": 2.0})
>>> # Create node with geographic coordinates
>>> sf = gf.create_node(
...     ['City'], name='SF', location={"latitude": 37.7749, "longitude": -122.4194}
... )
>>> # Query the created nodes
>>> results = gf.execute("MATCH (p:Person) RETURN p.name")
Source code in src/graphforge/api.py
def create_node(self, labels: list[str] | None = None, **properties: Any) -> NodeRef:
    """Create a node with labels and properties.

    Automatically assigns a unique node ID and converts Python values
    to CypherValue types.

    Args:
        labels: List of label strings (e.g., ['Person', 'Employee'])
        **properties: Property key-value pairs as Python types.
            Values are converted to CypherValue types:
            - int → CypherInt
            - float → CypherFloat
            - str → CypherString
            - bool → CypherBool
            - None → CypherNull
            - dict with {x, y} or {latitude, longitude} → CypherPoint
            - dict (other) → CypherMap
            - list → CypherList
            - date → CypherDate
            - datetime → CypherDateTime
            - time → CypherTime
            - timedelta → CypherDuration

    Returns:
        NodeRef for the created node

    Raises:
        ValueError: If labels are invalid (empty, don't start with letter, etc.)
        pydantic.ValidationError: If labels fail validation
        TypeError: If property values are unsupported types

    Examples:
        >>> gf = GraphForge()
        >>> alice = gf.create_node(['Person'], name='Alice', age=30)
        >>> bob = gf.create_node(['Person', 'Employee'], name='Bob', salary=50000)
        >>> # Create node with spatial property (Cartesian coordinates)
        >>> office = gf.create_node(['Place'], name='Office', location={"x": 1.0, "y": 2.0})
        >>> # Create node with geographic coordinates
        >>> sf = gf.create_node(
        ...     ['City'], name='SF', location={"latitude": 37.7749, "longitude": -122.4194}
        ... )
        >>> # Query the created nodes
        >>> results = gf.execute("MATCH (p:Person) RETURN p.name")
    """
    # Validate labels
    NodeInput(labels=labels or [])

    # Convert properties to CypherValues
    cypher_properties = {key: self._to_cypher_value(value) for key, value in properties.items()}

    # Create node with auto-generated ID
    node = NodeRef(
        id=self._next_node_id,
        labels=frozenset(labels or []),
        properties=cypher_properties,
    )

    # Add to graph
    self.graph.add_node(node)

    # Increment ID for next node
    self._next_node_id += 1

    return node
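
To make the dict conversion rules concrete, the sketch below relies only on the documented behavior: a dict with exactly {x, y} (optionally z and crs) or {latitude, longitude} becomes a CypherPoint, any other dict becomes a CypherMap with recursively converted values, and unsupported Python types raise TypeError.

from graphforge.api import GraphForge

gf = GraphForge()

# {"x", "y"} keys -> stored as a CypherPoint (Cartesian coordinates).
office = gf.create_node(['Place'], location={"x": 1.0, "y": 2.0})

# Any other dict shape -> stored as a CypherMap.
profile = gf.create_node(['Person'], settings={"theme": "dark", "retries": 3})

# Unsupported property types (e.g., a set) raise TypeError.
try:
    gf.create_node(['Person'], tags={"a", "b"})
except TypeError as exc:
    print(exc)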

create_relationship(src, dst, rel_type, **properties)

Create a relationship between two nodes.

Automatically assigns a unique edge ID and converts Python values to CypherValue types.

Parameters:

src (NodeRef, required): Source node (NodeRef).

dst (NodeRef, required): Destination node (NodeRef).

rel_type (str, required): Relationship type (e.g., 'KNOWS', 'WORKS_AT').

**properties (Any, default: {}): Property key-value pairs as Python types. Values are converted to CypherValue types (same as create_node). Supports Point coordinates: {"x": 1.0, "y": 2.0} or {"latitude": 37.7, "longitude": -122.4}.

Returns:

EdgeRef: EdgeRef for the created relationship.

Raises:

ValueError: If rel_type is invalid (empty, doesn't start with letter/underscore, etc.).

TypeError: If src or dst are not NodeRef instances.

pydantic.ValidationError: If rel_type fails validation.

Examples:

>>> gf = GraphForge()
>>> alice = gf.create_node(['Person'], name='Alice')
>>> bob = gf.create_node(['Person'], name='Bob')
>>> knows = gf.create_relationship(alice, bob, 'KNOWS', since=2020)
>>> # Relationship with spatial property
>>> travels = gf.create_relationship(
...     alice, bob, 'TRAVELS_TO', distance_from={"x": 0.0, "y": 0.0}
... )
>>> # Query relationships
>>> results = gf.execute("MATCH (a)-[r:KNOWS]->(b) RETURN a.name, b.name")
Source code in src/graphforge/api.py
def create_relationship(
    self, src: NodeRef, dst: NodeRef, rel_type: str, **properties: Any
) -> EdgeRef:
    """Create a relationship between two nodes.

    Automatically assigns a unique edge ID and converts Python values
    to CypherValue types.

    Args:
        src: Source node (NodeRef)
        dst: Destination node (NodeRef)
        rel_type: Relationship type (e.g., 'KNOWS', 'WORKS_AT')
        **properties: Property key-value pairs as Python types.
            Values are converted to CypherValue types (same as create_node).
            Supports Point coordinates: {"x": 1.0, "y": 2.0} or
            {"latitude": 37.7, "longitude": -122.4}

    Returns:
        EdgeRef for the created relationship

    Raises:
        ValueError: If rel_type is invalid (empty, doesn't start with letter/underscore, etc.)
        TypeError: If src or dst are not NodeRef instances
        pydantic.ValidationError: If rel_type fails validation

    Examples:
        >>> gf = GraphForge()
        >>> alice = gf.create_node(['Person'], name='Alice')
        >>> bob = gf.create_node(['Person'], name='Bob')
        >>> knows = gf.create_relationship(alice, bob, 'KNOWS', since=2020)
        >>> # Relationship with spatial property
        >>> travels = gf.create_relationship(
        ...     alice, bob, 'TRAVELS_TO', distance_from={"x": 0.0, "y": 0.0}
        ... )
        >>> # Query relationships
        >>> results = gf.execute("MATCH (a)-[r:KNOWS]->(b) RETURN a.name, b.name")
    """
    # Validate inputs
    if not isinstance(src, NodeRef):
        raise TypeError(f"src must be a NodeRef, got {type(src).__name__}")
    if not isinstance(dst, NodeRef):
        raise TypeError(f"dst must be a NodeRef, got {type(dst).__name__}")

    RelationshipInput(rel_type=rel_type)

    # Convert properties to CypherValues
    cypher_properties = {key: self._to_cypher_value(value) for key, value in properties.items()}

    # Create edge with auto-generated ID
    edge = EdgeRef(
        id=self._next_edge_id,
        type=rel_type,
        src=src,
        dst=dst,
        properties=cypher_properties,
    )

    # Add to graph
    self.graph.add_edge(edge)

    # Increment ID for next edge
    self._next_edge_id += 1

    return edge

begin()

Begin an explicit transaction.

Starts a new transaction by taking a snapshot of the current graph state. Changes made after begin() can be committed or rolled back.

Raises:

RuntimeError: If already in a transaction.

Examples:

>>> gf = GraphForge("my-graph.db")
>>> gf.begin()
>>> alice = gf.create_node(['Person'], name='Alice')
>>> gf.commit()  # Changes are saved
>>> gf.begin()
>>> bob = gf.create_node(['Person'], name='Bob')
>>> gf.rollback()  # Bob is removed
Source code in src/graphforge/api.py
def begin(self):
    """Begin an explicit transaction.

    Starts a new transaction by taking a snapshot of the current graph state.
    Changes made after begin() can be committed or rolled back.

    Raises:
        RuntimeError: If already in a transaction

    Examples:
        >>> gf = GraphForge("my-graph.db")
        >>> gf.begin()
        >>> alice = gf.create_node(['Person'], name='Alice')
        >>> gf.commit()  # Changes are saved

        >>> gf.begin()
        >>> bob = gf.create_node(['Person'], name='Bob')
        >>> gf.rollback()  # Bob is removed
    """
    if self._in_transaction:
        raise RuntimeError("Already in a transaction. Commit or rollback first.")

    # Take snapshot of current state
    self._transaction_snapshot = self.graph.snapshot()  # type: ignore[assignment]
    self._in_transaction = True

commit()

Commit the current transaction.

Saves all changes made since begin() to the database (if using persistence). Clears the transaction snapshot.

Raises:

RuntimeError: If not in a transaction.

Examples:

>>> gf = GraphForge("my-graph.db")
>>> gf.begin()
>>> gf.create_node(['Person'], name='Alice')
>>> gf.commit()  # Changes are now permanent
Source code in src/graphforge/api.py
def commit(self):
    """Commit the current transaction.

    Saves all changes made since begin() to the database (if using persistence).
    Clears the transaction snapshot.

    Raises:
        RuntimeError: If not in a transaction

    Examples:
        >>> gf = GraphForge("my-graph.db")
        >>> gf.begin()
        >>> gf.create_node(['Person'], name='Alice')
        >>> gf.commit()  # Changes are now permanent
    """
    if not self._in_transaction:
        raise RuntimeError("Not in a transaction. Call begin() first.")

    # Save to backend if persistence is enabled
    if self.backend:
        self._save_graph_to_backend()

    # Clear transaction state
    self._in_transaction = False
    self._transaction_snapshot = None

rollback()

Roll back the current transaction.

Reverts all changes made since begin() by restoring the snapshot. Works for both in-memory and persistent graphs.

Raises:

RuntimeError: If not in a transaction.

Examples:

>>> gf = GraphForge("my-graph.db")
>>> gf.begin()
>>> gf.create_node(['Person'], name='Alice')
>>> results = gf.execute("MATCH (p:Person) RETURN count(*)")
>>> # count is 1
>>> gf.rollback()  # Alice is gone
>>> results = gf.execute("MATCH (p:Person) RETURN count(*)")
>>> # count is 0
Source code in src/graphforge/api.py
def rollback(self):
    """Roll back the current transaction.

    Reverts all changes made since begin() by restoring the snapshot.
    Works for both in-memory and persistent graphs.

    Raises:
        RuntimeError: If not in a transaction

    Examples:
        >>> gf = GraphForge("my-graph.db")
        >>> gf.begin()
        >>> gf.create_node(['Person'], name='Alice')
        >>> results = gf.execute("MATCH (p:Person) RETURN count(*)")
        >>> # count is 1
        >>> gf.rollback()  # Alice is gone
        >>> results = gf.execute("MATCH (p:Person) RETURN count(*)")
        >>> # count is 0
    """
    if not self._in_transaction:
        raise RuntimeError("Not in a transaction. Call begin() first.")

    # Restore graph from snapshot
    self.graph.restore(self._transaction_snapshot)  # type: ignore[arg-type]

    # Rollback SQLite transaction if using persistence
    if self.backend:
        self.backend.rollback()

    # Clear transaction state
    self._in_transaction = False
    self._transaction_snapshot = None
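
A common pattern with the explicit transaction API documented above is to commit on success and roll back on failure. This is a minimal sketch using only the documented begin(), commit(), rollback(), and close() calls; the file name is illustrative.

from graphforge.api import GraphForge

gf = GraphForge("my-graph.db")
gf.begin()
try:
    alice = gf.create_node(['Person'], name='Alice')
    bob = gf.create_node(['Person'], name='Bob')
    gf.create_relationship(alice, bob, 'KNOWS', since=2020)
    gf.commit()    # persist all three writes together
except Exception:
    gf.rollback()  # restore the pre-begin() snapshot
    raise
finally:
    gf.close()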

close()

Save graph and close database.

If using SQLite backend, saves all nodes and edges to disk and commits the transaction. Safe to call multiple times.

If in an active transaction, the transaction is committed before closing.

Examples:

>>> gf = GraphForge("my-graph.db")
>>> # ... create nodes and edges ...
>>> gf.close()  # Save to disk
Source code in src/graphforge/api.py
def close(self):
    """Save graph and close database.

    If using SQLite backend, saves all nodes and edges to disk and
    commits the transaction. Safe to call multiple times.

    If in an active transaction, the transaction is committed before closing.

    Examples:
        >>> gf = GraphForge("my-graph.db")
        >>> # ... create nodes and edges ...
        >>> gf.close()  # Save to disk
    """
    if self.backend and not self._closed:
        # Auto-commit any pending transaction
        if self._in_transaction:
            self.commit()
        else:
            # Save changes if not in explicit transaction
            self._save_graph_to_backend()

        self.backend.close()
        self._closed = True
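
Because close() is safe to call multiple times and is the only cleanup hook documented here, the standard library's contextlib.closing can wrap a persistent instance. This is a sketch of that pattern, not a documented GraphForge context-manager API.

from contextlib import closing

from graphforge.api import GraphForge

# closing() calls gf.close() on exit, which saves the graph to disk.
with closing(GraphForge("my-graph.db")) as gf:
    gf.create_node(['Person'], name='Alice')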

clear()

Clear all graph data, resetting to an empty state.

Resets the graph, internal ID counters, and transaction state without recreating the parser, planner, optimizer, or executor. This allows reusing a GraphForge instance for a new workload with zero parsing overhead.

Raises:

RuntimeError: If the instance has been closed.

Examples:

>>> gf = GraphForge()
>>> gf.execute("CREATE (:Person {name: 'Alice'})")
>>> results = gf.execute("MATCH (n) RETURN count(n) AS c")
>>> results[0]['c'].value
1
>>> gf.clear()
>>> results = gf.execute("MATCH (n) RETURN count(n) AS c")
>>> results[0]['c'].value
0
Source code in src/graphforge/api.py
def clear(self) -> None:
    """Clear all graph data, resetting to an empty state.

    Resets the graph, internal ID counters, and transaction state without
    recreating the parser, planner, optimizer, or executor. This allows
    reusing a GraphForge instance for a new workload with zero parsing
    overhead.

    Raises:
        RuntimeError: If the instance has been closed

    Examples:
        >>> gf = GraphForge()
        >>> gf.execute("CREATE (:Person {name: 'Alice'})")
        >>> results = gf.execute("MATCH (n) RETURN count(n) AS c")
        >>> results[0]['c'].value
        1
        >>> gf.clear()
        >>> results = gf.execute("MATCH (n) RETURN count(n) AS c")
        >>> results[0]['c'].value
        0
    """
    if self._closed:
        raise RuntimeError("GraphForge instance has been closed")

    if self.backend is not None:
        if self._in_transaction:
            self.backend.rollback()
        raise RuntimeError(
            "Cannot clear a GraphForge instance with persistent storage. "
            "Use in-memory instances only (GraphForge() without path)."
        )

    # Reset graph data
    self.graph.clear()

    # Reset ID counters
    self._next_node_id = 1
    self._next_edge_id = 1

    # Reset transaction state
    self._in_transaction = False
    self._transaction_snapshot = None

    # Clear any custom functions registered on the executor
    self.executor.custom_functions.clear()
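Because clear() keeps the parser, planner, optimizer, and executor, it is well suited to workloads that rebuild small in-memory graphs many times. A minimal, illustrative sketch (the batches are made up):

gf = GraphForge()  # in-memory only; clear() raises for persistent instances

for batch in (["Alice", "Bob"], ["Carol"]):                 # hypothetical workloads
    for name in batch:
        gf.execute(f"CREATE (:Person {{name: '{name}'}})")
    rows = gf.execute("MATCH (n:Person) RETURN count(n) AS c")
    print(rows[0]["c"].value)                               # 2, then 1
    gf.clear()                                              # reset data, keep compiled machinery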

clone()

Create a deep copy of this GraphForge instance.

Creates a new GraphForge instance with a deep copy of graph state (nodes, edges, properties, indexes, ID counters) and fresh CypherParser, QueryPlanner, QueryOptimizer, and QueryExecutor instances. Only the compiled Lark grammar is shared, via the module-level @lru_cache on _get_lark_parser.

Returns:

Name Type Description
GraphForge GraphForge

A new instance with copied graph state

Raises:

Type Description
RuntimeError

If the instance has been closed or uses persistent storage

Examples:

>>> gf = GraphForge()
>>> gf.execute("CREATE (:Person {name: 'Alice'})")
>>> clone = gf.clone()
>>> clone.execute("CREATE (:Person {name: 'Bob'})")
>>> # Original has 1 node, clone has 2 nodes
>>> gf.execute("MATCH (n) RETURN count(n) AS c")[0]['c'].value
1
>>> clone.execute("MATCH (n) RETURN count(n) AS c")[0]['c'].value
2
Source code in src/graphforge/api.py
def clone(self) -> "GraphForge":
    """Create a deep copy of this GraphForge instance.

    Creates a new GraphForge instance with a deep copy of graph state
    (nodes, edges, properties, indexes, ID counters) and fresh
    CypherParser, QueryPlanner, QueryOptimizer, and QueryExecutor
    instances.  Only the compiled Lark grammar is shared, via the
    module-level ``@lru_cache`` on ``_get_lark_parser``.

    Returns:
        GraphForge: A new instance with copied graph state

    Raises:
        RuntimeError: If the instance has been closed or uses persistent storage

    Examples:
        >>> gf = GraphForge()
        >>> gf.execute("CREATE (:Person {name: 'Alice'})")
        >>> clone = gf.clone()
        >>> clone.execute("CREATE (:Person {name: 'Bob'})")
        >>> # Original has 1 node, clone has 2 nodes
        >>> gf.execute("MATCH (n) RETURN count(n) AS c")[0]['c'].value
        1
        >>> clone.execute("MATCH (n) RETURN count(n) AS c")[0]['c'].value
        2
    """
    if self._closed:
        raise RuntimeError("Cannot clone a closed GraphForge instance")

    if self.backend is not None:
        raise RuntimeError(
            "Cannot clone GraphForge instances with persistent storage. "
            "Use in-memory instances only (GraphForge() without path)."
        )

    # Create new instance with same configuration
    cloned = GraphForge(
        enable_optimizer=self.optimizer is not None,
    )

    # Manually copy graph state (deepcopy doesn't work well with defaultdicts)
    cloned.graph._nodes = copy.deepcopy(self.graph._nodes)
    cloned.graph._edges = copy.deepcopy(self.graph._edges)

    # Copy adjacency lists
    cloned.graph._outgoing = defaultdict(list)
    for node_id, edges in self.graph._outgoing.items():
        cloned.graph._outgoing[node_id] = copy.deepcopy(edges)

    cloned.graph._incoming = defaultdict(list)
    for node_id, edges in self.graph._incoming.items():
        cloned.graph._incoming[node_id] = copy.deepcopy(edges)

    # Copy indexes
    cloned.graph._label_index = defaultdict(set)
    for label, node_ids in self.graph._label_index.items():
        cloned.graph._label_index[label] = copy.copy(node_ids)

    cloned.graph._type_index = defaultdict(set)
    for edge_type, edge_ids in self.graph._type_index.items():
        cloned.graph._type_index[edge_type] = copy.copy(edge_ids)

    # Copy statistics
    cloned.graph._statistics = copy.deepcopy(self.graph._statistics)

    # Copy ID counters
    cloned._next_node_id = self._next_node_id
    cloned._next_edge_id = self._next_edge_id

    # Copy transaction state (should be False/None in typical usage)
    cloned._in_transaction = self._in_transaction
    cloned._transaction_snapshot = (
        copy.deepcopy(self._transaction_snapshot) if self._transaction_snapshot else None
    )

    # Note: Custom functions are intentionally NOT copied. Each clone gets
    # its own executor instance; custom functions must be re-registered on
    # the clone if needed.

    return cloned

AST (Abstract Syntax Tree)

graphforge.ast

Abstract Syntax Tree (AST) for openCypher queries.

This module contains AST node definitions for the supported openCypher subset (v1: MATCH, CREATE, WHERE, RETURN, LIMIT, SKIP), together with the additional clauses documented below (DELETE, MERGE, SET).

CreateClause

Bases: BaseModel

CREATE clause for creating graph elements.

Examples:

CREATE (n:Person {name: 'Alice'})
CREATE (a)-[r:KNOWS]->(b)

DeleteClause

Bases: BaseModel

DELETE clause for removing nodes and relationships.

Examples:

DELETE n
DETACH DELETE n
DELETE n, r

LimitClause

Bases: BaseModel

LIMIT clause for limiting result rows.

Examples:

LIMIT 10
LIMIT 100
LIMIT 0 (valid - returns no rows)

MatchClause

Bases: BaseModel

MATCH clause for pattern matching.

Examples:

MATCH (n:Person)
MATCH (a)-[r:KNOWS]->(b)

MergeClause

Bases: BaseModel

MERGE clause for creating or matching patterns.

Examples:

MERGE (n:Person {name: 'Alice'})
MERGE (n:Person {id: 1}) ON CREATE SET n.created = timestamp()
MERGE (n:Person {id: 1}) ON MATCH SET n.updated = timestamp()
MERGE (n:Person {id: 1}) ON CREATE SET n.created = 1 ON MATCH SET n.updated = 1
MERGE (a)-[r:KNOWS]->(b)

ReturnClause

Bases: BaseModel

RETURN clause for projection.

Examples:

RETURN n
RETURN n.name AS name, n.age AS age
RETURN count(n) AS count
RETURN DISTINCT n.name

SetClause

Bases: BaseModel

SET clause for updating properties.

Examples:

SET n.age = 30
SET n.age = 30, n.name = 'Alice'

validate_items(v) classmethod

Validate SET items format.

SkipClause

Bases: BaseModel

SKIP clause for offsetting results.

Examples:

SKIP 5
SKIP 20

WhereClause

Bases: BaseModel

WHERE clause for filtering.

Examples:

WHERE n.age > 30
WHERE n.name = "Alice" AND n.age < 50

BinaryOp

Bases: BaseModel

Binary operation expression.

Supports:
  • Comparisons: =, <>, <, >, <=, >=
  • Logical: AND, OR, XOR
  • Arithmetic: +, -, *, / (future)

validate_op(v) classmethod

Validate operator is supported.

Literal

Bases: BaseModel

Literal value expression.

Examples:

Literal(value=42), Literal(value="hello"), Literal(value=True), Literal(value=None)

validate_value(v) classmethod

Validate literal value is a supported type.

PropertyAccess

Bases: BaseModel

Property access expression.

Supports both variable property access and expression property access:
  • variable.property: PropertyAccess(variable="n", property="name")
  • {key: val}.property: PropertyAccess(base=Literal(...), property="key")
  • list[0].property: PropertyAccess(base=ListIndex(...), property="...")

For backward compatibility, 'variable' can still be used for simple cases. When 'base' is provided, it takes precedence over 'variable'.

validate_variable_or_base()

Ensure either variable or base is provided.

validate_identifier(v) classmethod

Validate identifier format.

Subscript

Bases: BaseModel

Subscript/slice expression for list access.

Supports:
  • Index access: list[0], list[-1]
  • Slice access: list[1..3], list[..2], list[2..], list[..]

Examples:

Subscript(base=Variable("list"), index=Literal(0))                  # list[0]
Subscript(base=Variable("list"), start=Literal(1), end=Literal(3))  # list[1..3]
Subscript(base=Variable("list"), start=None, end=Literal(2))        # list[..2]
Subscript(base=Variable("list"), start=Literal(2), end=None)        # list[2..]
Subscript(base=Variable("list"), start=None, end=None)              # list[..] (full slice)

validate_index_or_slice()

Ensure either index OR slice mode, not both.

Index mode: index is not None, start and end are None.
Slice mode: index is None, start and/or end can be any value (including both None for list[..]).

UnaryOp

Bases: BaseModel

Unary operation expression.

Supports:
  • Logical: NOT
  • Arithmetic: - (negation, future)

validate_op(v) classmethod

Validate operator is supported.

Variable

Bases: BaseModel

Variable reference expression.

Examples:

Variable(name="n"), Variable(name="person"), Variable(name="r")

validate_name(v) classmethod

Validate variable name format.

Wildcard

Bases: BaseModel

Wildcard expression for RETURN * and WITH *.

Represents the special * syntax that expands to all variables in scope.

Direction

Bases: Enum

Relationship direction in pattern matching.

NodePattern

Bases: BaseModel

AST node for matching graph nodes.

Represents a node pattern like: (n:Person {name: "Alice"})

Attributes:

Name Type Description
variable str | None

Variable name to bind the node (None for anonymous)

labels list[list[str]]

List of label groups (disjunction of conjunctions).
Example: [['Person']] - must have 'Person'
Example: [['Person', 'Employee']] - must have both
Example: [['Person'], ['Company']] - must have Person OR Company

properties dict[str, Any]

Dict of property constraints (property_name -> Expression)

validate_variable(v) classmethod

Validate variable name format if provided.

validate_labels(v) classmethod

Validate label names.
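The labels field expresses label OR as an outer list and label AND as an inner list. A sketch under the assumption that NodePattern can be constructed directly with the attributes listed above (exact field defaults may differ):

from graphforge.ast import NodePattern  # import location assumed from this page's module heading

# Node must have the Person label
p1 = NodePattern(variable="n", labels=[["Person"]], properties={})

# Node must have both Person AND Employee
p2 = NodePattern(variable="n", labels=[["Person", "Employee"]], properties={})

# Node must have Person OR Company
p3 = NodePattern(variable="n", labels=[["Person"], ["Company"]], properties={})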

RelationshipPattern

Bases: BaseModel

AST node for matching relationships.

Represents a relationship pattern like: -[r:KNOWS {since: 2020}]->
Or variable-length: -[r:KNOWS*1..3]->
Or with predicate: -[r:KNOWS WHERE r.since > 2020]->

Attributes:

Name Type Description
variable str | None

Variable name to bind the relationship (None for anonymous)

types list[str]

List of relationship types to match

direction Direction

Direction of the relationship

properties dict[str, Any]

Dict of property constraints (property_name -> Expression)

min_hops int | None

Minimum hops for variable-length (None for single-hop)

max_hops int | None

Maximum hops for variable-length (None for unbounded)

predicate Any | None

WHERE predicate expression inside pattern (None if not specified)

validate_variable(v) classmethod

Validate variable name format if provided.

validate_types(v) classmethod

Validate relationship type names.

validate_min_hops(v) classmethod

Validate minimum hops.

validate_max_hops(v) classmethod

Validate maximum hops.

CypherQuery dataclass

Root AST node for an openCypher query.

A query consists of a sequence of clauses:
  • MATCH
  • WHERE
  • WITH
  • RETURN
  • LIMIT
  • SKIP

Examples:

MATCH (n:Person) RETURN n
MATCH (n) WHERE n.age > 30 RETURN n.name LIMIT 10
MATCH (n) WITH n ORDER BY n.age MATCH (n)-[r]->(m) RETURN n, m

Parser

graphforge.parser

openCypher query parser.

This module contains the parser for converting openCypher query strings into AST representations.

Query Planner

graphforge.planner

Query planner and logical plan operators.

This module contains the query planner that converts AST into logical execution plans, and the logical plan operator definitions.

ExpandEdges

Bases: BaseModel

Operator for expanding (traversing) relationships.

Follows relationships from source nodes to destination nodes.

Attributes:

Name Type Description
src_var str

Variable name for source nodes

edge_var str | None

Variable name to bind edges to

dst_var str

Variable name to bind destination nodes to

path_var str | None

Variable name to bind full path to (None if not needed)

edge_types list[str]

List of edge types to match

direction str

Direction to traverse ('OUT', 'IN', 'UNDIRECTED')

predicate Any | None

WHERE predicate expression to filter edges (None if not specified)

agg_hint AggregationHint | None

Optional hint for incremental aggregation during traversal

validate_direction(v) classmethod

Validate direction is valid.

Filter

Bases: BaseModel

Operator for filtering rows based on a predicate.

Evaluates a boolean expression and keeps only rows where it's true.

Attributes:

Name Type Description
predicate Any

Expression AST node to evaluate

Limit

Bases: BaseModel

Operator for limiting the number of result rows.

Attributes:

Name Type Description
count int

Maximum number of rows to return

Project

Bases: BaseModel

Operator for projecting (selecting) return items.

Evaluates expressions and returns specified columns with optional aliases.

Attributes:

Name Type Description
items list[Any]

List of ReturnItem AST nodes (expression + optional alias)

ScanNodes

Bases: BaseModel

Operator for scanning nodes.

Scans all nodes or filters by labels.

Attributes:

Name Type Description
variable str

Variable name to bind nodes to

labels list[list[str]] | None

Optional list of label groups (disjunction of conjunctions).
Example: [['Person']] - must have 'Person'
Example: [['Person', 'Employee']] - must have both
Example: [['Person'], ['Company']] - must have Person OR Company

path_var str | None

Variable name to bind single-node path to (None if not needed)

predicate Any | None

WHERE predicate expression to filter nodes (None if not specified)

validate_variable(v) classmethod

Validate variable name.

Skip

Bases: BaseModel

Operator for skipping result rows.

Attributes:

Name Type Description
count int

Number of rows to skip
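For orientation, these operators compose into simple pipelines. The comments below sketch the rough plan a basic query might correspond to; the plan actually produced by QueryPlanner and QueryOptimizer may differ.

gf = GraphForge()
gf.execute("CREATE (:Person {name: 'Alice', age: 42})")

# Conceptual plan for the query below (illustrative, not the real planner output):
#   ScanNodes(variable='n', labels=[['Person']])
#   Filter(predicate=<n.age > 30>)
#   Project(items=[<n.name AS name>])
#   Limit(count=10)
rows = gf.execute("MATCH (n:Person) WHERE n.age > 30 RETURN n.name AS name LIMIT 10")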

Executor

graphforge.executor

Query execution engine.

This module contains the execution engine that executes logical plans against graph stores.

ExecutionContext

Context for query execution.

Maintains variable bindings during query execution.

Attributes:

Name Type Description
bindings dict[str, Any]

Dictionary mapping variable names to values

__init__()

Initialize empty execution context.

bind(name, value)

Bind a variable to a value.

Parameters:

Name Type Description Default
name str

Variable name

required
value Any

Value to bind (NodeRef, EdgeRef, CypherValue)

required

get(name)

Get a variable's value.

Parameters:

Name Type Description Default
name str

Variable name

required

Returns:

Type Description
Any

The bound value

Raises:

Type Description
KeyError

If variable is not bound

has(name)

Check if a variable is bound.

Parameters:

Name Type Description Default
name str

Variable name

required

Returns:

Type Description
bool

True if variable is bound

evaluate_expression(expr, ctx, executor=None)

Evaluate an AST expression in a context.

Parameters:

Name Type Description Default
expr Any

AST expression node

required
ctx ExecutionContext

Execution context with variable bindings

required
executor Any

Optional QueryExecutor instance for subquery evaluation

None

Returns:

Type Description
CypherValue

CypherValue result

Raises:

Type Description
KeyError

If a referenced variable is not bound

TypeError

If expression type is not supported
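A minimal sketch of the binding-and-evaluation flow, assuming ExecutionContext and evaluate_expression are importable from graphforge.executor and the expression nodes from graphforge.ast:

from graphforge.ast import Literal, Variable
from graphforge.executor import ExecutionContext, evaluate_expression
from graphforge.types import CypherInt

ctx = ExecutionContext()
ctx.bind("x", CypherInt(42))                            # bind a variable to a CypherValue

assert ctx.has("x")
bound = evaluate_expression(Variable(name="x"), ctx)    # expected to return the bound CypherInt
lit = evaluate_expression(Literal(value=1), ctx)        # literals need no bindings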

Storage

graphforge.storage

Storage backends for GraphForge.

This module contains storage implementations:
  • In-memory graph store
  • SQLite persistent storage backend
  • TWO separate serialization systems (see below)

CRITICAL: Two Serialization Systems

GraphForge uses TWO separate serialization systems for different purposes:

  1. SQLite + MessagePack (serialization.py)
     • For: Graph data (nodes, edges, properties)
     • Format: Binary MessagePack (fast, compact)
     • Types: CypherValue types (CypherInt, CypherString, etc.)
     • Storage: SQLite database files (*.db)
     • Use for: Runtime graph operations

  2. Pydantic + JSON (pydantic_serialization.py)
     • For: Metadata, schemas, dataset definitions
     • Format: JSON (human-readable, validatable)
     • Types: Pydantic models (DatasetInfo, AST nodes, ontologies)
     • Storage: JSON metadata files (*.json)
     • Use for: Configuration and schema definitions
NEVER mix these systems:
  • Don't serialize graph data with Pydantic (performance disaster)
  • Don't serialize metadata with MessagePack (loses validation)

See CLAUDE.md "Two Serialization Systems" for a detailed explanation.
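To make the split concrete, a short contrast of the two paths (import locations are assumed from this page's module headings):

from graphforge.types import CypherString
from graphforge.storage import serialize_properties, deserialize_properties  # MessagePack path
from graphforge.storage import serialize_model_to_json                       # Pydantic/JSON path
from graphforge.datasets.base import DatasetInfo

# Graph data: CypherValue properties -> compact MessagePack bytes
blob = serialize_properties({"name": CypherString("Alice")})
props = deserialize_properties(blob)            # back to a {str: CypherValue} dict

# Metadata: Pydantic model -> validated, human-readable JSON
info = DatasetInfo(name="demo", description="Demo", source="test",
                   url="https://example.com/demo.csv", nodes=1, edges=0,
                   size_mb=0.1, license="MIT", category="test", loader_class="csv")
json_str = serialize_model_to_json(info)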

Graph

In-memory graph store with adjacency list representation.

The graph maintains several indexes for efficient queries:
  • Node storage: id -> NodeRef
  • Edge storage: id -> EdgeRef
  • Outgoing edges: node_id -> [EdgeRef]
  • Incoming edges: node_id -> [EdgeRef]
  • Label index: label -> {node_id}
  • Type index: edge_type -> {edge_id}

Examples:

>>> graph = Graph()
>>> node = NodeRef(id=1, labels=frozenset(["Person"]), properties={})
>>> graph.add_node(node)
>>> graph.node_count()
1
>>> graph.get_node(1) == node
True

__init__()

Initialize an empty graph.

add_node(node)

Add a node to the graph.

Parameters:

Name Type Description Default
node NodeRef

The node to add

required
Note

If a node with this ID already exists, it will be replaced.

get_node(node_id)

Get a node by its ID.

Parameters:

Name Type Description Default
node_id int | str

The node ID to retrieve

required

Returns:

Type Description
NodeRef | None

The NodeRef if found, None otherwise

has_node(node_id)

Check if a node exists in the graph.

Parameters:

Name Type Description Default
node_id int | str

The node ID to check

required

Returns:

Type Description
bool

True if the node exists, False otherwise

node_count()

Get the number of nodes in the graph.

Returns:

Type Description
int

The number of nodes

get_all_nodes()

Get all nodes in the graph.

Returns:

Type Description
list[NodeRef]

List of all nodes

get_nodes_by_label(label)

Get all nodes with a specific label.

Parameters:

Name Type Description Default
label str

The label to filter by

required

Returns:

Type Description
list[NodeRef]

List of nodes with the specified label

get_statistics()

Get current graph statistics for cost-based optimization.

Returns:

Type Description
GraphStatistics

GraphStatistics instance with current statistics

add_edge(edge)

Add an edge to the graph.

Parameters:

Name Type Description Default
edge EdgeRef

The edge to add

required

Raises:

Type Description
ValueError

If source or destination node doesn't exist

Note

If an edge with this ID already exists, it will be replaced.

get_edge(edge_id)

Get an edge by its ID.

Parameters:

Name Type Description Default
edge_id int | str

The edge ID to retrieve

required

Returns:

Type Description
EdgeRef | None

The EdgeRef if found, None otherwise

has_edge(edge_id)

Check if an edge exists in the graph.

Parameters:

Name Type Description Default
edge_id int | str

The edge ID to check

required

Returns:

Type Description
bool

True if the edge exists, False otherwise

edge_count()

Get the number of edges in the graph.

Returns:

Type Description
int

The number of edges

get_all_edges()

Get all edges in the graph.

Returns:

Type Description
list[EdgeRef]

List of all edges

get_edges_by_type(edge_type)

Get all edges of a specific type.

Parameters:

Name Type Description Default
edge_type str

The edge type to filter by

required

Returns:

Type Description
list[EdgeRef]

List of edges with the specified type

get_outgoing_edges(node_id)

Get all edges going out from a node.

Parameters:

Name Type Description Default
node_id int | str

The source node ID

required

Returns:

Type Description
list[EdgeRef]

List of outgoing edges (empty list if node doesn't exist)

get_incoming_edges(node_id)

Get all edges coming into a node.

Parameters:

Name Type Description Default
node_id int | str

The destination node ID

required

Returns:

Type Description
list[EdgeRef]

List of incoming edges (empty list if node doesn't exist)

clear()

Clear all graph data, resetting to an empty state.

Removes all nodes, edges, indexes, and statistics. This is equivalent to creating a new Graph() but reuses the same object.

snapshot()

Create a snapshot of the current graph state.

Returns:

Type Description
dict

Dictionary containing all graph data for restoration

Note

This creates a deep copy of all internal structures to support transaction rollback. For large graphs, this may be memory intensive.

restore(snapshot)

Restore graph state from a snapshot.

Parameters:

Name Type Description Default
snapshot dict

Snapshot dictionary created by snapshot()

required
Note

Completely replaces the current graph state with the snapshot state.
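snapshot() and restore() are what the high-level transaction support builds on. A minimal sketch using the constructors shown in the Examples above:

graph = Graph()
graph.add_node(NodeRef(id=1, labels=frozenset(["Person"]), properties={}))

snap = graph.snapshot()        # deep copy of the current state

graph.add_node(NodeRef(id=2, labels=frozenset(["Person"]), properties={}))
assert graph.node_count() == 2

graph.restore(snap)            # roll back to the snapshot
assert graph.node_count() == 1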

SQLiteBackend

SQLite storage backend with WAL mode for durability.

Provides ACID guarantees and concurrent readers through SQLite's WAL (Write-Ahead Logging) mode.

Schema
  • nodes: (id, labels, properties)
  • edges: (id, type, src_id, dst_id, properties)
  • adjacency_out: (node_id, edge_id) for outgoing edges
  • adjacency_in: (node_id, edge_id) for incoming edges

__init__(db_path)

Initialize SQLite backend.

Parameters:

Name Type Description Default
db_path Path

Path to SQLite database file

required

save_node(node)

Save a node to the database.

Parameters:

Name Type Description Default
node NodeRef

NodeRef to save

required

save_edge(edge)

Save an edge to the database.

Parameters:

Name Type Description Default
edge EdgeRef

EdgeRef to save

required

load_all_nodes()

Load all nodes from the database.

Returns:

Type Description
list[NodeRef]

List of NodeRef instances

load_all_edges()

Load all edges from the database.

Returns edge data as a dict mapping edge_id to (type, src_id, dst_id, properties). Caller must reconstruct EdgeRef with actual NodeRef instances.

Returns:

Type Description
dict[int, tuple]

Dict mapping edge_id to (type, src_id, dst_id, properties)

load_adjacency_out()

Load outgoing adjacency lists.

Returns:

Type Description
dict[int, list[int]]

Dict mapping node_id to list of outgoing edge_ids

load_adjacency_in()

Load incoming adjacency lists.

Returns:

Type Description
dict[int, list[int]]

Dict mapping node_id to list of incoming edge_ids

get_next_node_id()

Get the next available node ID.

Returns:

Type Description
int

Next node ID (max + 1, or 1 if no nodes exist)

get_next_edge_id()

Get the next available edge ID.

Returns:

Type Description
int

Next edge ID (max + 1, or 1 if no edges exist)

commit()

Commit the current transaction.

rollback()

Roll back the current transaction.

close()

Close the database connection. Safe to call multiple times.

save_statistics(stats)

Save graph statistics to the database.

Parameters:

Name Type Description Default
stats GraphStatistics

GraphStatistics instance to save

required

load_statistics()

Load graph statistics from the database.

Returns:

Type Description
GraphStatistics | None

GraphStatistics instance if found, None otherwise
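Most code goes through GraphForge("file.db") rather than the backend directly, but it can be exercised on its own. A sketch assuming SQLiteBackend is importable from graphforge.storage and NodeRef from graphforge.types:

from pathlib import Path
from graphforge.storage import SQLiteBackend
from graphforge.types import NodeRef

backend = SQLiteBackend(Path("my-graph.db"))
backend.save_node(NodeRef(id=1, labels=frozenset(["Person"]), properties={}))
backend.commit()                           # make the write durable

nodes = backend.load_all_nodes()           # list[NodeRef]
next_id = backend.get_next_node_id()       # 2 for a fresh database
backend.close()                            # safe to call multiple times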

deserialize_model(model_class, data)

Deserialize a dictionary to a Pydantic model.

Uses Pydantic's model_validate() method which runs all validators and ensures the data matches the model schema.

Parameters:

Name Type Description Default
model_class type[T]

Pydantic BaseModel class to deserialize to

required
data dict[str, Any]

Dictionary representation of the model

required

Returns:

Type Description
T

Model instance

Raises:

Type Description
ValidationError

If data doesn't match model schema

Examples:

>>> from graphforge.datasets.base import DatasetInfo
>>> data = {
...     "name": "test-dataset",
...     "description": "Test dataset",
...     "source": "test",
...     "url": "https://example.com/data.csv",
...     "nodes": 100,
...     "edges": 200,
...     "size_mb": 1.5,
...     "license": "MIT",
...     "category": "test",
...     "loader_class": "csv"
... }
>>> info = deserialize_model(DatasetInfo, data)
>>> info.name
'test-dataset'

deserialize_model_from_json(model_class, json_str)

Deserialize a JSON string to a Pydantic model.

Parameters:

Name Type Description Default
model_class type[T]

Pydantic BaseModel class to deserialize to

required
json_str str

JSON string representation

required

Returns:

Type Description
T

Model instance

Raises:

Type Description
ValidationError

If JSON doesn't match model schema

Examples:

>>> from graphforge.datasets.base import DatasetInfo
>>> json_str = '''
... {
...     "name": "test",
...     "description": "Test",
...     "source": "test",
...     "url": "https://example.com/data.csv",
...     "nodes": 100,
...     "edges": 200,
...     "size_mb": 1.5,
...     "license": "MIT",
...     "category": "test",
...     "loader_class": "csv"
... }
... '''
>>> info = deserialize_model_from_json(DatasetInfo, json_str)
>>> info.name
'test'

deserialize_models_batch(model_class, data_list)

Deserialize a batch of dictionaries to Pydantic models.

Parameters:

Name Type Description Default
model_class type[T]

Pydantic BaseModel class to deserialize to

required
data_list list[dict[str, Any]]

List of dictionary representations

required

Returns:

Type Description
list[T]

List of model instances

Raises:

Type Description
ValidationError

If any dictionary doesn't match model schema

Examples:

>>> from graphforge.datasets.base import DatasetInfo
>>> data_list = [
...     {"name": "test1", "description": "Test 1", "source": "test",
...      "url": "https://example.com/1.csv", "nodes": 100, "edges": 200,
...      "size_mb": 1.5, "license": "MIT", "category": "test",
...      "loader_class": "csv"},
...     {"name": "test2", "description": "Test 2", "source": "test",
...      "url": "https://example.com/2.csv", "nodes": 200, "edges": 400,
...      "size_mb": 2.5, "license": "MIT", "category": "test",
...      "loader_class": "csv"}
... ]
>>> datasets = deserialize_models_batch(DatasetInfo, data_list)
>>> len(datasets)
2

load_model_from_file(model_class, path)

Load a Pydantic model from a JSON file.

Parameters:

Name Type Description Default
model_class type[T]

Pydantic BaseModel class to deserialize to

required
path Path | str

File path to load from

required

Returns:

Type Description
T

Model instance

Raises:

Type Description
FileNotFoundError

If file doesn't exist

ValidationError

If file content doesn't match model schema

Examples:

>>> from graphforge.datasets.base import DatasetInfo
>>> # Assuming dataset.json exists
>>> info = load_model_from_file(DatasetInfo, "/tmp/dataset.json")

load_models_batch_from_file(model_class, path)

Load a batch of Pydantic models from a JSON file.

Parameters:

Name Type Description Default
model_class type[T]

Pydantic BaseModel class to deserialize to

required
path Path | str

File path to load from

required

Returns:

Type Description
list[T]

List of model instances

Raises:

Type Description
FileNotFoundError

If file doesn't exist

ValidationError

If file content doesn't match model schema

Examples:

>>> from graphforge.datasets.base import DatasetInfo
>>> # Assuming datasets.json exists
>>> datasets = load_models_batch_from_file(DatasetInfo, "/tmp/datasets.json")

save_model_to_file(model, path, indent=2)

Save a Pydantic model to a JSON file.

Parameters:

Name Type Description Default
model BaseModel

Pydantic BaseModel instance

required
path Path | str

File path to save to

required
indent int | None

JSON indentation (None for compact, 2 for pretty)

2

Examples:

>>> from graphforge.datasets.base import DatasetInfo
>>> info = DatasetInfo(
...     name="test",
...     description="Test",
...     source="test",
...     url="https://example.com/data.csv",
...     nodes=100,
...     edges=200,
...     size_mb=1.5,
...     license="MIT",
...     category="test",
...     loader_class="csv"
... )
>>> save_model_to_file(info, "/tmp/dataset.json")

save_models_batch_to_file(models, path, indent=2)

Save a batch of Pydantic models to a JSON file.

Parameters:

Name Type Description Default
models list[BaseModel]

List of Pydantic BaseModel instances

required
path Path | str

File path to save to

required
indent int | None

JSON indentation (None for compact, 2 for pretty)

2

Examples:

>>> from graphforge.datasets.base import DatasetInfo
>>> datasets = [
...     DatasetInfo(name="test1", description="Test 1", source="test",
...                 url="https://example.com/1.csv", nodes=100, edges=200,
...                 size_mb=1.5, license="MIT", category="test",
...                 loader_class="csv")
... ]
>>> save_models_batch_to_file(datasets, "/tmp/datasets.json")

serialize_model(model)

Serialize a Pydantic model to a dictionary.

Uses Pydantic's model_dump() method with mode='python' to ensure all types are JSON-serializable.

Parameters:

Name Type Description Default
model BaseModel

Pydantic BaseModel instance

required

Returns:

Type Description
dict[str, Any]

Dictionary representation of the model

Examples:

>>> from graphforge.datasets.base import DatasetInfo
>>> info = DatasetInfo(
...     name="test-dataset",
...     description="Test dataset",
...     source="test",
...     url="https://example.com/data.csv",
...     nodes=100,
...     edges=200,
...     size_mb=1.5,
...     license="MIT",
...     category="test",
...     loader_class="csv"
... )
>>> data = serialize_model(info)
>>> data['name']
'test-dataset'

serialize_model_to_json(model, indent=2)

Serialize a Pydantic model to JSON string.

Parameters:

Name Type Description Default
model BaseModel

Pydantic BaseModel instance

required
indent int | None

JSON indentation (None for compact, 2 for pretty)

2

Returns:

Type Description
str

JSON string representation

Examples:

>>> from graphforge.datasets.base import DatasetInfo
>>> info = DatasetInfo(
...     name="test",
...     description="Test",
...     source="test",
...     url="https://example.com/data.csv",
...     nodes=100,
...     edges=200,
...     size_mb=1.5,
...     license="MIT",
...     category="test",
...     loader_class="csv"
... )
>>> json_str = serialize_model_to_json(info)
>>> '"name": "test"' in json_str
True

serialize_models_batch(models)

Serialize a batch of Pydantic models to dictionaries.

Useful for serializing lists of dataset metadata, AST nodes, etc.

Parameters:

Name Type Description Default
models list[BaseModel]

List of Pydantic BaseModel instances

required

Returns:

Type Description
list[dict[str, Any]]

List of dictionary representations

Examples:

>>> from graphforge.datasets.base import DatasetInfo
>>> datasets = [
...     DatasetInfo(name="test1", description="Test 1", source="test",
...                 url="https://example.com/1.csv", nodes=100, edges=200,
...                 size_mb=1.5, license="MIT", category="test",
...                 loader_class="csv"),
...     DatasetInfo(name="test2", description="Test 2", source="test",
...                 url="https://example.com/2.csv", nodes=200, edges=400,
...                 size_mb=2.5, license="MIT", category="test",
...                 loader_class="csv")
... ]
>>> data = serialize_models_batch(datasets)
>>> len(data)
2

deserialize_cypher_value(data)

Deserialize a dict to a CypherValue.

Parameters:

Name Type Description Default
data dict

Dict with 'type' and optional 'value' keys

required

Returns:

Type Description
CypherValue

CypherValue instance

deserialize_labels(data)

Deserialize bytes to labels.

Parameters:

Name Type Description Default
data bytes

MessagePack encoded bytes

required

Returns:

Type Description
frozenset[str]

Frozenset of label strings

deserialize_properties(data)

Deserialize bytes to a properties dict.

Parameters:

Name Type Description Default
data bytes

MessagePack encoded bytes

required

Returns:

Type Description
dict

Dict mapping str to CypherValue

serialize_cypher_value(value)

Serialize a CypherValue to a dict for msgpack.

Parameters:

Name Type Description Default
value CypherValue

CypherValue instance

required

Returns:

Type Description
dict

Dict with 'type' and 'value' keys

serialize_labels(labels)

Serialize labels to bytes.

Parameters:

Name Type Description Default
labels frozenset[str]

Frozenset of label strings

required

Returns:

Type Description
bytes

MessagePack encoded bytes

serialize_properties(properties)

Serialize a properties dict to bytes.

Parameters:

Name Type Description Default
properties dict

Dict mapping str to CypherValue

required

Returns:

Type Description
bytes

MessagePack encoded bytes
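These helpers round-trip; for example (assuming they are importable from graphforge.storage and CypherInt from graphforge.types):

from graphforge.storage import (serialize_labels, deserialize_labels,
                                serialize_cypher_value, deserialize_cypher_value)
from graphforge.types import CypherInt

labels = deserialize_labels(serialize_labels(frozenset({"Person", "Employee"})))
# labels == frozenset({'Person', 'Employee'})

payload = serialize_cypher_value(CypherInt(7))    # dict with 'type' and 'value' keys
value = deserialize_cypher_value(payload)         # back to a CypherInt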

Types

graphforge.types

Type system for GraphForge.

This module contains the runtime type system including:
  • CypherValue types (null, int, float, bool, string, list, map, path)
  • Temporal types (date, datetime, time, duration)
  • Spatial types (point, distance)
  • Graph elements (NodeRef, EdgeRef)

EdgeRef dataclass

Runtime reference to a relationship (edge) in the graph.

Edges have:
  • A unique stable ID
  • A relationship type (string)
  • A source node (NodeRef)
  • A destination node (NodeRef)
  • Intrinsic directionality (src -> dst)
  • Zero or more properties (dict mapping string to CypherValue)

Identity is defined by ID - two EdgeRefs with the same ID are considered the same relationship, even if they have different types or endpoints.

Edges are immutable and hashable, allowing them to be used in sets and as dictionary keys.

Examples:

>>> alice = NodeRef(id=1, labels=frozenset(["Person"]), properties={})
>>> bob = NodeRef(id=2, labels=frozenset(["Person"]), properties={})
>>> edge = EdgeRef(
...     id=10,
...     type="KNOWS",
...     src=alice,
...     dst=bob,
...     properties={"since": CypherInt(2020)}
... )
>>> edge.type
'KNOWS'
>>> edge.src.id
1
>>> edge.dst.id
2

__hash__()

Hash based on ID only.

__eq__(other)

Equality based on ID only.

__repr__()

Readable string representation.

NodeRef dataclass

Runtime reference to a node in the graph.

Nodes have:
  • A unique stable ID
  • Zero or more labels (frozenset of strings)
  • Zero or more properties (dict mapping string to CypherValue)

Identity is defined by ID - two NodeRefs with the same ID are considered the same node, even if they have different labels or properties.

Nodes are immutable and hashable, allowing them to be used in sets and as dictionary keys.

Examples:

>>> node = NodeRef(
...     id=1,
...     labels=frozenset(["Person", "Employee"]),
...     properties={"name": CypherString("Alice"), "age": CypherInt(30)}
... )
>>> node.id
1
>>> "Person" in node.labels
True
>>> node.properties["name"].value
'Alice'

__hash__()

Hash based on ID only.

__eq__(other)

Equality based on ID only.

__repr__()

Readable string representation.

CypherBool

Bases: CypherValue

Represents a boolean value in openCypher.

CypherDate

Bases: CypherValue

Represents a date value in openCypher.

Stores a Python datetime.date object. Supports ISO 8601 date strings.

CypherDateTime

Bases: CypherValue

Represents a datetime value in openCypher.

Stores a Python datetime.datetime object with timezone support. Supports ISO 8601 datetime strings.

CypherDistance

Bases: CypherValue

Represents a distance value in openCypher.

Stores a float representing the Euclidean distance between two points. For WGS-84 points, uses the Haversine formula to compute great-circle distance.

__init__(value)

Initialize a distance value.

Parameters:

Name Type Description Default
value float

Distance as a float (must be non-negative)

required

Raises:

Type Description
ValueError

If distance is negative

CypherDuration

Bases: CypherValue

Represents a duration value in openCypher.

Stores a Python datetime.timedelta or isodate.Duration object. Supports ISO 8601 duration strings. When parsing ISO 8601 strings with years/months (e.g., "P1Y2M"), stores an isodate.Duration; otherwise stores a datetime.timedelta.

CypherFloat

Bases: CypherValue

Represents a floating-point value in openCypher.

CypherInt

Bases: CypherValue

Represents an integer value in openCypher.

CypherList

Bases: CypherValue

Represents a list value in openCypher.

CypherMap

Bases: CypherValue

Represents a map (dictionary) value in openCypher.

CypherNull

Bases: CypherValue

Represents NULL in openCypher.

CypherPath

Bases: CypherValue

Represents a path through the graph.

A path consists of an alternating sequence of nodes and relationships: node -> edge -> node -> edge -> node

Attributes:

Name Type Description
nodes

List of NodeRef objects in the path (N nodes)

relationships

List of EdgeRef objects connecting the nodes (N-1 relationships)

The path structure ensures that:
  • len(relationships) == len(nodes) - 1
  • relationships[i] connects nodes[i] to nodes[i+1]

Examples:

>>> # Path with 3 nodes: A -> B -> C
>>> path = CypherPath(
...     nodes=[node_a, node_b, node_c],
...     relationships=[edge_ab, edge_bc]
... )
>>> path.length()  # Number of relationships
2
>>> len(path.nodes)
3

__init__(nodes, relationships)

Initialize a path from nodes and relationships.

Parameters:

Name Type Description Default
nodes list[NodeRef]

List of NodeRef objects in the path

required
relationships list[EdgeRef]

List of EdgeRef objects connecting the nodes

required

Raises:

Type Description
ValueError

If path structure is invalid

length()

Return the length of the path (number of relationships).

__repr__()

Readable string representation.

CypherPoint

Bases: CypherValue

Represents a point value in openCypher.

Stores a dictionary with coordinate information. Supports:
  • 2D Cartesian: {"x": float, "y": float, "crs": "cartesian"}
  • 3D Cartesian: {"x": float, "y": float, "z": float, "crs": "cartesian-3d"}
  • WGS84 Geographic: {"latitude": float, "longitude": float, "crs": "wgs-84"}

The coordinate reference system (crs) is optional and inferred from keys.

__init__(coordinates)

Initialize a point from coordinate dictionary.

Parameters:

Name Type Description Default
coordinates dict[str, float]

Dict with coordinate keys (x/y or latitude/longitude)

required

Raises:

Type Description
ValueError

If coordinates are invalid or incomplete
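A short sketch of the three accepted coordinate forms (assuming CypherPoint is importable from graphforge.types and accepts the coordinates dict as its single argument):

from graphforge.types import CypherPoint

p2d = CypherPoint({"x": 1.0, "y": 2.0})                        # 2D Cartesian
p3d = CypherPoint({"x": 1.0, "y": 2.0, "z": 3.0})              # 3D Cartesian
geo = CypherPoint({"latitude": 52.52, "longitude": 13.405})    # WGS-84 geographic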

CypherString

Bases: CypherValue

Represents a string value in openCypher.

CypherTime

Bases: CypherValue

Represents a time value in openCypher.

Stores a Python datetime.time object with timezone support. Supports ISO 8601 time strings.

CypherType

Bases: Enum

openCypher value types.

CypherValue

Base class for all openCypher values.

All comparison operations follow openCypher semantics:
  • NULL in any operation propagates NULL
  • Type-aware comparisons

equals(other)

Check equality following openCypher semantics.

Returns:

Type Description
CypherValue

CypherBool(True) if equal

CypherValue

CypherBool(False) if not equal

CypherValue

CypherNull if either operand is NULL

less_than(other)

Check if this value is less than another.

Returns:

Type Description
CypherValue

CypherBool(True/False) for comparable types

CypherValue

CypherNull if either operand is NULL

to_python()

Convert this Cypher value to a Python value.

from_python(value)

Convert a Python value to a CypherValue.

Parameters:

Name Type Description Default
value Any

Python value to convert

required

Returns:

Type Description
CypherValue

Appropriate CypherValue subclass

Raises:

Type Description
TypeError

If the value type is not supported


This documentation is automatically generated from source code docstrings.