# Source code for ccsds_ndm.kvn_tokenizer

# CCSDS-NDM: CCSDS Navigation Data Messages Read/Write Library
#
# Copyright (C) Egemen Imre
#
# Licensed under GNU GPL v3.0. See LICENSE for more info.
"""
KVN tokenizer: convert raw KVN text into a list of classified line objects.

This module provides:
  - :class:`KvnLine` and its subclasses for representing each line format
  - :func:`tokenize` to convert raw KVN source into ``KvnLine`` objects
"""

from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass, field

# A line consisting solely of one of these keywords is classified as a
# section marker by ``tokenize``.  Every section name contributes a
# ``<NAME>_START`` / ``<NAME>_STOP`` pair.
_SECTION_MARKERS = frozenset(
    f"{section}_{edge}"
    for section in ("META", "DATA", "COVARIANCE")
    for edge in ("START", "STOP")
)



@dataclass
class KvnLine(ABC):
    """
    Abstract base class for a single tokenised KVN line.

    Each concrete subclass models one distinct line format found in KVN
    files and implements :meth:`to_str`, which renders the line back to a
    canonical KVN string.  Round-trip writing is therefore straightforward:
    build a list of ``KvnLine`` instances, call ``to_str()`` on each, and
    join the results with newlines.

    The base class declares no fields of its own and cannot be instantiated
    directly because :meth:`to_str` is abstract.
    """

    @abstractmethod
    def to_str(self, **kwargs) -> str:
        """
        Render this line to a KVN string (no trailing newline).

        Subclasses accept keyword arguments relevant to their own format.
        Unknown kwargs are silently ignored, so callers can pass a common set
        (e.g. ``key_width=24``) to all line types without special-casing.

        Returns
        -------
        str
            The rendered line.
        """





@dataclass
class BlankLine(KvnLine):
    """A whitespace-only (or completely empty) line."""

    def to_str(self, **kwargs) -> str:
        """
        Render the blank line.

        All keyword arguments are accepted and ignored.

        Returns
        -------
        str
            Always the empty string; any original whitespace is not kept.
        """
        return ""





@dataclass
class SectionMarkerLine(KvnLine):
    """
    A block-delimiter keyword.

    Examples: ``META_START``, ``META_STOP``, ``DATA_START``, ``DATA_STOP``,
    ``COVARIANCE_START``, ``COVARIANCE_STOP``.

    Attributes
    ----------
    key : str
        The marker keyword exactly as it is written on the line.
    """

    key: str = ""

    def to_str(self, **kwargs) -> str:
        """
        Render the marker.

        All keyword arguments are accepted and ignored.

        Returns
        -------
        str
            The bare keyword stored in :attr:`key`.
        """
        return self.key





@dataclass
class CommentLine(KvnLine):
    """
    A ``COMMENT`` line.

    The comment text is held in :attr:`text`.  This class stores the text
    exactly as given; it is :func:`tokenize` that strips surrounding
    whitespace and reduces both the plain (``COMMENT text``) and equals
    (``COMMENT = text``) variants to the bare text.  :meth:`to_str` always
    writes the plain form.

    Attributes
    ----------
    text : str
        The comment text, without the ``COMMENT`` keyword.
    """

    text: str = ""

    def to_str(self, **kwargs) -> str:
        """
        Render the comment in its plain form.

        All keyword arguments are accepted and ignored.

        Returns
        -------
        str
            ``"COMMENT "`` followed by :attr:`text`.  The separating space
            is emitted even when the text is empty.
        """
        return f"COMMENT {self.text}"





@dataclass
class KvLine(KvnLine):
    """
    A standard ``KEY = value [unit]`` line.

    Attributes
    ----------
    key : str
        The KVN keyword (e.g. ``"EPOCH"``, ``"OBJECT_NAME"``).
    value : str
        The scalar value string, stripped of surrounding whitespace and units.
    unit : str
        Unit string extracted from the trailing ``[...]``, or empty if absent.
    """

    key: str = ""
    value: str = ""
    unit: str = ""

    def to_str(self, key_width: int = 24, **kwargs) -> str:
        """
        Render the line as ``KEY = value [unit]``.

        Parameters
        ----------
        key_width : int
            Minimum width of the key column.  The key is left-justified and
            space-padded to this width; longer keys are not truncated.
        **kwargs
            Accepted and ignored.

        Returns
        -------
        str
            The padded key, ``" = "``, the value and, only when
            :attr:`unit` is non-empty, a trailing ``" [unit]"``.
        """
        unit_str = f" [{self.unit}]" if self.unit else ""
        return f"{self.key:<{key_width}} = {self.value}{unit_str}"





@dataclass
class TdmObsLine(KvnLine):
    """
    A TDM observation line: ``KEY = EPOCH  value``.

    TDM data lines carry an epoch and a numeric value in the value field,
    separated by whitespace, rather than a single scalar.

    Attributes
    ----------
    key : str
        The observation keyword (e.g. ``"TRANSMIT_FREQ_1"``).
    epoch : str
        The epoch token (e.g. ``"2007-075T11:50:43.000"``).
    value : str
        The numeric observation value as a string.
    unit : str
        Unit string, or empty if absent.
    """

    key: str = ""
    epoch: str = ""
    value: str = ""
    unit: str = ""

    def to_str(self, key_width: int = 24, **kwargs) -> str:
        """
        Render the line as ``KEY = EPOCH  value [unit]``.

        Parameters
        ----------
        key_width : int
            Minimum width of the key column.  The key is left-justified and
            space-padded to this width; longer keys are not truncated.
        **kwargs
            Accepted and ignored.

        Returns
        -------
        str
            The padded key, ``" = "``, the epoch, two spaces, the value and,
            only when :attr:`unit` is non-empty, a trailing ``" [unit]"``.
        """
        unit_str = f" [{self.unit}]" if self.unit else ""
        return f"{self.key:<{key_width}} = {self.epoch}  {self.value}{unit_str}"





@dataclass
class PackedDataLine(KvnLine):
    """
    A space-separated data row whose first token is an epoch.

    Used for OEM state vectors and AEM attitude states, where an entire
    record is encoded on a single line with no explicit keys:
    ``EPOCH  x  y  z  x_dot  y_dot  z_dot``

    Attributes
    ----------
    epoch : str
        The epoch string.  :func:`tokenize` sets it to the first token, so
        it is also available as ``tokens[0]`` for lines produced there.
    tokens : list[str]
        All whitespace-separated tokens on the line (epoch + numeric values).
    """

    epoch: str = ""
    tokens: list[str] = field(default_factory=list)

    def to_str(self, **kwargs) -> str:
        """
        Render the row from :attr:`tokens`.

        Only :attr:`tokens` is used; :attr:`epoch` is not consulted, so the
        epoch appears in the output only if it is present in ``tokens``.
        All keyword arguments are accepted and ignored.

        Returns
        -------
        str
            The tokens joined by two spaces.
        """
        return "  ".join(self.tokens)





@dataclass
class CovarianceRowLine(KvnLine):
    """
    A space-separated row of plain numbers inside a covariance block.

    OEM covariance matrix rows contain only numeric tokens with no epoch
    and no key.  Each row represents one row of the lower-triangular matrix:
    ``v11``, ``v21  v22``, ``v31  v32  v33``, …

    Attributes
    ----------
    tokens : list[str]
        The numeric tokens on this row.
    """

    tokens: list[str] = field(default_factory=list)

    def to_str(self, **kwargs) -> str:
        """
        Render the row from :attr:`tokens`.

        All keyword arguments are accepted and ignored.

        Returns
        -------
        str
            The tokens joined by two spaces.
        """
        return "  ".join(self.tokens)




def _is_epoch(s: str) -> bool:
    """
    Tell whether ``s`` has the leading shape of a CCSDS epoch.

    Only the first five characters are inspected: four digits (the year)
    followed directly by ``"-"``.  Both ``"2007-075T16:50:01"`` and
    ``"2020-12-29T11:59:56"`` qualify; the rest of the string is not
    validated.
    """
    if len(s) < 5:
        # Too short to hold "YYYY-".
        return False
    year, separator = s[:4], s[4]
    return year.isdigit() and separator == "-"



def tokenize(kvn_source: str) -> list[KvnLine]:
    """
    Convert a raw KVN string into an ordered list of :class:`KvnLine` objects.

    Each input line is classified and parsed into the appropriate subclass.
    The rules applied in order are:

    1. Strip surrounding whitespace.  Empty result → :class:`BlankLine`.
    2. Line is in :data:`_SECTION_MARKERS` → :class:`SectionMarkerLine`.
    3. Line is the ``"COMMENT"`` keyword, i.e. ``"COMMENT"`` followed by end
       of line, whitespace or ``"="`` → :class:`CommentLine`.
       A leading ``"="`` after ``"COMMENT"`` is stripped (handles the
       ``COMMENT = text`` variant).  A longer keyword that merely begins
       with ``COMMENT`` is not a comment and falls through to the rules
       below.
    4. Line contains ``"="``:

       a. Split on the first ``"="`` into *key* and *rest*.
       b. Extract a trailing ``[unit]`` from *rest* if present.
       c. Split remaining *rest* on whitespace.  Two tokens where the first
          looks like an epoch → :class:`TdmObsLine`.  Otherwise → :class:`KvLine`.

    5. No ``"="`` — split on whitespace:

       a. First token looks like an epoch → :class:`PackedDataLine`.
       b. Otherwise → :class:`CovarianceRowLine`.

    Blank lines that appear *before* the first ``CCSDS_`` header line are
    dropped, and a single leading byte-order mark (U+FEFF) is removed from
    the source, so that files with a leading blank or BOM are handled
    cleanly.

    Parameters
    ----------
    kvn_source : str
        Raw KVN text (Windows or Unix line endings accepted).

    Returns
    -------
    list[KvnLine]
        Ordered list of classified line objects.
    """
    result: list[KvnLine] = []
    header_seen = False

    # A byte-order mark survives decoding as U+FEFF, which str.strip() does
    # not treat as whitespace.  Left in place it would hide the CCSDS_ header
    # (so blank lines would be dropped for the whole file) and end up inside
    # the first key.
    if kvn_source.startswith("\ufeff"):
        kvn_source = kvn_source[1:]

    for raw_line in kvn_source.splitlines():
        line = raw_line.strip()

        # Drop blank lines before the CCSDS_ header
        if not header_seen:
            if not line:
                continue
            if line.startswith("CCSDS_"):
                header_seen = True

        # --- BlankLine ---
        if not line:
            result.append(BlankLine())
            continue

        # --- SectionMarkerLine ---
        if line in _SECTION_MARKERS:
            result.append(SectionMarkerLine(key=line))
            continue

        # --- CommentLine ---
        if line.startswith("COMMENT"):
            remainder = line[7:]
            # Require a keyword boundary so that a key which only begins with
            # "COMMENT" (e.g. "COMMENT_X = 1") is not swallowed as a comment.
            if not remainder or remainder[0].isspace() or remainder[0] == "=":
                text = remainder.strip()
                if text.startswith("="):
                    text = text[1:].strip()
                result.append(CommentLine(text=text))
                continue

        # --- KvLine or TdmObsLine (line contains "=") ---
        if "=" in line:
            key, rest = line.split("=", maxsplit=1)
            key = key.strip()
            rest = rest.strip()

            # Extract trailing [unit] if present
            unit = ""
            if rest.endswith("]"):
                bracket = rest.rfind("[")
                if bracket >= 0:
                    unit = rest[bracket + 1 : -1].strip()
                    rest = rest[:bracket].strip()

            value_tokens = rest.split()
            if len(value_tokens) == 2 and _is_epoch(value_tokens[0]):
                result.append(
                    TdmObsLine(
                        key=key,
                        epoch=value_tokens[0],
                        value=value_tokens[1],
                        unit=unit,
                    )
                )
            else:
                result.append(KvLine(key=key, value=rest, unit=unit))
            continue

        # --- PackedDataLine or CovarianceRowLine (no "=") ---
        tokens = line.split()
        if tokens and _is_epoch(tokens[0]):
            result.append(PackedDataLine(epoch=tokens[0], tokens=tokens))
        else:
            result.append(CovarianceRowLine(tokens=tokens))

    return result