Module bases.alphabet.range_alphabet

Alphabets implicitly specified by Unicode codepoint range.

Expand source code
"""
    Alphabets implicitly specified by Unicode codepoint range.
"""

from typing import Any, Iterator, Mapping, overload, Union

from .abstract import Alphabet
from .string_alphabet import StringAlphabet

class RangeAlphabet(Alphabet):
    """
        Class for alphabets implicitly specified by a range of Unicode codepoints
        and optional case sensitivity (default: case-sensitive).

        The character at index `i` of the alphabet is `chr(codepoints[i])`.
        The range may have a step other than 1, e.g. when it is obtained by
        slicing another range alphabet.

        Example usage:

        ```py
        >>> from bases.alphabet import RangeAlphabet
        >>> RangeAlphabet(range(0x00, 0x100))
        RangeAlphabet(range(0x0, 0x100))
        ```
    """

    _codepoints: range
    _revdir: Mapping[str, int]
    _case_sensitive: bool

    def __init__(self, codepoints: range, *,
                 case_sensitive: bool = True):
        super().__init__(case_sensitive)
        self._codepoints = codepoints
        # Lazy reverse mapping: computes indices from the range on demand
        # instead of materialising a dictionary of all characters.
        self._revdir = _RangeAlphabetRevdir(self)
        self.__validate_init()

    def __validate_init(self) -> None:
        """
            Validates the codepoint range passed to the constructor.

            Raises `ValueError` if the range has fewer than two codepoints,
            if it contains codepoints that `chr` cannot convert, or if the
            alphabet is case-insensitive and contains both the uppercase and
            the lowercase version of some character.
        """
        codepoints = self._codepoints
        if len(codepoints) <= 1:
            raise ValueError("Alphabet must have at least two characters.")
        # A range is monotonic, so checking its two endpoints is enough.
        first, last = codepoints[0], codepoints[-1]
        if min(first, last) < 0 or max(first, last) > 0x10FFFF:
            raise ValueError("Codepoints must be in range(0x0, 0x110000).")
        if self.case_sensitive:
            return
        for i in codepoints:
            c = chr(i)
            upper, lower = c.upper(), c.lower()
            if upper == lower:
                # Uncased character (digit, punctuation, ...): no clash possible.
                continue
            if len(upper) != 1 or len(lower) != 1:
                # Case mapping expands to several characters (e.g. 'ß' -> 'SS'),
                # so it cannot name a single character of the alphabet.
                continue
            # Membership of an int in a range is O(1) and honours the step.
            if ord(upper) in codepoints and ord(lower) in codepoints:
                raise ValueError("Alphabet contains lowercase and uppercase versions of the same character, "
                                 "encoding must be case-sensitive.")

    @property
    def codepoints(self) -> range:
        """
            The codepoint range that defines this alphabet.

            Example usage:

            ```py
            >>> RangeAlphabet(range(0x00, 0x100)).codepoints
            range(0, 256)
            ```
        """
        return self._codepoints

    # No docstring on the members below, so that documentation tools can fall
    # back to the documentation of the corresponding base class members.

    @property
    def revdir(self) -> Mapping[str, int]:
        return self._revdir

    def __len__(self) -> int:
        return len(self._codepoints)

    @overload
    def __getitem__(self, idx: int) -> str:
        ...

    @overload
    def __getitem__(self, idx: slice) -> "RangeAlphabet":
        ...

    def __getitem__(self, idx: Union[int, slice]) -> Union[str, "RangeAlphabet"]:
        if isinstance(idx, slice):
            # Slicing a range yields a range, so the result is again a range
            # alphabet (validated by the constructor, e.g. for minimum length).
            new_codepoints = self._codepoints[idx]
            return RangeAlphabet(new_codepoints, case_sensitive=self.case_sensitive)
        return chr(self._codepoints[idx])

    def with_case_sensitivity(self, case_sensitive: bool) -> "RangeAlphabet":
        if case_sensitive == self.case_sensitive:
            return self
        return RangeAlphabet(self.codepoints, case_sensitive=case_sensitive)

    def as_string_alphabet(self) -> StringAlphabet:
        """
            Converts this alphabet into a string alphabet explicitly defined
            by the string containing all characters in the codepoint range.

            Example usage:

            ```py
            >>> RangeAlphabet(range(0x20, 0x7E)).as_string_alphabet()
            StringAlphabet(' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMN
                            OPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}')
            ```
        """
        chars = "".join(self)
        return StringAlphabet(chars, case_sensitive=self.case_sensitive)

    def upper(self) -> StringAlphabet:
        # The uppercased characters need not form a codepoint range,
        # hence the result is an explicit string alphabet.
        chars = "".join(self).upper()
        return StringAlphabet(chars, case_sensitive=self.case_sensitive)

    def lower(self) -> StringAlphabet:
        # The lowercased characters need not form a codepoint range,
        # hence the result is an explicit string alphabet.
        chars = "".join(self).lower()
        return StringAlphabet(chars, case_sensitive=self.case_sensitive)

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, RangeAlphabet):
            return NotImplemented
        return self.codepoints == other.codepoints and self.case_sensitive == other.case_sensitive

    def __hash__(self) -> int:
        return hash((type(self), self.codepoints, self.case_sensitive))

    def __repr__(self) -> str:
        codepoints = self.codepoints
        range_args = f"{hex(codepoints.start)}, {hex(codepoints.stop)}"
        if codepoints.step != 1:
            # Without the step, the repr of a stepped/reversed alphabet
            # would describe a different alphabet.
            range_args = f"{range_args}, {codepoints.step}"
        codepoints_str = f"range({range_args})"
        if self.case_sensitive:
            return f"RangeAlphabet({codepoints_str})"
        case_sensitive_str = f"case_sensitive={self.case_sensitive}"
        return f"RangeAlphabet({codepoints_str}, {case_sensitive_str})"

class _RangeAlphabetRevdir(Mapping[str, int]):
    """
        Read-only mapping from the characters of a range alphabet to their
        index in the alphabet, computed on demand from the codepoint range.

        If the alphabet is case-insensitive, a character is also looked up
        through its uppercase and lowercase versions.
    """

    _alphabet: "RangeAlphabet"

    def __init__(self, alphabet: "RangeAlphabet"):
        self._alphabet = alphabet

    def __iter__(self) -> Iterator[str]:
        return iter(self._alphabet)

    def __len__(self) -> int:
        return len(self._alphabet)

    def __contains__(self, char: Any) -> bool:
        # Delegating to __getitem__ keeps membership and lookup consistent.
        try:
            self[char]
        except KeyError:
            return False
        return True

    def __getitem__(self, char: str) -> int:
        """
            Returns the index of the given character in the alphabet.

            Raises `KeyError` if the argument is not a character of the alphabet
            (including the case where it is not a single-character string).
        """
        if isinstance(char, str):
            alphabet = self._alphabet
            codepoints = alphabet.codepoints
            if alphabet.case_sensitive:
                variants = (char,)
            else:
                # The exact character takes precedence over its case variants.
                variants = (char, char.upper(), char.lower())
            for variant in variants:
                # Case mappings can expand to several characters (e.g. 'ß' -> 'SS')
                # and the key itself may be empty or longer than one character:
                # ord() is only defined on single characters.
                if len(variant) != 1:
                    continue
                codepoint = ord(variant)
                if codepoint in codepoints:
                    # range.index honours the step of the range, unlike
                    # subtracting the start (wrong for stepped/reversed ranges).
                    return codepoints.index(codepoint)
        raise KeyError(f"Character {repr(char)} not in alphabet.")

Classes

class RangeAlphabet (codepoints: range, *, case_sensitive: bool = True)

Class for alphabets implicitly specified by a range of Unicode codepoints and optional case sensitivity (default: case-sensitive).

Example usage:

>>> from bases.alphabet import RangeAlphabet
>>> RangeAlphabet(range(0x00, 0x100))
RangeAlphabet(range(0x0, 0x100))
Expand source code
class RangeAlphabet(Alphabet):
    """
        Class for alphabets implicitly specified by a range of Unicode codepoints
        and optional case sensitivity (default: case-sensitive).

        The character at index `i` of the alphabet is `chr(codepoints[i])`.
        The range may have a step other than 1, e.g. when it is obtained by
        slicing another range alphabet.

        Example usage:

        ```py
        >>> from bases.alphabet import RangeAlphabet
        >>> RangeAlphabet(range(0x00, 0x100))
        RangeAlphabet(range(0x0, 0x100))
        ```
    """

    _codepoints: range
    _revdir: Mapping[str, int]
    _case_sensitive: bool

    def __init__(self, codepoints: range, *,
                 case_sensitive: bool = True):
        super().__init__(case_sensitive)
        self._codepoints = codepoints
        # Lazy reverse mapping: computes indices from the range on demand
        # instead of materialising a dictionary of all characters.
        self._revdir = _RangeAlphabetRevdir(self)
        self.__validate_init()

    def __validate_init(self) -> None:
        """
            Validates the codepoint range passed to the constructor.

            Raises `ValueError` if the range has fewer than two codepoints,
            if it contains codepoints that `chr` cannot convert, or if the
            alphabet is case-insensitive and contains both the uppercase and
            the lowercase version of some character.
        """
        codepoints = self._codepoints
        if len(codepoints) <= 1:
            raise ValueError("Alphabet must have at least two characters.")
        # A range is monotonic, so checking its two endpoints is enough.
        first, last = codepoints[0], codepoints[-1]
        if min(first, last) < 0 or max(first, last) > 0x10FFFF:
            raise ValueError("Codepoints must be in range(0x0, 0x110000).")
        if self.case_sensitive:
            return
        for i in codepoints:
            c = chr(i)
            upper, lower = c.upper(), c.lower()
            if upper == lower:
                # Uncased character (digit, punctuation, ...): no clash possible.
                continue
            if len(upper) != 1 or len(lower) != 1:
                # Case mapping expands to several characters (e.g. 'ß' -> 'SS'),
                # so it cannot name a single character of the alphabet.
                continue
            # Membership of an int in a range is O(1) and honours the step.
            if ord(upper) in codepoints and ord(lower) in codepoints:
                raise ValueError("Alphabet contains lowercase and uppercase versions of the same character, "
                                 "encoding must be case-sensitive.")

    @property
    def codepoints(self) -> range:
        """
            The codepoint range that defines this alphabet.

            Example usage:

            ```py
            >>> RangeAlphabet(range(0x00, 0x100)).codepoints
            range(0, 256)
            ```
        """
        return self._codepoints

    # No docstring on the members below, so that documentation tools can fall
    # back to the documentation of the corresponding base class members.

    @property
    def revdir(self) -> Mapping[str, int]:
        return self._revdir

    def __len__(self) -> int:
        return len(self._codepoints)

    @overload
    def __getitem__(self, idx: int) -> str:
        ...

    @overload
    def __getitem__(self, idx: slice) -> "RangeAlphabet":
        ...

    def __getitem__(self, idx: Union[int, slice]) -> Union[str, "RangeAlphabet"]:
        if isinstance(idx, slice):
            # Slicing a range yields a range, so the result is again a range
            # alphabet (validated by the constructor, e.g. for minimum length).
            new_codepoints = self._codepoints[idx]
            return RangeAlphabet(new_codepoints, case_sensitive=self.case_sensitive)
        return chr(self._codepoints[idx])

    def with_case_sensitivity(self, case_sensitive: bool) -> "RangeAlphabet":
        if case_sensitive == self.case_sensitive:
            return self
        return RangeAlphabet(self.codepoints, case_sensitive=case_sensitive)

    def as_string_alphabet(self) -> StringAlphabet:
        """
            Converts this alphabet into a string alphabet explicitly defined
            by the string containing all characters in the codepoint range.

            Example usage:

            ```py
            >>> RangeAlphabet(range(0x20, 0x7E)).as_string_alphabet()
            StringAlphabet(' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMN
                            OPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}')
            ```
        """
        chars = "".join(self)
        return StringAlphabet(chars, case_sensitive=self.case_sensitive)

    def upper(self) -> StringAlphabet:
        # The uppercased characters need not form a codepoint range,
        # hence the result is an explicit string alphabet.
        chars = "".join(self).upper()
        return StringAlphabet(chars, case_sensitive=self.case_sensitive)

    def lower(self) -> StringAlphabet:
        # The lowercased characters need not form a codepoint range,
        # hence the result is an explicit string alphabet.
        chars = "".join(self).lower()
        return StringAlphabet(chars, case_sensitive=self.case_sensitive)

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, RangeAlphabet):
            return NotImplemented
        return self.codepoints == other.codepoints and self.case_sensitive == other.case_sensitive

    def __hash__(self) -> int:
        return hash((type(self), self.codepoints, self.case_sensitive))

    def __repr__(self) -> str:
        codepoints = self.codepoints
        range_args = f"{hex(codepoints.start)}, {hex(codepoints.stop)}"
        if codepoints.step != 1:
            # Without the step, the repr of a stepped/reversed alphabet
            # would describe a different alphabet.
            range_args = f"{range_args}, {codepoints.step}"
        codepoints_str = f"range({range_args})"
        if self.case_sensitive:
            return f"RangeAlphabet({codepoints_str})"
        case_sensitive_str = f"case_sensitive={self.case_sensitive}"
        return f"RangeAlphabet({codepoints_str}, {case_sensitive_str})"

Ancestors

  • Alphabet
  • abc.ABC
  • collections.abc.Sequence
  • collections.abc.Reversible
  • collections.abc.Collection
  • collections.abc.Sized
  • collections.abc.Iterable
  • collections.abc.Container
  • typing.Generic

Instance variables

var codepoints : range

The codepoint range that defines this alphabet.

Example usage:

>>> RangeAlphabet(range(0x00, 0x100)).codepoints
range(0, 256)
Expand source code
@property
def codepoints(self) -> range:
    """
        Range of Unicode codepoints from which this alphabet is built,
        exactly as stored on the instance.

        Example usage:

        ```py
        >>> RangeAlphabet(range(0x00, 0x100)).codepoints
        range(0, 256)
        ```
    """
    codepoint_range = self._codepoints
    return codepoint_range

Methods

def as_string_alphabet(self) ‑> StringAlphabet

Converts this alphabet into a string alphabet explicitly defined by the string containing all characters in the codepoint range.

Example usage:

>>> RangeAlphabet(range(0x20, 0x7E)).as_string_alphabet()
StringAlphabet(' !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMN
                OPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}')
Expand source code
def as_string_alphabet(self) -> StringAlphabet:
    """
        Builds the explicit counterpart of this alphabet: a string alphabet
        whose defining string is the concatenation of all characters of this
        alphabet, in order, with the same case sensitivity.

        Example usage:

        ```py
        >>> RangeAlphabet(range(0x20, 0x7E)).as_string_alphabet()
        StringAlphabet(' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMN
                        OPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}')
        ```
    """
    # Iterating over the alphabet yields its characters one by one.
    return StringAlphabet("".join(self), case_sensitive=self.case_sensitive)

Inherited members