From 4f5b1beaf619b916390b15377f09132b08ffec2a Mon Sep 17 00:00:00 2001 From: rocky Date: Tue, 29 Jun 2021 19:50:25 -0400 Subject: [PATCH 1/7] WIP - start splitting up strings. --- mathics/builtin/__init__.py | 2 +- mathics/builtin/string/__init__.py | 6 + mathics/builtin/string/characters.py | 213 +++++ mathics/builtin/string/operations.py | 781 +++++++++++++++++++ mathics/builtin/string/patterns.py | 119 +++ mathics/builtin/strings.py | 1068 +------------------------- 6 files changed, 1125 insertions(+), 1064 deletions(-) create mode 100644 mathics/builtin/string/__init__.py create mode 100644 mathics/builtin/string/characters.py create mode 100644 mathics/builtin/string/operations.py create mode 100644 mathics/builtin/string/patterns.py diff --git a/mathics/builtin/__init__.py b/mathics/builtin/__init__.py index bf5b0cb38..dd271b8f5 100755 --- a/mathics/builtin/__init__.py +++ b/mathics/builtin/__init__.py @@ -153,7 +153,7 @@ def is_builtin(var): [] if ENABLE_FILES_MODULE else ["files_io.files", "files_io.importexport"] ) -for subdir in ("colors", "drawing", "files_io", "numbers", "specialfns", "fileformats"): +for subdir in ("colors", "drawing", "files_io", "numbers", "specialfns", "string", "fileformats"): import_name = f"{__name__}.{subdir}" if import_name in disable_file_module_names: diff --git a/mathics/builtin/string/__init__.py b/mathics/builtin/string/__init__.py new file mode 100644 index 000000000..9b776f908 --- /dev/null +++ b/mathics/builtin/string/__init__.py @@ -0,0 +1,6 @@ +""" +Strings and Characters + +""" + +from mathics.version import __version__ # noqa used in loading to check consistency. diff --git a/mathics/builtin/string/characters.py b/mathics/builtin/string/characters.py new file mode 100644 index 000000000..78fae72aa --- /dev/null +++ b/mathics/builtin/string/characters.py @@ -0,0 +1,213 @@ +# -*- coding: utf-8 -*- +""" +Characters in Strings +""" + +from mathics.version import __version__ # noqa used in loading to check consistency. 
+ +from mathics.builtin.base import Builtin, Test + +from mathics.core.expression import ( + Expression, + String, + SymbolList, +) + + +class Characters(Builtin): + """ +
+
'Characters["$string$"]' +
returns a list of the characters in $string$. +
+ + >> Characters["abc"] + = {a, b, c} + + #> \\.78\\.79\\.7A + = xyz + + #> \\:0078\\:0079\\:007A + = xyz + + #> \\101\\102\\103\\061\\062\\063 + = ABC123 + + #> \\[Alpha]\\[Beta]\\[Gamma] + = \u03B1\u03B2\u03B3 + """ + + attributes = ("Listable",) + + def apply(self, string, evaluation): + "Characters[string_String]" + + return Expression(SymbolList, *(String(c) for c in string.value)) + + +class CharacterRange(Builtin): + """ +
+
'CharacterRange["$a$", "$b$"]' +
returns a list of the Unicode characters from $a$ to $b$ + inclusive. +
+ + >> CharacterRange["a", "e"] + = {a, b, c, d, e} + >> CharacterRange["b", "a"] + = {} + """ + + attributes = ("ReadProtected",) + + messages = { + "argtype": "Arguments `1` and `2` are not both strings of length 1.", + } + + def apply(self, start, stop, evaluation): + "CharacterRange[start_String, stop_String]" + + if len(start.value) != 1 or len(stop.value) != 1: + evaluation.message("CharacterRange", "argtype", start, stop) + return + start = ord(start.value[0]) + stop = ord(stop.value[0]) + return Expression( + "List", *[String(chr(code)) for code in range(start, stop + 1)] + ) + + +class DigitQ(Builtin): + """ +
+
'DigitQ[$string$]' + yields 'True' if all the characters in the $string$ are digits, and yields 'False' otherwise. +
+ + >> DigitQ["9"] + = True + + >> DigitQ["a"] + = False + + >> DigitQ["01001101011000010111010001101000011010010110001101110011"] + = True + + >> DigitQ["-123456789"] + = False + + """ + + rules = { + "DigitQ[string_]": ( + "If[StringQ[string], StringMatchQ[string, DigitCharacter...], False, False]" + ), + } + + +class LetterQ(Builtin): + """ +
+
'LetterQ[$string$]' + yields 'True' if all the characters in the $string$ are letters, and yields 'False' otherwise. +
+ + >> LetterQ["m"] + = True + + >> LetterQ["9"] + = False + + >> LetterQ["Mathics"] + = True + + >> LetterQ["Welcome to Mathics"] + = False + + #> LetterQ[""] + = True + + #> LetterQ["\\[Alpha]\\[Beta]\\[Gamma]\\[Delta]\\[Epsilon]\\[Zeta]\\[Eta]\\[Theta]"] + = True + """ + + rules = { + "LetterQ[string_]": ( + "If[StringQ[string], StringMatchQ[string, LetterCharacter...], False, False]" + ), + } + + +class LowerCaseQ(Test): + """ +
+
'LowerCaseQ[$s$]' +
returns True if $s$ consists wholly of lower case characters. +
+ + >> LowerCaseQ["abc"] + = True + + An empty string returns True. + >> LowerCaseQ[""] + = True + """ + + def test(self, s): + return isinstance(s, String) and all(c.islower() for c in s.get_string_value()) + + +class ToLowerCase(Builtin): + """ +
+
'ToLowerCase[$s$]' +
returns $s$ in all lower case. +
+ + >> ToLowerCase["New York"] + = new york + """ + + attributes = ("Listable", "Protected") + + def apply(self, s, evaluation): + "ToLowerCase[s_String]" + return String(s.get_string_value().lower()) + + +class ToUpperCase(Builtin): + """ +
+
'ToUpperCase[$s$]' +
returns $s$ in all upper case. +
+ + >> ToUpperCase["New York"] + = NEW YORK + """ + + attributes = ("Listable", "Protected") + + def apply(self, s, evaluation): + "ToUpperCase[s_String]" + return String(s.get_string_value().upper()) + + +class UpperCaseQ(Test): + """ +
+
'UpperCaseQ[$s$]' +
returns True if $s$ consists wholly of upper case characters. +
+ + >> UpperCaseQ["ABC"] + = True + + An empty string returns True. + >> UpperCaseQ[""] + = True + """ + + def test(self, s): + return isinstance(s, String) and all(c.isupper() for c in s.get_string_value()) diff --git a/mathics/builtin/string/operations.py b/mathics/builtin/string/operations.py new file mode 100644 index 000000000..9a68b2bd9 --- /dev/null +++ b/mathics/builtin/string/operations.py @@ -0,0 +1,781 @@ +# -*- coding: utf-8 -*- + +""" +Operations on Strings +""" + +import re +from sys import version_info +from binascii import hexlify, unhexlify +from heapq import heappush, heappop + +from mathics.version import __version__ # noqa used in loading to check consistency. + +from mathics.builtin.base import ( + BinaryOperator, + Builtin, +) +from mathics.core.expression import ( + Expression, + Symbol, + SymbolFalse, + SymbolTrue, + SymbolList, + String, + Integer, + Integer1, + from_python, +) +from mathics.builtin.lists import python_seq, convert_seq +from mathics.builtin.strings import ( + _StringFind, + _decode_pname, + _encode_pname, + _evaluate_match, + _parallel_match, + to_regex, +) + + +class StringDrop(Builtin): + """ +
+
'StringDrop["$string$", $n$]' +
gives $string$ with the first $n$ characters dropped. +
'StringDrop["$string$", -$n$]' +
gives $string$ with the last $n$ characters dropped. +
'StringDrop["$string$", {$n$}]' +
gives $string$ with the $n$th character dropped. +
'StringDrop["$string$", {$m$, $n$}]' +
gives $string$ with the characters $m$ through $n$ dropped. +
+ + >> StringDrop["abcde", 2] + = cde + >> StringDrop["abcde", -2] + = abc + >> StringDrop["abcde", {2}] + = acde + >> StringDrop["abcde", {2,3}] + = ade + >> StringDrop["abcd",{3,2}] + = abcd + >> StringDrop["abcd",0] + = abcd + """ + + messages = { + "strse": "String expected at position 1.", + "mseqs": "Integer or list of two Integers are expected at position 2.", + "drop": 'Cannot drop positions `1` through `2` in "`3`".', + } + + def apply_with_n(self, string, n, evaluation): + "StringDrop[string_,n_Integer]" + if not isinstance(string, String): + return evaluation.message("StringDrop", "strse") + if isinstance(n, Integer): + pos = n.value + if pos > len(string.get_string_value()): + return evaluation.message("StringDrop", "drop", 1, pos, string) + if pos < -len(string.get_string_value()): + return evaluation.message("StringDrop", "drop", pos, -1, string) + if pos > 0: + return String(string.get_string_value()[pos:]) + if pos < 0: + return String(string.get_string_value()[:(pos)]) + if pos == 0: + return string + return evaluation.message("StringDrop", "mseqs") + + def apply_with_ni_nf(self, string, ni, nf, evaluation): + "StringDrop[string_,{ni_Integer,nf_Integer}]" + if not isinstance(string, String): + return evaluation.message("StringDrop", "strse", string) + + if ni.value == 0 or nf.value == 0: + return evaluation.message("StringDrop", "drop", ni, nf) + fullstring = string.get_string_value() + lenfullstring = len(fullstring) + posi = ni.value + if posi < 0: + posi = lenfullstring + posi + 1 + posf = nf.value + if posf < 0: + posf = lenfullstring + posf + 1 + if posf > lenfullstring or posi > lenfullstring or posf <= 0 or posi <= 0: + # positions out or range + return evaluation.message("StringDrop", "drop", ni, nf, fullstring) + if posf < posi: + return string # this is what actually mma does + return String(fullstring[: (posi - 1)] + fullstring[posf:]) + + def apply_with_ni(self, string, ni, evaluation): + "StringDrop[string_,{ni_Integer}]" + if not 
isinstance(string, String): + return evaluation.message("StringDrop", "strse", string) + if ni.value == 0: + return evaluation.message("StringDrop", "drop", ni, ni) + fullstring = string.get_string_value() + lenfullstring = len(fullstring) + posi = ni.value + if posi < 0: + posi = lenfullstring + posi + 1 + if posi > lenfullstring or posi <= 0: + return evaluation.message("StringDrop", "drop", ni, ni, fullstring) + return String(fullstring[: (posi - 1)] + fullstring[posi:]) + + def apply(self, string, something, evaluation): + "StringDrop[string_,something___]" + if not isinstance(string, String): + return evaluation.message("StringDrop", "strse") + return evaluation.message("StringDrop", "mseqs") + + +class StringInsert(Builtin): + """ +
+
'StringInsert["$string$", "$snew$", $n$]' +
yields a string with $snew$ inserted starting at position $n$ in $string$. + +
'StringInsert["$string$", "$snew$", -$n$]' +
inserts $snew$ at position $n$ from the end of "$string$". + +
'StringInsert["$string$", "$snew$", {$n_1$, $n_2$, ...}]' +
inserts a copy of $snew$ at each position $n_i$ in $string$; + the $n_i$ are taken before any insertion is done. + +
'StringInsert[{$s_1$, $s_2$, ...}, "$snew$", $n$]' +
gives the list of results for each of the $s_i$. +
+ + >> StringInsert["noting", "h", 4] + = nothing + + #> StringInsert["abcdefghijklm", "X", 15] + : Cannot insert at position 15 in abcdefghijklm. + = StringInsert[abcdefghijklm, X, 15] + + #> StringInsert[abcdefghijklm, "X", 4] + : String or list of strings expected at position 1 in StringInsert[abcdefghijklm, X, 4]. + = StringInsert[abcdefghijklm, X, 4] + + #> StringInsert["abcdefghijklm", X, 4] + : String expected at position 2 in StringInsert[abcdefghijklm, X, 4]. + = StringInsert[abcdefghijklm, X, 4] + + #> StringInsert["abcdefghijklm", "X", a] + : Position specification a in StringInsert[abcdefghijklm, X, a] is not a machine-sized integer or a list of machine-sized integers. + = StringInsert[abcdefghijklm, X, a] + + #> StringInsert["abcdefghijklm", "X", 0] + : Cannot insert at position 0 in abcdefghijklm. + = StringInsert[abcdefghijklm, X, 0] + + >> StringInsert["note", "d", -1] + = noted + + >> StringInsert["here", "t", -5] + = there + + #> StringInsert["abcdefghijklm", "X", -15] + : Cannot insert at position -15 in abcdefghijklm. + = StringInsert[abcdefghijklm, X, -15] + + >> StringInsert["adac", "he", {1, 5}] + = headache + + #> StringInsert["abcdefghijklm", "X", {1, -1, 14, -14}] + = XXabcdefghijklmXX + + #> StringInsert["abcdefghijklm", "X", {1, 0}] + : Cannot insert at position 0 in abcdefghijklm. + = StringInsert[abcdefghijklm, X, {1, 0}] + + #> StringInsert["", "X", {1}] + = X + + #> StringInsert["", "X", {1, -1}] + = XX + + #> StringInsert["", "", {1}] + = #<--# + + #> StringInsert["", "X", {1, 2}] + : Cannot insert at position 2 in . + = StringInsert[, X, {1, 2}] + + #> StringInsert["abcdefghijklm", "", {1, 2, 3, 4 ,5, -6}] + = abcdefghijklm + + #> StringInsert["abcdefghijklm", "X", {}] + = abcdefghijklm + + >> StringInsert[{"something", "sometimes"}, " ", 5] + = {some thing, some times} + + #> StringInsert[{"abcdefghijklm", "Mathics"}, "X", 13] + : Cannot insert at position 13 in Mathics. 
+ = {abcdefghijklXm, StringInsert[Mathics, X, 13]} + + #> StringInsert[{"", ""}, "", {1, 1, 1, 1}] + = {, } + + #> StringInsert[{"abcdefghijklm", "Mathics"}, "X", {0, 2}] + : Cannot insert at position 0 in abcdefghijklm. + : Cannot insert at position 0 in Mathics. + = {StringInsert[abcdefghijklm, X, {0, 2}], StringInsert[Mathics, X, {0, 2}]} + + #> StringInsert[{"abcdefghijklm", Mathics}, "X", {1, 2}] + : String or list of strings expected at position 1 in StringInsert[{abcdefghijklm, Mathics}, X, {1, 2}]. + = StringInsert[{abcdefghijklm, Mathics}, X, {1, 2}] + + #> StringInsert[{"", "Mathics"}, "X", {1, 1, -1}] + = {XXX, XXMathicsX} + + >> StringInsert["1234567890123456", ".", Range[-16, -4, 3]] + = 1.234.567.890.123.456""" + + messages = { + "strse": "String or list of strings expected at position `1` in `2`.", + "string": "String expected at position `1` in `2`.", + "ins": "Cannot insert at position `1` in `2`.", + "psl": "Position specification `1` in `2` is not a machine-sized integer or a list of machine-sized integers.", + } + + def _insert(self, str, add, lpos, evaluation): + for pos in lpos: + if abs(pos) < 1 or abs(pos) > len(str) + 1: + evaluation.message("StringInsert", "ins", Integer(pos), String(str)) + return evaluation.format_output( + Expression( + "StringInsert", str, add, lpos[0] if len(lpos) == 1 else lpos + ) + ) + + # Create new list of position which are rearranged + pos_limit = len(str) + 2 + listpos = [p if p > 0 else pos_limit + p for p in lpos] + listpos.sort() + + result = "" + start = 0 + for pos in listpos: + stop = pos - 1 + result += str[start:stop] + add + start = stop + else: + result += str[start : len(str)] + + return result + + def apply(self, strsource, strnew, pos, evaluation): + "StringInsert[strsource_, strnew_, pos_]" + + exp = Expression("StringInsert", strsource, strnew, pos) + + py_strnew = strnew.get_string_value() + if py_strnew is None: + return evaluation.message("StringInsert", "string", Integer(2), exp) + + # Check 
and create list of position + listpos = [] + if pos.has_form("List", None): + leaves = pos.get_leaves() + if not leaves: + return strsource + else: + for i, posi in enumerate(leaves): + py_posi = posi.get_int_value() + if py_posi is None: + return evaluation.message("StringInsert", "psl", pos, exp) + listpos.append(py_posi) + else: + py_pos = pos.get_int_value() + if py_pos is None: + return evaluation.message("StringInsert", "psl", pos, exp) + listpos.append(py_pos) + + # Check and perform the insertion + if strsource.has_form("List", None): + py_strsource = [sub.get_string_value() for sub in strsource.leaves] + if any(sub is None for sub in py_strsource): + return evaluation.message("StringInsert", "strse", Integer1, exp) + return Expression( + "List", + *[ + String(self._insert(s, py_strnew, listpos, evaluation)) + for s in py_strsource + ] + ) + else: + py_strsource = strsource.get_string_value() + if py_strsource is None: + return evaluation.message("StringInsert", "strse", Integer1, exp) + return String(self._insert(py_strsource, py_strnew, listpos, evaluation)) + + +class StringJoin(BinaryOperator): + """ +
+
'StringJoin["$s1$", "$s2$", ...]' +
returns the concatenation of the strings $s1$, $s2$, ... +
+ + >> StringJoin["a", "b", "c"] + = abc + >> "a" <> "b" <> "c" // InputForm + = "abc" + + 'StringJoin' flattens lists out: + >> StringJoin[{"a", "b"}] // InputForm + = "ab" + >> Print[StringJoin[{"Hello", " ", {"world"}}, "!"]] + | Hello world! + """ + + operator = "<>" + precedence = 600 + attributes = ("Flat", "OneIdentity") + + def apply(self, items, evaluation): + "StringJoin[items___]" + + result = "" + items = items.flatten(SymbolList) + if items.get_head_name() == "System`List": + items = items.leaves + else: + items = items.get_sequence() + for item in items: + if not isinstance(item, String): + evaluation.message("StringJoin", "string") + return + result += item.value + return String(result) + + +class StringLength(Builtin): + """ +
+
'StringLength["$string$"]' +
gives the length of $string$. +
+ + >> StringLength["abc"] + = 3 + 'StringLength' is listable: + >> StringLength[{"a", "bc"}] + = {1, 2} + + >> StringLength[x] + : String expected. + = StringLength[x] + """ + + attributes = ("Listable",) + + def apply(self, str, evaluation): + "StringLength[str_]" + + if not isinstance(str, String): + evaluation.message("StringLength", "string") + return + return Integer(len(str.value)) + + +class StringPosition(Builtin): + """ +
+
'StringPosition["$string$", $patt$]' +
gives a list of starting and ending positions where $patt$ matches "$string$". +
'StringPosition["$string$", $patt$, $n$]' +
returns the first $n$ matches only. +
'StringPosition["$string$", {$patt1$, $patt2$, ...}, $n$]' +
matches multiple patterns. +
'StringPosition[{$s1$, $s2$, ...}, $patt$]' +
returns a list of matches for multiple strings. +
+ + >> StringPosition["123ABCxyABCzzzABCABC", "ABC"] + = {{4, 6}, {9, 11}, {15, 17}, {18, 20}} + + >> StringPosition["123ABCxyABCzzzABCABC", "ABC", 2] + = {{4, 6}, {9, 11}} + + 'StringPosition' can be useful for searching through text. + >> data = Import["ExampleData/EinsteinSzilLetter.txt"]; + >> StringPosition[data, "uranium"] + = {{299, 305}, {870, 876}, {1538, 1544}, {1671, 1677}, {2300, 2306}, {2784, 2790}, {3093, 3099}} + + #> StringPosition["123ABCxyABCzzzABCABC", "ABC", -1] + : Non-negative integer or Infinity expected at position 3 in StringPosition[123ABCxyABCzzzABCABC, ABC, -1]. + = StringPosition[123ABCxyABCzzzABCABC, ABC, -1] + + ## Overlaps + #> StringPosition["1231221312112332", RegularExpression["[12]+"]] + = {{1, 2}, {2, 2}, {4, 7}, {5, 7}, {6, 7}, {7, 7}, {9, 13}, {10, 13}, {11, 13}, {12, 13}, {13, 13}, {16, 16}} + #> StringPosition["1231221312112332", RegularExpression["[12]+"], Overlaps -> False] + = {{1, 2}, {4, 7}, {9, 13}, {16, 16}} + #> StringPosition["1231221312112332", RegularExpression["[12]+"], Overlaps -> x] + = {{1, 2}, {4, 7}, {9, 13}, {16, 16}} + #> StringPosition["1231221312112332", RegularExpression["[12]+"], Overlaps -> All] + : Overlaps -> All option is not currently implemented in Mathics. 
+ = {{1, 2}, {2, 2}, {4, 7}, {5, 7}, {6, 7}, {7, 7}, {9, 13}, {10, 13}, {11, 13}, {12, 13}, {13, 13}, {16, 16}} + + #> StringPosition["21211121122", {"121", "11"}] + = {{2, 4}, {4, 5}, {5, 6}, {6, 8}, {8, 9}} + #> StringPosition["21211121122", {"121", "11"}, Overlaps -> False] + = {{2, 4}, {5, 6}, {8, 9}} + + #> StringPosition[{"abc", "abcda"}, "a"] + = {{{1, 1}}, {{1, 1}, {5, 5}}} + + #> StringPosition[{"abc"}, "a", Infinity] + = {{{1, 1}}} + + #> StringPosition["abc"]["123AabcDEabc"] + = {{5, 7}, {10, 12}} + """ + + options = { + "IgnoreCase": "False", + "MetaCharacters": "None", + "Overlaps": "True", + } + + messages = { + "strse": "String or list of strings expected at position `1` in `2`.", + "overall": "Overlaps -> All option is not currently implemented in Mathics.", + "innf": "Non-negative integer or Infinity expected at position `2` in `1`.", + } + + rules = { + "StringPosition[patt_][s_]": "StringPosition[s, patt]", + } + + def apply(self, string, patt, evaluation, options): + "StringPosition[string_, patt_, OptionsPattern[StringPosition]]" + + return self.apply_n( + string, + patt, + Expression("DirectedInfinity", Integer1), + evaluation, + options, + ) + + def apply_n(self, string, patt, n, evaluation, options): + "StringPosition[string_, patt_, n:(_Integer|DirectedInfinity[1]), OptionsPattern[StringPosition]]" + + expr = Expression("StringPosition", string, patt, n) + + # check n + if n.has_form("DirectedInfinity", 1): + py_n = float("inf") + else: + py_n = n.get_int_value() + if py_n is None or py_n < 0: + return evaluation.message("StringPosition", "innf", expr, Integer(3)) + + # check options + if options["System`Overlaps"] == SymbolTrue: + overlap = True + elif options["System`Overlaps"] == SymbolFalse: + overlap = False + elif options["System`Overlaps"] == Symbol("All"): + # TODO + evaluation.message("StringPosition", "overall") + overlap = True + else: + overlap = False # unknown options are teated as False + + # convert patterns + if 
patt.has_form("List", None): + patts = patt.get_leaves() + else: + patts = [patt] + re_patts = [] + for p in patts: + py_p = to_regex(p, evaluation) + if py_p is None: + return evaluation.message("StringExpression", "invld", p, patt) + re_patts.append(py_p) + compiled_patts = [re.compile(re_patt) for re_patt in re_patts] + + # string or list of strings + if string.has_form("List", None): + py_strings = [s.get_string_value() for s in string.leaves] + if None in py_strings: + return + results = [ + self.do_apply(py_string, compiled_patts, py_n, overlap) + for py_string in py_strings + ] + return Expression(SymbolList, *results) + else: + py_string = string.get_string_value() + if py_string is None: + return + return self.do_apply(py_string, compiled_patts, py_n, overlap) + + @staticmethod + def do_apply(py_string, compiled_patts, py_n, overlap): + result = [] + start = 0 + while start < len(py_string): + found_match = False + for compiled_patt in compiled_patts: + m = compiled_patt.match(py_string, start) + if m is None: + continue + found_match = True + result.append([m.start() + 1, m.end()]) # 0 to 1 based indexing + if len(result) == py_n: + return from_python(result) + if not overlap: + start = m.end() + if overlap or not found_match: + start += 1 + return from_python(result) + + +class StringReplace(_StringFind): + """ +
+
'StringReplace["$string$", "$a$"->"$b$"]' +
replaces each occurrence of $a$ with $b$ in $string$. +
'StringReplace["$string$", {"$s1$"->"$sp1$", "$s2$"->"$sp2$"}]' +
performs multiple replacements of each $si$ by the + corresponding $spi$ in $string$. +
'StringReplace["$string$", $srules$, $n$]' +
only performs the first $n$ replacements. +
'StringReplace[{"$string1$", "$string2$", ...}, $srules$]' +
performs the replacements specified by $srules$ on a list + of strings. +
+ + StringReplace replaces all occurrences of one substring with another: + >> StringReplace["xyxyxyyyxxxyyxy", "xy" -> "A"] + = AAAyyxxAyA + + Multiple replacements can be supplied: + >> StringReplace["xyzwxyzwxxyzxyzw", {"xyz" -> "A", "w" -> "BCD"}] + = ABCDABCDxAABCD + + Only replace the first 2 occurences: + >> StringReplace["xyxyxyyyxxxyyxy", "xy" -> "A", 2] + = AAxyyyxxxyyxy + + Also works for multiple rules: + >> StringReplace["abba", {"a" -> "A", "b" -> "B"}, 2] + = ABba + + StringReplace acts on lists of strings too: + >> StringReplace[{"xyxyxxy", "yxyxyxxxyyxy"}, "xy" -> "A"] + = {AAxA, yAAxxAyA} + + #> StringReplace["abcabc", "a" -> "b", Infinity] + = bbcbbc + #> StringReplace[x, "a" -> "b"] + : String or list of strings expected at position 1 in StringReplace[x, a -> b]. + = StringReplace[x, a -> b] + #> StringReplace["xyzwxyzwaxyzxyzw", x] + : x is not a valid string replacement rule. + = StringReplace[xyzwxyzwaxyzxyzw, x] + #> StringReplace["xyzwxyzwaxyzxyzw", x -> y] + : Element x is not a valid string or pattern element in x. + = StringReplace[xyzwxyzwaxyzxyzw, x -> y] + #> StringReplace["abcabc", "a" -> "b", -1] + : Non-negative integer or Infinity expected at position 3 in StringReplace[abcabc, a -> b, -1]. + = StringReplace[abcabc, a -> b, -1] + #> StringReplace["abc", "b" -> 4] + : String expected. + = a <> 4 <> c + + #> StringReplace["01101100010", "01" .. -> "x"] + = x1x100x0 + + #> StringReplace["abc abcb abdc", "ab" ~~ _ -> "X"] + = X Xb Xc + + #> StringReplace["abc abcd abcd", WordBoundary ~~ "abc" ~~ WordBoundary -> "XX"] + = XX abcd abcd + + #> StringReplace["abcd acbd", RegularExpression["[ab]"] -> "XX"] + = XXXXcd XXcXXd + + #> StringReplace["abcd acbd", RegularExpression["[ab]"] ~~ _ -> "YY"] + = YYcd YYYY + + #> StringReplace["abcdabcdaabcabcd", {"abc" -> "Y", "d" -> "XXX"}] + = YXXXYXXXaYYXXX + + + #> StringReplace[" Have a nice day. 
", (StartOfString ~~ Whitespace) | (Whitespace ~~ EndOfString) -> ""] // FullForm + = "Have a nice day." + + #> StringReplace["xyXY", "xy" -> "01"] + = 01XY + #> StringReplace["xyXY", "xy" -> "01", IgnoreCase -> True] + = 0101 + + StringReplace also can be used as an operator: + >> StringReplace["y" -> "ies"]["city"] + = cities + """ + + # TODO Special Characters + """ + #> StringReplace["product: A \\[CirclePlus] B" , "\\[CirclePlus]" -> "x"] + = A x B + """ + + rules = { + "StringReplace[rule_][string_]": "StringReplace[string, rule]", + } + + def _find(self, py_stri, py_rules, py_n, flags, evaluation): + def cases(): + k = 0 + for match, form in _parallel_match(py_stri, py_rules, flags, py_n): + start, end = match.span() + if start > k: + yield String(py_stri[k:start]) + yield _evaluate_match(form, match, evaluation) + k = end + if k < len(py_stri): + yield String(py_stri[k:]) + + return Expression("StringJoin", *list(cases())) + + def apply(self, string, rule, n, evaluation, options): + "%(name)s[string_, rule_, OptionsPattern[%(name)s], n_:System`Private`Null]" + # this pattern is a slight hack to get around missing Shortest/Longest. + return self._apply(string, rule, n, evaluation, options, False) + + +class StringReverse(Builtin): + """ +
+
'StringReverse["$string$"]' +
reverses the order of the characters in "$string$". +
+ + >> StringReverse["live"] + = evil + """ + + attributes = ("Listable", "Protected") + + def apply(self, string, evaluation): + "StringReverse[string_String]" + return String(string.get_string_value()[::-1]) + + +class StringTake(Builtin): + """ +
+
'StringTake["$string$", $n$]' +
gives the first $n$ characters in $string$. + +
'StringTake["$string$", -$n$]' +
gives the last $n$ characters in $string$. + +
'StringTake["$string$", {$n$}]' +
gives the $n$th character in $string$. + +
'StringTake["$string$", {$m$, $n$}]' +
gives characters $m$ through $n$ in $string$. + +
'StringTake["$string$", {$m$, $n$, $s$}]' +
gives characters $m$ through $n$ in steps of $s$. + +
'StringTake[{$s1$, $s2$, ...}, $spec$]' +
gives the list of results for each of the $si$. +
+ + >> StringTake["abcde", 2] + = ab + >> StringTake["abcde", 0] + = #<--# + >> StringTake["abcde", -2] + = de + >> StringTake["abcde", {2}] + = b + >> StringTake["abcd", {2,3}] + = bc + >> StringTake["abcdefgh", {1, 5, 2}] + = ace + + Take the last 2 characters from several strings: + >> StringTake[{"abcdef", "stuv", "xyzw"}, -2] + = {ef, uv, zw} + + StringTake also supports standard sequence specifications + >> StringTake["abcdef", All] + = abcdef + + #> StringTake["abcd", 0] // InputForm + = "" + #> StringTake["abcd", {3, 2}] // InputForm + = "" + #> StringTake["", {1, 0}] // InputForm + = "" + + #> StringTake["abc", {0, 0}] + : Cannot take positions 0 through 0 in "abc". + = StringTake[abc, {0, 0}] + + #> StringTake[{2, 4},2] + : String or list of strings expected at position 1. + = StringTake[{2, 4}, 2] + + #> StringTake["kkkl",Graphics[{}]] + : Integer or a list of sequence specifications expected at position 2. + = StringTake[kkkl, -Graphics-] + """ + + messages = { + "strse": "String or list of strings expected at position 1.", + # FIXME: mseqs should be: Sequence specification (+n, -n, {+n}, {-n}, {m, n}, or {m, n, s}) or a list + # of sequence specifications expected at position 2 in + "mseqs": "Integer or a list of sequence specifications expected at position 2.", + "take": 'Cannot take positions `1` through `2` in "`3`".', + } + + def apply(self, string, seqspec, evaluation): + "StringTake[string_String, seqspec_]" + result = string.get_string_value() + if result is None: + return evaluation.message("StringTake", "strse") + + if isinstance(seqspec, Integer): + pos = seqspec.get_int_value() + if pos >= 0: + seq = (1, pos, 1) + else: + seq = (pos, None, 1) + else: + seq = convert_seq(seqspec) + + if seq is None: + return evaluation.message("StringTake", "mseqs") + + start, stop, step = seq + py_slice = python_seq(start, stop, step, len(result)) + + if py_slice is None: + return evaluation.message("StringTake", "take", start, stop, string) + + return 
String(result[py_slice]) + + def apply_strings(self, strings, spec, evaluation): + "StringTake[strings__, spec_]" + result_list = [] + for string in strings.leaves: + result = self.apply(string, spec, evaluation) + if result is None: + return None + result_list.append(result) + return Expression("List", *result_list) diff --git a/mathics/builtin/string/patterns.py b/mathics/builtin/string/patterns.py new file mode 100644 index 000000000..1b94e308c --- /dev/null +++ b/mathics/builtin/string/patterns.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- +""" +String Patterns +""" + +import re + +from mathics.version import __version__ # noqa used in loading to check consistency. + +from mathics.builtin.base import Builtin +from mathics.core.expression import ( + Expression, + Integer1, + SymbolFalse, + SymbolTrue, + ) + + +from mathics.builtin.strings import ( + anchor_pattern, + to_regex, +) + +class StringMatchQ(Builtin): + r""" + >> StringMatchQ["abc", "abc"] + = True + + >> StringMatchQ["abc", "abd"] + = False + + >> StringMatchQ["15a94xcZ6", (DigitCharacter | LetterCharacter)..] + = True + + #> StringMatchQ["abc1", LetterCharacter] + = False + + #> StringMatchQ["abc", "ABC"] + = False + #> StringMatchQ["abc", "ABC", IgnoreCase -> True] + = True + + ## Words containing nonword characters + #> StringMatchQ[{"monkey", "don't", "AAA", "S&P"}, ___ ~~ Except[WordCharacter] ~~ ___] + = {False, True, False, True} + + ## Try to match a literal number + #> StringMatchQ[1.5, NumberString] + : String or list of strings expected at position 1 in StringMatchQ[1.5, NumberString]. 
+ = StringMatchQ[1.5, NumberString] + + Use StringMatchQ as an operator + >> StringMatchQ[LetterCharacter]["a"] + = True + + ## Abbreviated string patterns Issue #517 + #> StringMatchQ["abcd", "abc*"] + = True + #> StringMatchQ["abc", "abc*"] + = True + #> StringMatchQ["abc\\", "abc\\"] + = True + #> StringMatchQ["abc*d", "abc\\*d"] + = True + #> StringMatchQ["abc*d", "abc\\**"] + = True + #> StringMatchQ["abcde", "a*f"] + = False + + #> StringMatchQ["abcde", "a@e"] + = True + #> StringMatchQ["aBCDe", "a@e"] + = False + #> StringMatchQ["ae", "a@e"] + = False + """ + + attributes = ("Listable",) + + options = { + "IgnoreCase": "False", + "SpellingCorrections": "None", + } + + messages = { + "strse": "String or list of strings expected at position `1` in `2`.", + } + + rules = { + "StringMatchQ[patt_][expr_]": "StringMatchQ[expr, patt]", + } + + def apply(self, string, patt, evaluation, options): + "StringMatchQ[string_, patt_, OptionsPattern[%(name)s]]" + py_string = string.get_string_value() + if py_string is None: + return evaluation.message( + "StringMatchQ", + "strse", + Integer1, + Expression("StringMatchQ", string, patt), + ) + + re_patt = to_regex(patt, evaluation, abbreviated_patterns=True) + if re_patt is None: + return evaluation.message( + "StringExpression", "invld", patt, Expression("StringExpression", patt) + ) + + re_patt = anchor_pattern(re_patt) + + flags = re.MULTILINE + if options["System`IgnoreCase"] == SymbolTrue: + flags = flags | re.IGNORECASE + + if re.match(re_patt, py_string, flags=flags) is None: + return SymbolFalse + else: + return SymbolTrue diff --git a/mathics/builtin/strings.py b/mathics/builtin/strings.py index beef742fa..ff34b1a38 100644 --- a/mathics/builtin/strings.py +++ b/mathics/builtin/strings.py @@ -1,13 +1,11 @@ # -*- coding: utf-8 -*- - """ -Strings and Characters +Unsorted Strings and Characters """ import io import re import sys -from sys import version_info import unicodedata from binascii import hexlify, unhexlify from 
heapq import heappush, heappop @@ -33,11 +31,9 @@ Integer, Integer0, Integer1, - from_python, string_list, ) from mathics.core.parser import MathicsFileLineFeeder, parse -from mathics.builtin.lists import python_seq, convert_seq from mathics.settings import SYSTEM_CHARACTER_ENCODING from mathics_scanner import TranslateError @@ -295,24 +291,11 @@ def mathics_split(patt, string, flags): return [string[start:stop] for start, stop in indices] -if version_info >= (3, 0): - - def pack_bytes(codes): - return bytes(codes) - - def unpack_bytes(codes): - return [int(code) for code in codes] - - -else: - from struct import pack, unpack - - def pack_bytes(codes): - return pack("B" * len(codes), *codes) - - def unpack_bytes(codes): - return unpack("B" * len(codes), codes) +def pack_bytes(codes): + return bytes(codes) +def unpack_bytes(codes): + return [int(code) for code in codes] class SystemCharacterEncoding(Predefined): """ @@ -833,205 +816,6 @@ class HexidecimalCharacter(Builtin): """ -class DigitQ(Builtin): - """ -
-
'DigitQ[$string$]' - yields 'True' if all the characters in the $string$ are digits, and yields 'False' otherwise. -
- - >> DigitQ["9"] - = True - - >> DigitQ["a"] - = False - - >> DigitQ["01001101011000010111010001101000011010010110001101110011"] - = True - - >> DigitQ["-123456789"] - = False - - """ - - rules = { - "DigitQ[string_]": ( - "If[StringQ[string], StringMatchQ[string, DigitCharacter...], False, False]" - ), - } - - -class LetterQ(Builtin): - """ -
-
'LetterQ[$string$]' - yields 'True' if all the characters in the $string$ are letters, and yields 'False' otherwise. -
- - >> LetterQ["m"] - = True - - >> LetterQ["9"] - = False - - >> LetterQ["Mathics"] - = True - - >> LetterQ["Welcome to Mathics"] - = False - - #> LetterQ[""] - = True - - #> LetterQ["\\[Alpha]\\[Beta]\\[Gamma]\\[Delta]\\[Epsilon]\\[Zeta]\\[Eta]\\[Theta]"] - = True - """ - - rules = { - "LetterQ[string_]": ( - "If[StringQ[string], StringMatchQ[string, LetterCharacter...], False, False]" - ), - } - - -class StringMatchQ(Builtin): - r""" - >> StringMatchQ["abc", "abc"] - = True - - >> StringMatchQ["abc", "abd"] - = False - - >> StringMatchQ["15a94xcZ6", (DigitCharacter | LetterCharacter)..] - = True - - #> StringMatchQ["abc1", LetterCharacter] - = False - - #> StringMatchQ["abc", "ABC"] - = False - #> StringMatchQ["abc", "ABC", IgnoreCase -> True] - = True - - ## Words containing nonword characters - #> StringMatchQ[{"monkey", "don't", "AAA", "S&P"}, ___ ~~ Except[WordCharacter] ~~ ___] - = {False, True, False, True} - - ## Try to match a literal number - #> StringMatchQ[1.5, NumberString] - : String or list of strings expected at position 1 in StringMatchQ[1.5, NumberString]. 
- = StringMatchQ[1.5, NumberString] - - Use StringMatchQ as an operator - >> StringMatchQ[LetterCharacter]["a"] - = True - - ## Abbreviated string patterns Issue #517 - #> StringMatchQ["abcd", "abc*"] - = True - #> StringMatchQ["abc", "abc*"] - = True - #> StringMatchQ["abc\\", "abc\\"] - = True - #> StringMatchQ["abc*d", "abc\\*d"] - = True - #> StringMatchQ["abc*d", "abc\\**"] - = True - #> StringMatchQ["abcde", "a*f"] - = False - - #> StringMatchQ["abcde", "a@e"] - = True - #> StringMatchQ["aBCDe", "a@e"] - = False - #> StringMatchQ["ae", "a@e"] - = False - """ - - attributes = ("Listable",) - - options = { - "IgnoreCase": "False", - "SpellingCorrections": "None", - } - - messages = { - "strse": "String or list of strings expected at position `1` in `2`.", - } - - rules = { - "StringMatchQ[patt_][expr_]": "StringMatchQ[expr, patt]", - } - - def apply(self, string, patt, evaluation, options): - "StringMatchQ[string_, patt_, OptionsPattern[%(name)s]]" - py_string = string.get_string_value() - if py_string is None: - return evaluation.message( - "StringMatchQ", - "strse", - Integer1, - Expression("StringMatchQ", string, patt), - ) - - re_patt = to_regex(patt, evaluation, abbreviated_patterns=True) - if re_patt is None: - return evaluation.message( - "StringExpression", "invld", patt, Expression("StringExpression", patt) - ) - - re_patt = anchor_pattern(re_patt) - - flags = re.MULTILINE - if options["System`IgnoreCase"] == SymbolTrue: - flags = flags | re.IGNORECASE - - if re.match(re_patt, py_string, flags=flags) is None: - return SymbolFalse - else: - return SymbolTrue - - -class StringJoin(BinaryOperator): - """ -
-
'StringJoin["$s1$", "$s2$", ...]' -
returns the concatenation of the strings $s1$, $s2$, …. -
- - >> StringJoin["a", "b", "c"] - = abc - >> "a" <> "b" <> "c" // InputForm - = "abc" - - 'StringJoin' flattens lists out: - >> StringJoin[{"a", "b"}] // InputForm - = "ab" - >> Print[StringJoin[{"Hello", " ", {"world"}}, "!"]] - | Hello world! - """ - - operator = "<>" - precedence = 600 - attributes = ("Flat", "OneIdentity") - - def apply(self, items, evaluation): - "StringJoin[items___]" - - result = "" - items = items.flatten(SymbolList) - if items.get_head_name() == "System`List": - items = items.leaves - else: - items = items.get_sequence() - for item in items: - if not isinstance(item, String): - evaluation.message("StringJoin", "string") - return - result += item.value - return String(result) - - class StringSplit(Builtin): """
@@ -1137,191 +921,6 @@ def apply(self, string, patt, evaluation, options): ) -class StringPosition(Builtin): - """ -
-
'StringPosition["$string$", $patt$]' -
gives a list of starting and ending positions where $patt$ matches "$string$". -
'StringPosition["$string$", $patt$, $n$]' -
returns the first $n$ matches only. -
'StringPosition["$string$", {$patt1$, $patt2$, ...}, $n$]' -
matches multiple patterns. -
'StringPosition[{$s1$, $s2$, ...}, $patt$]' -
returns a list of matches for multiple strings. -
- - >> StringPosition["123ABCxyABCzzzABCABC", "ABC"] - = {{4, 6}, {9, 11}, {15, 17}, {18, 20}} - - >> StringPosition["123ABCxyABCzzzABCABC", "ABC", 2] - = {{4, 6}, {9, 11}} - - 'StringPosition' can be useful for searching through text. - >> data = Import["ExampleData/EinsteinSzilLetter.txt"]; - >> StringPosition[data, "uranium"] - = {{299, 305}, {870, 876}, {1538, 1544}, {1671, 1677}, {2300, 2306}, {2784, 2790}, {3093, 3099}} - - #> StringPosition["123ABCxyABCzzzABCABC", "ABC", -1] - : Non-negative integer or Infinity expected at position 3 in StringPosition[123ABCxyABCzzzABCABC, ABC, -1]. - = StringPosition[123ABCxyABCzzzABCABC, ABC, -1] - - ## Overlaps - #> StringPosition["1231221312112332", RegularExpression["[12]+"]] - = {{1, 2}, {2, 2}, {4, 7}, {5, 7}, {6, 7}, {7, 7}, {9, 13}, {10, 13}, {11, 13}, {12, 13}, {13, 13}, {16, 16}} - #> StringPosition["1231221312112332", RegularExpression["[12]+"], Overlaps -> False] - = {{1, 2}, {4, 7}, {9, 13}, {16, 16}} - #> StringPosition["1231221312112332", RegularExpression["[12]+"], Overlaps -> x] - = {{1, 2}, {4, 7}, {9, 13}, {16, 16}} - #> StringPosition["1231221312112332", RegularExpression["[12]+"], Overlaps -> All] - : Overlaps -> All option is not currently implemented in Mathics. 
- = {{1, 2}, {2, 2}, {4, 7}, {5, 7}, {6, 7}, {7, 7}, {9, 13}, {10, 13}, {11, 13}, {12, 13}, {13, 13}, {16, 16}} - - #> StringPosition["21211121122", {"121", "11"}] - = {{2, 4}, {4, 5}, {5, 6}, {6, 8}, {8, 9}} - #> StringPosition["21211121122", {"121", "11"}, Overlaps -> False] - = {{2, 4}, {5, 6}, {8, 9}} - - #> StringPosition[{"abc", "abcda"}, "a"] - = {{{1, 1}}, {{1, 1}, {5, 5}}} - - #> StringPosition[{"abc"}, "a", Infinity] - = {{{1, 1}}} - - #> StringPosition["abc"]["123AabcDEabc"] - = {{5, 7}, {10, 12}} - """ - - options = { - "IgnoreCase": "False", - "MetaCharacters": "None", - "Overlaps": "True", - } - - messages = { - "strse": "String or list of strings expected at position `1` in `2`.", - "overall": "Overlaps -> All option is not currently implemented in Mathics.", - "innf": "Non-negative integer or Infinity expected at position `2` in `1`.", - } - - rules = { - "StringPosition[patt_][s_]": "StringPosition[s, patt]", - } - - def apply(self, string, patt, evaluation, options): - "StringPosition[string_, patt_, OptionsPattern[StringPosition]]" - - return self.apply_n( - string, - patt, - Expression("DirectedInfinity", Integer1), - evaluation, - options, - ) - - def apply_n(self, string, patt, n, evaluation, options): - "StringPosition[string_, patt_, n:(_Integer|DirectedInfinity[1]), OptionsPattern[StringPosition]]" - - expr = Expression("StringPosition", string, patt, n) - - # check n - if n.has_form("DirectedInfinity", 1): - py_n = float("inf") - else: - py_n = n.get_int_value() - if py_n is None or py_n < 0: - return evaluation.message("StringPosition", "innf", expr, Integer(3)) - - # check options - if options["System`Overlaps"] == SymbolTrue: - overlap = True - elif options["System`Overlaps"] == SymbolFalse: - overlap = False - elif options["System`Overlaps"] == Symbol("All"): - # TODO - evaluation.message("StringPosition", "overall") - overlap = True - else: - overlap = False # unknown options are teated as False - - # convert patterns - if 
patt.has_form("List", None): - patts = patt.get_leaves() - else: - patts = [patt] - re_patts = [] - for p in patts: - py_p = to_regex(p, evaluation) - if py_p is None: - return evaluation.message("StringExpression", "invld", p, patt) - re_patts.append(py_p) - compiled_patts = [re.compile(re_patt) for re_patt in re_patts] - - # string or list of strings - if string.has_form("List", None): - py_strings = [s.get_string_value() for s in string.leaves] - if None in py_strings: - return - results = [ - self.do_apply(py_string, compiled_patts, py_n, overlap) - for py_string in py_strings - ] - return Expression(SymbolList, *results) - else: - py_string = string.get_string_value() - if py_string is None: - return - return self.do_apply(py_string, compiled_patts, py_n, overlap) - - @staticmethod - def do_apply(py_string, compiled_patts, py_n, overlap): - result = [] - start = 0 - while start < len(py_string): - found_match = False - for compiled_patt in compiled_patts: - m = compiled_patt.match(py_string, start) - if m is None: - continue - found_match = True - result.append([m.start() + 1, m.end()]) # 0 to 1 based indexing - if len(result) == py_n: - return from_python(result) - if not overlap: - start = m.end() - if overlap or not found_match: - start += 1 - return from_python(result) - - -class StringLength(Builtin): - """ -
-
'StringLength["$string$"]' -
gives the length of $string$. -
- - >> StringLength["abc"] - = 3 - 'StringLength' is listable: - >> StringLength[{"a", "bc"}] - = {1, 2} - - >> StringLength[x] - : String expected. - = StringLength[x] - """ - - attributes = ("Listable",) - - def apply(self, str, evaluation): - "StringLength[str_]" - - if not isinstance(str, String): - evaluation.message("StringLength", "string") - return - return Integer(len(str.value)) - - class _StringFind(Builtin): attributes = "Protected" @@ -1410,139 +1009,6 @@ def convert_rule(r): return self._find(py_strings, py_rules, py_n, flags, evaluation) -class StringReplace(_StringFind): - """ -
-
'StringReplace["$string$", "$a$"->"$b$"]' -
replaces each occurrence of $old$ with $new$ in $string$. -
'StringReplace["$string$", {"$s1$"->"$sp1$", "$s2$"->"$sp2$"}]' -
performs multiple replacements of each $si$ by the - corresponding $spi$ in $string$. -
'StringReplace["$string$", $srules$, $n$]' -
only performs the first $n$ replacements. -
'StringReplace[{"$string1$", "$string2$", ...}, $srules$]' -
performs the replacements specified by $srules$ on a list - of strings. -
- - StringReplace replaces all occurrences of one substring with another: - >> StringReplace["xyxyxyyyxxxyyxy", "xy" -> "A"] - = AAAyyxxAyA - - Multiple replacements can be supplied: - >> StringReplace["xyzwxyzwxxyzxyzw", {"xyz" -> "A", "w" -> "BCD"}] - = ABCDABCDxAABCD - - Only replace the first 2 occurences: - >> StringReplace["xyxyxyyyxxxyyxy", "xy" -> "A", 2] - = AAxyyyxxxyyxy - - Also works for multiple rules: - >> StringReplace["abba", {"a" -> "A", "b" -> "B"}, 2] - = ABba - - StringReplace acts on lists of strings too: - >> StringReplace[{"xyxyxxy", "yxyxyxxxyyxy"}, "xy" -> "A"] - = {AAxA, yAAxxAyA} - - #> StringReplace["abcabc", "a" -> "b", Infinity] - = bbcbbc - #> StringReplace[x, "a" -> "b"] - : String or list of strings expected at position 1 in StringReplace[x, a -> b]. - = StringReplace[x, a -> b] - #> StringReplace["xyzwxyzwaxyzxyzw", x] - : x is not a valid string replacement rule. - = StringReplace[xyzwxyzwaxyzxyzw, x] - #> StringReplace["xyzwxyzwaxyzxyzw", x -> y] - : Element x is not a valid string or pattern element in x. - = StringReplace[xyzwxyzwaxyzxyzw, x -> y] - #> StringReplace["abcabc", "a" -> "b", -1] - : Non-negative integer or Infinity expected at position 3 in StringReplace[abcabc, a -> b, -1]. - = StringReplace[abcabc, a -> b, -1] - #> StringReplace["abc", "b" -> 4] - : String expected. - = a <> 4 <> c - - #> StringReplace["01101100010", "01" .. -> "x"] - = x1x100x0 - - #> StringReplace["abc abcb abdc", "ab" ~~ _ -> "X"] - = X Xb Xc - - #> StringReplace["abc abcd abcd", WordBoundary ~~ "abc" ~~ WordBoundary -> "XX"] - = XX abcd abcd - - #> StringReplace["abcd acbd", RegularExpression["[ab]"] -> "XX"] - = XXXXcd XXcXXd - - #> StringReplace["abcd acbd", RegularExpression["[ab]"] ~~ _ -> "YY"] - = YYcd YYYY - - #> StringReplace["abcdabcdaabcabcd", {"abc" -> "Y", "d" -> "XXX"}] - = YXXXYXXXaYYXXX - - - #> StringReplace[" Have a nice day. 
", (StartOfString ~~ Whitespace) | (Whitespace ~~ EndOfString) -> ""] // FullForm - = "Have a nice day." - - #> StringReplace["xyXY", "xy" -> "01"] - = 01XY - #> StringReplace["xyXY", "xy" -> "01", IgnoreCase -> True] - = 0101 - - StringReplace also can be used as an operator: - >> StringReplace["y" -> "ies"]["city"] - = cities - """ - - # TODO Special Characters - """ - #> StringReplace["product: A \\[CirclePlus] B" , "\\[CirclePlus]" -> "x"] - = A x B - """ - - rules = { - "StringReplace[rule_][string_]": "StringReplace[string, rule]", - } - - def _find(self, py_stri, py_rules, py_n, flags, evaluation): - def cases(): - k = 0 - for match, form in _parallel_match(py_stri, py_rules, flags, py_n): - start, end = match.span() - if start > k: - yield String(py_stri[k:start]) - yield _evaluate_match(form, match, evaluation) - k = end - if k < len(py_stri): - yield String(py_stri[k:]) - - return Expression("StringJoin", *list(cases())) - - def apply(self, string, rule, n, evaluation, options): - "%(name)s[string_, rule_, OptionsPattern[%(name)s], n_:System`Private`Null]" - # this pattern is a slight hack to get around missing Shortest/Longest. - return self._apply(string, rule, n, evaluation, options, False) - - -class StringReverse(Builtin): - """ -
-
'StringReverse["$string$"]' -
reverses the order of the characters in "string". -
- - >> StringReverse["live"] - = evil - """ - - attributes = ("Listable", "Protected") - - def apply(self, string, evaluation): - "StringReverse[string_String]" - return String(string.get_string_value()[::-1]) - - class StringCases(_StringFind): """
@@ -1658,70 +1124,6 @@ def apply_truncated(self, s, n, m, expression, evaluation): return String((py_s * py_n)[:py_m]) -class Characters(Builtin): - """ -
-
'Characters["$string$"]' -
returns a list of the characters in $string$. -
- - >> Characters["abc"] - = {a, b, c} - - #> \\.78\\.79\\.7A - = xyz - - #> \\:0078\\:0079\\:007A - = xyz - - #> \\101\\102\\103\\061\\062\\063 - = ABC123 - - #> \\[Alpha]\\[Beta]\\[Gamma] - = \u03B1\u03B2\u03B3 - """ - - attributes = ("Listable",) - - def apply(self, string, evaluation): - "Characters[string_String]" - - return Expression(SymbolList, *(String(c) for c in string.value)) - - -class CharacterRange(Builtin): - """ -
-
'CharacterRange["$a$", "$b$"]' -
returns a list of the Unicode characters from $a$ to $b$ - inclusive. -
- - >> CharacterRange["a", "e"] - = {a, b, c, d, e} - >> CharacterRange["b", "a"] - = {} - """ - - attributes = ("ReadProtected",) - - messages = { - "argtype": "Arguments `1` and `2` are not both strings of length 1.", - } - - def apply(self, start, stop, evaluation): - "CharacterRange[start_String, stop_String]" - - if len(start.value) != 1 or len(stop.value) != 1: - evaluation.message("CharacterRange", "argtype", start, stop) - return - start = ord(start.value[0]) - stop = ord(stop.value[0]) - return Expression( - "List", *[String(chr(code)) for code in range(start, stop + 1)] - ) - - class String_(Builtin): """
@@ -1746,80 +1148,6 @@ class String_(Builtin): name = "String" -class LowerCaseQ(Test): - """ -
-
'LowerCaseQ[$s$]' -
returns True if $s$ consists wholly of lower case characters. -
- - >> LowerCaseQ["abc"] - = True - - An empty string returns True. - >> LowerCaseQ[""] - = True - """ - - def test(self, s): - return isinstance(s, String) and all(c.islower() for c in s.get_string_value()) - - -class ToLowerCase(Builtin): - """ -
-
'ToLowerCase[$s$]' -
returns $s$ in all lower case. -
- - >> ToLowerCase["New York"] - = new york - """ - - attributes = ("Listable", "Protected") - - def apply(self, s, evaluation): - "ToLowerCase[s_String]" - return String(s.get_string_value().lower()) - - -class UpperCaseQ(Test): - """ -
-
'UpperCaseQ[$s$]' -
returns True if $s$ consists wholly of upper case characters. -
- - >> UpperCaseQ["ABC"] - = True - - An empty string returns True. - >> UpperCaseQ[""] - = True - """ - - def test(self, s): - return isinstance(s, String) and all(c.isupper() for c in s.get_string_value()) - - -class ToUpperCase(Builtin): - """ -
-
'ToUpperCase[$s$]' -
returns $s$ in all upper case. -
- - >> ToUpperCase["New York"] - = NEW YORK - """ - - attributes = ("Listable", "Protected") - - def apply(self, s, evaluation): - "ToUpperCase[s_String]" - return String(s.get_string_value().upper()) - - class ToString(Builtin): """
@@ -2287,209 +1615,6 @@ def test(self, expr): return isinstance(expr, String) -class StringTake(Builtin): - """ -
-
'StringTake["$string$", $n$]' -
gives the first $n$ characters in $string$. - -
'StringTake["$string$", -$n$]' -
gives the last $n$ characters in $string$. - -
'StringTake["$string$", {$n$}]' -
gives the $n$th character in $string$. - -
'StringTake["$string$", {$m$, $n$}]' -
gives characters $m$ through $n$ in $string$. - -
'StringTake["$string$", {$m$, $n$, $s$}]' -
gives characters $m$ through $n$ in steps of $s$. - -
'StringTake[{$s1$, $s2$, ...} $spec$}]' -
gives the list of results for each of the $si$. -
- - >> StringTake["abcde", 2] - = ab - >> StringTake["abcde", 0] - = #<--# - >> StringTake["abcde", -2] - = de - >> StringTake["abcde", {2}] - = b - >> StringTake["abcd", {2,3}] - = bc - >> StringTake["abcdefgh", {1, 5, 2}] - = ace - - Take the last 2 characters from several strings: - >> StringTake[{"abcdef", "stuv", "xyzw"}, -2] - = {ef, uv, zw} - - StringTake also supports standard sequence specifications - >> StringTake["abcdef", All] - = abcdef - - #> StringTake["abcd", 0] // InputForm - = "" - #> StringTake["abcd", {3, 2}] // InputForm - = "" - #> StringTake["", {1, 0}] // InputForm - = "" - - #> StringTake["abc", {0, 0}] - : Cannot take positions 0 through 0 in "abc". - = StringTake[abc, {0, 0}] - - #> StringTake[{2, 4},2] - : String or list of strings expected at position 1. - = StringTake[{2, 4}, 2] - - #> StringTake["kkkl",Graphics[{}]] - : Integer or a list of sequence specifications expected at position 2. - = StringTake[kkkl, -Graphics-] - """ - - messages = { - "strse": "String or list of strings expected at position 1.", - # FIXME: mseqs should be: Sequence specification (+n, -n, {+n}, {-n}, {m, n}, or {m, n, s}) or a list - # of sequence specifications expected at position 2 in - "mseqs": "Integer or a list of sequence specifications expected at position 2.", - "take": 'Cannot take positions `1` through `2` in "`3`".', - } - - def apply(self, string, seqspec, evaluation): - "StringTake[string_String, seqspec_]" - result = string.get_string_value() - if result is None: - return evaluation.message("StringTake", "strse") - - if isinstance(seqspec, Integer): - pos = seqspec.get_int_value() - if pos >= 0: - seq = (1, pos, 1) - else: - seq = (pos, None, 1) - else: - seq = convert_seq(seqspec) - - if seq is None: - return evaluation.message("StringTake", "mseqs") - - start, stop, step = seq - py_slice = python_seq(start, stop, step, len(result)) - - if py_slice is None: - return evaluation.message("StringTake", "take", start, stop, string) - - return 
String(result[py_slice]) - - def apply_strings(self, strings, spec, evaluation): - "StringTake[strings__, spec_]" - result_list = [] - for string in strings.leaves: - result = self.apply(string, spec, evaluation) - if result is None: - return None - result_list.append(result) - return Expression("List", *result_list) - - -class StringDrop(Builtin): - """ -
-
'StringDrop["$string$", $n$]' -
gives $string$ with the first $n$ characters dropped. -
'StringDrop["$string$", -$n$]' -
gives $string$ with the last $n$ characters dropped. -
'StringDrop["$string$", {$n$}]' -
gives $string$ with the $n$th character dropped. -
'StringDrop["$string$", {$m$, $n$}]' -
gives $string$ with the characters $m$ through $n$ dropped. -
- - >> StringDrop["abcde", 2] - = cde - >> StringDrop["abcde", -2] - = abc - >> StringDrop["abcde", {2}] - = acde - >> StringDrop["abcde", {2,3}] - = ade - >> StringDrop["abcd",{3,2}] - = abcd - >> StringDrop["abcd",0] - = abcd - """ - - messages = { - "strse": "String expected at position 1.", - "mseqs": "Integer or list of two Integers are expected at position 2.", - "drop": 'Cannot drop positions `1` through `2` in "`3`".', - } - - def apply_with_n(self, string, n, evaluation): - "StringDrop[string_,n_Integer]" - if not isinstance(string, String): - return evaluation.message("StringDrop", "strse") - if isinstance(n, Integer): - pos = n.value - if pos > len(string.get_string_value()): - return evaluation.message("StringDrop", "drop", 1, pos, string) - if pos < -len(string.get_string_value()): - return evaluation.message("StringDrop", "drop", pos, -1, string) - if pos > 0: - return String(string.get_string_value()[pos:]) - if pos < 0: - return String(string.get_string_value()[:(pos)]) - if pos == 0: - return string - return evaluation.message("StringDrop", "mseqs") - - def apply_with_ni_nf(self, string, ni, nf, evaluation): - "StringDrop[string_,{ni_Integer,nf_Integer}]" - if not isinstance(string, String): - return evaluation.message("StringDrop", "strse", string) - - if ni.value == 0 or nf.value == 0: - return evaluation.message("StringDrop", "drop", ni, nf) - fullstring = string.get_string_value() - lenfullstring = len(fullstring) - posi = ni.value - if posi < 0: - posi = lenfullstring + posi + 1 - posf = nf.value - if posf < 0: - posf = lenfullstring + posf + 1 - if posf > lenfullstring or posi > lenfullstring or posf <= 0 or posi <= 0: - # positions out or range - return evaluation.message("StringDrop", "drop", ni, nf, fullstring) - if posf < posi: - return string # this is what actually mma does - return String(fullstring[: (posi - 1)] + fullstring[posf:]) - - def apply_with_ni(self, string, ni, evaluation): - "StringDrop[string_,{ni_Integer}]" - if not 
isinstance(string, String): - return evaluation.message("StringDrop", "strse", string) - if ni.value == 0: - return evaluation.message("StringDrop", "drop", ni, ni) - fullstring = string.get_string_value() - lenfullstring = len(fullstring) - posi = ni.value - if posi < 0: - posi = lenfullstring + posi + 1 - if posi > lenfullstring or posi <= 0: - return evaluation.message("StringDrop", "drop", ni, ni, fullstring) - return String(fullstring[: (posi - 1)] + fullstring[posi:]) - - def apply(self, string, something, evaluation): - "StringDrop[string_,something___]" - if not isinstance(string, String): - return evaluation.message("StringDrop", "strse") - return evaluation.message("StringDrop", "mseqs") - - class HammingDistance(Builtin): """
@@ -2844,189 +1969,6 @@ def apply_pattern(self, s, patt, expression, evaluation): return String(text[left:right]) -class StringInsert(Builtin): - """ -
-
'StringInsert["$string$", "$snew$", $n$]' -
yields a string with $snew$ inserted starting at position $n$ in $string$. - -
'StringInsert["$string$", "$snew$", -$n$]' -
inserts a at position $n$ from the end of "$string$". - -
'StringInsert["$string$", "$snew$", {$n_1$, $n_2$, ...}]' -
inserts a copy of $snew$ at each position $n_i$ in $string$; - the $n_i$ are taken before any insertion is done. - -
'StringInsert[{$s_1$, $s_2$, ...}, "$snew$", $n$]' -
gives the list of resutls for each of the $s_i$. -
- - >> StringInsert["noting", "h", 4] - = nothing - - #> StringInsert["abcdefghijklm", "X", 15] - : Cannot insert at position 15 in abcdefghijklm. - = StringInsert[abcdefghijklm, X, 15] - - #> StringInsert[abcdefghijklm, "X", 4] - : String or list of strings expected at position 1 in StringInsert[abcdefghijklm, X, 4]. - = StringInsert[abcdefghijklm, X, 4] - - #> StringInsert["abcdefghijklm", X, 4] - : String expected at position 2 in StringInsert[abcdefghijklm, X, 4]. - = StringInsert[abcdefghijklm, X, 4] - - #> StringInsert["abcdefghijklm", "X", a] - : Position specification a in StringInsert[abcdefghijklm, X, a] is not a machine-sized integer or a list of machine-sized integers. - = StringInsert[abcdefghijklm, X, a] - - #> StringInsert["abcdefghijklm", "X", 0] - : Cannot insert at position 0 in abcdefghijklm. - = StringInsert[abcdefghijklm, X, 0] - - >> StringInsert["note", "d", -1] - = noted - - >> StringInsert["here", "t", -5] - = there - - #> StringInsert["abcdefghijklm", "X", -15] - : Cannot insert at position -15 in abcdefghijklm. - = StringInsert[abcdefghijklm, X, -15] - - >> StringInsert["adac", "he", {1, 5}] - = headache - - #> StringInsert["abcdefghijklm", "X", {1, -1, 14, -14}] - = XXabcdefghijklmXX - - #> StringInsert["abcdefghijklm", "X", {1, 0}] - : Cannot insert at position 0 in abcdefghijklm. - = StringInsert[abcdefghijklm, X, {1, 0}] - - #> StringInsert["", "X", {1}] - = X - - #> StringInsert["", "X", {1, -1}] - = XX - - #> StringInsert["", "", {1}] - = #<--# - - #> StringInsert["", "X", {1, 2}] - : Cannot insert at position 2 in . - = StringInsert[, X, {1, 2}] - - #> StringInsert["abcdefghijklm", "", {1, 2, 3, 4 ,5, -6}] - = abcdefghijklm - - #> StringInsert["abcdefghijklm", "X", {}] - = abcdefghijklm - - >> StringInsert[{"something", "sometimes"}, " ", 5] - = {some thing, some times} - - #> StringInsert[{"abcdefghijklm", "Mathics"}, "X", 13] - : Cannot insert at position 13 in Mathics. 
- = {abcdefghijklXm, StringInsert[Mathics, X, 13]} - - #> StringInsert[{"", ""}, "", {1, 1, 1, 1}] - = {, } - - #> StringInsert[{"abcdefghijklm", "Mathics"}, "X", {0, 2}] - : Cannot insert at position 0 in abcdefghijklm. - : Cannot insert at position 0 in Mathics. - = {StringInsert[abcdefghijklm, X, {0, 2}], StringInsert[Mathics, X, {0, 2}]} - - #> StringInsert[{"abcdefghijklm", Mathics}, "X", {1, 2}] - : String or list of strings expected at position 1 in StringInsert[{abcdefghijklm, Mathics}, X, {1, 2}]. - = StringInsert[{abcdefghijklm, Mathics}, X, {1, 2}] - - #> StringInsert[{"", "Mathics"}, "X", {1, 1, -1}] - = {XXX, XXMathicsX} - - >> StringInsert["1234567890123456", ".", Range[-16, -4, 3]] - = 1.234.567.890.123.456""" - - messages = { - "strse": "String or list of strings expected at position `1` in `2`.", - "string": "String expected at position `1` in `2`.", - "ins": "Cannot insert at position `1` in `2`.", - "psl": "Position specification `1` in `2` is not a machine-sized integer or a list of machine-sized integers.", - } - - def _insert(self, str, add, lpos, evaluation): - for pos in lpos: - if abs(pos) < 1 or abs(pos) > len(str) + 1: - evaluation.message("StringInsert", "ins", Integer(pos), String(str)) - return evaluation.format_output( - Expression( - "StringInsert", str, add, lpos[0] if len(lpos) == 1 else lpos - ) - ) - - # Create new list of position which are rearranged - pos_limit = len(str) + 2 - listpos = [p if p > 0 else pos_limit + p for p in lpos] - listpos.sort() - - result = "" - start = 0 - for pos in listpos: - stop = pos - 1 - result += str[start:stop] + add - start = stop - else: - result += str[start : len(str)] - - return result - - def apply(self, strsource, strnew, pos, evaluation): - "StringInsert[strsource_, strnew_, pos_]" - - exp = Expression("StringInsert", strsource, strnew, pos) - - py_strnew = strnew.get_string_value() - if py_strnew is None: - return evaluation.message("StringInsert", "string", Integer(2), exp) - - # Check 
and create list of position - listpos = [] - if pos.has_form("List", None): - leaves = pos.get_leaves() - if not leaves: - return strsource - else: - for i, posi in enumerate(leaves): - py_posi = posi.get_int_value() - if py_posi is None: - return evaluation.message("StringInsert", "psl", pos, exp) - listpos.append(py_posi) - else: - py_pos = pos.get_int_value() - if py_pos is None: - return evaluation.message("StringInsert", "psl", pos, exp) - listpos.append(py_pos) - - # Check and perform the insertion - if strsource.has_form("List", None): - py_strsource = [sub.get_string_value() for sub in strsource.leaves] - if any(sub is None for sub in py_strsource): - return evaluation.message("StringInsert", "strse", Integer1, exp) - return Expression( - "List", - *[ - String(self._insert(s, py_strnew, listpos, evaluation)) - for s in py_strsource - ] - ) - else: - py_strsource = strsource.get_string_value() - if py_strsource is None: - return evaluation.message("StringInsert", "strse", Integer1, exp) - return String(self._insert(py_strsource, py_strnew, listpos, evaluation)) - - def _pattern_search(name, string, patt, evaluation, options, matched): # Get the pattern list and check validity for each if patt.has_form("List", None): From d976d9abbba9f87a5cc7a35502753059747cb4eb Mon Sep 17 00:00:00 2001 From: rocky Date: Tue, 29 Jun 2021 21:08:51 -0400 Subject: [PATCH 2/7] Split off Character Codes --- mathics/builtin/string/charcodes.py | 272 ++++++++++++++++++++++++++++ mathics/builtin/strings.py | 252 -------------------------- setup.py | 1 + 3 files changed, 273 insertions(+), 252 deletions(-) create mode 100644 mathics/builtin/string/charcodes.py diff --git a/mathics/builtin/string/charcodes.py b/mathics/builtin/string/charcodes.py new file mode 100644 index 000000000..6010fa831 --- /dev/null +++ b/mathics/builtin/string/charcodes.py @@ -0,0 +1,272 @@ +# -*- coding: utf-8 -*- +""" +Character Codes +""" + +from mathics.version import __version__ # noqa used in loading to 
check consistency. + +from mathics.builtin.base import Builtin + +from mathics.core.expression import ( + Expression, + Integer, + Integer1, + String, + SymbolList, +) + +from mathics.builtin.strings import ( + _encodings, + to_python_encoding + ) + +def pack_bytes(codes): + return bytes(codes) + +def unpack_bytes(codes): + return [int(code) for code in codes] + +class ToCharacterCode(Builtin): + u""" +
+
'ToCharacterCode["$string$"]' +
converts the string to a list of character codes (Unicode + codepoints). +
'ToCharacterCode[{"$string1$", "$string2$", ...}]' +
converts a list of strings to character codes. +
+ + >> ToCharacterCode["abc"] + = {97, 98, 99} + >> FromCharacterCode[%] + = abc + + >> ToCharacterCode["\\[Alpha]\\[Beta]\\[Gamma]"] + = {945, 946, 947} + + >> ToCharacterCode["ä", "UTF8"] + = {195, 164} + + >> ToCharacterCode["ä", "ISO8859-1"] + = {228} + + >> ToCharacterCode[{"ab", "c"}] + = {{97, 98}, {99}} + + #> ToCharacterCode[{"ab"}] + = {{97, 98}} + + #> ToCharacterCode[{{"ab"}}] + : String or list of strings expected at position 1 in ToCharacterCode[{{ab}}]. + = ToCharacterCode[{{ab}}] + + >> ToCharacterCode[{"ab", x}] + : String or list of strings expected at position 1 in ToCharacterCode[{ab, x}]. + = ToCharacterCode[{ab, x}] + + >> ListPlot[ToCharacterCode["plot this string"], Filling -> Axis] + = -Graphics- + + #> ToCharacterCode[x] + : String or list of strings expected at position 1 in ToCharacterCode[x]. + = ToCharacterCode[x] + + #> ToCharacterCode[""] + = {} + """ + + messages = { + "strse": "String or list of strings expected at position `1` in `2`.", + } + + def _encode(self, string, encoding, evaluation): + exp = Expression("ToCharacterCode", string) + + if string.has_form("List", None): + string = [substring.get_string_value() for substring in string.leaves] + if any(substring is None for substring in string): + evaluation.message("ToCharacterCode", "strse", Integer1, exp) + return None + else: + string = string.get_string_value() + if string is None: + evaluation.message("ToCharacterCode", "strse", Integer1, exp) + return None + + if encoding == "Unicode": + + def convert(s): + return Expression(SymbolList, *[Integer(ord(code)) for code in s]) + + else: + py_encoding = to_python_encoding(encoding) + if py_encoding is None: + evaluation.message("General", "charcode", encoding) + return + + def convert(s): + return Expression( + "List", *[Integer(x) for x in unpack_bytes(s.encode(py_encoding))] + ) + + if isinstance(string, list): + return Expression(SymbolList, *[convert(substring) for substring in string]) + elif isinstance(string, str): + 
return convert(string) + + def apply_default(self, string, evaluation): + "ToCharacterCode[string_]" + return self._encode(string, "Unicode", evaluation) + + def apply(self, string, encoding, evaluation): + "ToCharacterCode[string_, encoding_String]" + return self._encode(string, encoding.get_string_value(), evaluation) + + +class _InvalidCodepointError(ValueError): + pass + + +class FromCharacterCode(Builtin): + """ +
+
'FromCharacterCode[$n$]' +
returns the character corresponding to Unicode codepoint $n$. +
'FromCharacterCode[{$n1$, $n2$, ...}]' +
returns a string with characters corresponding to $n_i$. +
'FromCharacterCode[{{$n11$, $n12$, ...}, {$n21$, $n22$, ...}, ...}]' +
returns a list of strings. +
+ + >> FromCharacterCode[100] + = d + + >> FromCharacterCode[228, "ISO8859-1"] + = ä + + >> FromCharacterCode[{100, 101, 102}] + = def + >> ToCharacterCode[%] + = {100, 101, 102} + + >> FromCharacterCode[{{97, 98, 99}, {100, 101, 102}}] + = {abc, def} + + >> ToCharacterCode["abc 123"] // FromCharacterCode + = abc 123 + + #> #1 == ToCharacterCode[FromCharacterCode[#1]] & [RandomInteger[{0, 65535}, 100]] + = True + + #> FromCharacterCode[{}] // InputForm + = "" + + #> FromCharacterCode[65536] + : A character code, which should be a non-negative integer less than 65536, is expected at position 1 in {65536}. + = FromCharacterCode[65536] + #> FromCharacterCode[-1] + : Non-negative machine-sized integer expected at position 1 in FromCharacterCode[-1]. + = FromCharacterCode[-1] + #> FromCharacterCode[444444444444444444444444444444444444] + : Non-negative machine-sized integer expected at position 1 in FromCharacterCode[444444444444444444444444444444444444]. + = FromCharacterCode[444444444444444444444444444444444444] + + #> FromCharacterCode[{100, 101, -1}] + : A character code, which should be a non-negative integer less than 65536, is expected at position 3 in {100, 101, -1}. + = FromCharacterCode[{100, 101, -1}] + #> FromCharacterCode[{100, 101, 65536}] + : A character code, which should be a non-negative integer less than 65536, is expected at position 3 in {100, 101, 65536}. + = FromCharacterCode[{100, 101, 65536}] + #> FromCharacterCode[{100, 101, x}] + : A character code, which should be a non-negative integer less than 65536, is expected at position 3 in {100, 101, x}. + = FromCharacterCode[{100, 101, x}] + #> FromCharacterCode[{100, {101}}] + : A character code, which should be a non-negative integer less than 65536, is expected at position 2 in {100, {101}}. 
+ = FromCharacterCode[{100, {101}}] + + #> FromCharacterCode[{{97, 98, 99}, {100, 101, x}}] + : A character code, which should be a non-negative integer less than 65536, is expected at position 3 in {100, 101, x}. + = FromCharacterCode[{{97, 98, 99}, {100, 101, x}}] + #> FromCharacterCode[{{97, 98, x}, {100, 101, x}}] + : A character code, which should be a non-negative integer less than 65536, is expected at position 3 in {97, 98, x}. + = FromCharacterCode[{{97, 98, x}, {100, 101, x}}] + """ + + messages = { + "notunicode": ( + "A character code, which should be a non-negative integer less " + "than 65536, is expected at position `2` in `1`." + ), + "intnm": ( + "Non-negative machine-sized integer expected at " "position `2` in `1`." + ), + "utf8": "The given codes could not be decoded as utf-8.", + } + + def _decode(self, n, encoding, evaluation): + exp = Expression("FromCharacterCode", n) + + py_encoding = to_python_encoding(encoding) + if py_encoding is None: + evaluation.message("General", "charcode", encoding) + return + + def convert_codepoint_list(l): + if encoding == "Unicode": + s = "" + for i, ni in enumerate(l): + pyni = ni.get_int_value() + if not (pyni is not None and 0 <= pyni <= 0xFFFF): + evaluation.message( + "FromCharacterCode", + "notunicode", + Expression(SymbolList, *l), + Integer(i + 1), + ) + raise _InvalidCodepointError + s += chr(pyni) + return s + else: + codes = [x.get_int_value() & 0xFF for x in l] + return pack_bytes(codes).decode(py_encoding) + + try: + if n.has_form("List", None): + if not n.get_leaves(): + return String("") + # Mathematica accepts FromCharacterCode[{{100}, 101}], + # so to match this, just check the first leaf to see + # if we're dealing with nested lists. 
+ elif n.get_leaves()[0].has_form("List", None): + list_of_strings = [] + for leaf in n.get_leaves(): + if leaf.has_form("List", None): + stringi = convert_codepoint_list(leaf.get_leaves()) + else: + stringi = convert_codepoint_list([leaf]) + list_of_strings.append(String(stringi)) + return Expression(SymbolList, *list_of_strings) + else: + return String(convert_codepoint_list(n.get_leaves())) + else: + pyn = n.get_int_value() + if not (isinstance(pyn, int) and pyn > 0 and pyn < sys.maxsize): + return evaluation.message( + "FromCharacterCode", "intnm", exp, Integer1 + ) + return String(convert_codepoint_list([n])) + except _InvalidCodepointError: + return + except UnicodeDecodeError: + evaluation.message(self.get_name(), "utf8") + return + + assert False, "can't get here" + + def apply_default(self, n, evaluation): + "FromCharacterCode[n_]" + return self._decode(n, "Unicode", evaluation) + + def apply(self, n, encoding, evaluation): + "FromCharacterCode[n_, encoding_String]" + return self._decode(n, encoding.get_string_value(), evaluation) diff --git a/mathics/builtin/strings.py b/mathics/builtin/strings.py index ff34b1a38..78a5ddfd2 100644 --- a/mathics/builtin/strings.py +++ b/mathics/builtin/strings.py @@ -291,12 +291,6 @@ def mathics_split(patt, string, flags): return [string[start:stop] for start, stop in indices] -def pack_bytes(codes): - return bytes(codes) - -def unpack_bytes(codes): - return [int(code) for code in codes] - class SystemCharacterEncoding(Predefined): """
@@ -1350,252 +1344,6 @@ def apply_empty(self, evaluation): return -class ToCharacterCode(Builtin): - """ -
-
'ToCharacterCode["$string$"]' -
converts the string to a list of character codes (Unicode - codepoints). -
'ToCharacterCode[{"$string1$", "$string2$", ...}]' -
converts a list of strings to character codes. -
- - >> ToCharacterCode["abc"] - = {97, 98, 99} - >> FromCharacterCode[%] - = abc - - >> ToCharacterCode["\\[Alpha]\\[Beta]\\[Gamma]"] - = {945, 946, 947} - - >> ToCharacterCode["ä", "UTF8"] - = {195, 164} - - >> ToCharacterCode["ä", "ISO8859-1"] - = {228} - - >> ToCharacterCode[{"ab", "c"}] - = {{97, 98}, {99}} - - #> ToCharacterCode[{"ab"}] - = {{97, 98}} - - #> ToCharacterCode[{{"ab"}}] - : String or list of strings expected at position 1 in ToCharacterCode[{{ab}}]. - = ToCharacterCode[{{ab}}] - - >> ToCharacterCode[{"ab", x}] - : String or list of strings expected at position 1 in ToCharacterCode[{ab, x}]. - = ToCharacterCode[{ab, x}] - - >> ListPlot[ToCharacterCode["plot this string"], Filling -> Axis] - = -Graphics- - - #> ToCharacterCode[x] - : String or list of strings expected at position 1 in ToCharacterCode[x]. - = ToCharacterCode[x] - - #> ToCharacterCode[""] - = {} - """ - - messages = { - "strse": "String or list of strings expected at position `1` in `2`.", - } - - def _encode(self, string, encoding, evaluation): - exp = Expression("ToCharacterCode", string) - - if string.has_form("List", None): - string = [substring.get_string_value() for substring in string.leaves] - if any(substring is None for substring in string): - evaluation.message("ToCharacterCode", "strse", Integer1, exp) - return None - else: - string = string.get_string_value() - if string is None: - evaluation.message("ToCharacterCode", "strse", Integer1, exp) - return None - - if encoding == "Unicode": - - def convert(s): - return Expression(SymbolList, *[Integer(ord(code)) for code in s]) - - else: - py_encoding = to_python_encoding(encoding) - if py_encoding is None: - evaluation.message("General", "charcode", encoding) - return - - def convert(s): - return Expression( - "List", *[Integer(x) for x in unpack_bytes(s.encode(py_encoding))] - ) - - if isinstance(string, list): - return Expression(SymbolList, *[convert(substring) for substring in string]) - elif isinstance(string, str): - 
return convert(string) - - def apply_default(self, string, evaluation): - "ToCharacterCode[string_]" - return self._encode(string, "Unicode", evaluation) - - def apply(self, string, encoding, evaluation): - "ToCharacterCode[string_, encoding_String]" - return self._encode(string, encoding.get_string_value(), evaluation) - - -class _InvalidCodepointError(ValueError): - pass - - -class FromCharacterCode(Builtin): - """ -
-
'FromCharacterCode[$n$]' -
returns the character corresponding to Unicode codepoint $n$. -
'FromCharacterCode[{$n1$, $n2$, ...}]' -
returns a string with characters corresponding to $n_i$. -
'FromCharacterCode[{{$n11$, $n12$, ...}, {$n21$, $n22$, ...}, ...}]' -
returns a list of strings. -
- - >> FromCharacterCode[100] - = d - - >> FromCharacterCode[228, "ISO8859-1"] - = ä - - >> FromCharacterCode[{100, 101, 102}] - = def - >> ToCharacterCode[%] - = {100, 101, 102} - - >> FromCharacterCode[{{97, 98, 99}, {100, 101, 102}}] - = {abc, def} - - >> ToCharacterCode["abc 123"] // FromCharacterCode - = abc 123 - - #> #1 == ToCharacterCode[FromCharacterCode[#1]] & [RandomInteger[{0, 65535}, 100]] - = True - - #> FromCharacterCode[{}] // InputForm - = "" - - #> FromCharacterCode[65536] - : A character code, which should be a non-negative integer less than 65536, is expected at position 1 in {65536}. - = FromCharacterCode[65536] - #> FromCharacterCode[-1] - : Non-negative machine-sized integer expected at position 1 in FromCharacterCode[-1]. - = FromCharacterCode[-1] - #> FromCharacterCode[444444444444444444444444444444444444] - : Non-negative machine-sized integer expected at position 1 in FromCharacterCode[444444444444444444444444444444444444]. - = FromCharacterCode[444444444444444444444444444444444444] - - #> FromCharacterCode[{100, 101, -1}] - : A character code, which should be a non-negative integer less than 65536, is expected at position 3 in {100, 101, -1}. - = FromCharacterCode[{100, 101, -1}] - #> FromCharacterCode[{100, 101, 65536}] - : A character code, which should be a non-negative integer less than 65536, is expected at position 3 in {100, 101, 65536}. - = FromCharacterCode[{100, 101, 65536}] - #> FromCharacterCode[{100, 101, x}] - : A character code, which should be a non-negative integer less than 65536, is expected at position 3 in {100, 101, x}. - = FromCharacterCode[{100, 101, x}] - #> FromCharacterCode[{100, {101}}] - : A character code, which should be a non-negative integer less than 65536, is expected at position 2 in {100, {101}}. 
- = FromCharacterCode[{100, {101}}] - - #> FromCharacterCode[{{97, 98, 99}, {100, 101, x}}] - : A character code, which should be a non-negative integer less than 65536, is expected at position 3 in {100, 101, x}. - = FromCharacterCode[{{97, 98, 99}, {100, 101, x}}] - #> FromCharacterCode[{{97, 98, x}, {100, 101, x}}] - : A character code, which should be a non-negative integer less than 65536, is expected at position 3 in {97, 98, x}. - = FromCharacterCode[{{97, 98, x}, {100, 101, x}}] - """ - - messages = { - "notunicode": ( - "A character code, which should be a non-negative integer less " - "than 65536, is expected at position `2` in `1`." - ), - "intnm": ( - "Non-negative machine-sized integer expected at " "position `2` in `1`." - ), - "utf8": "The given codes could not be decoded as utf-8.", - } - - def _decode(self, n, encoding, evaluation): - exp = Expression("FromCharacterCode", n) - - py_encoding = to_python_encoding(encoding) - if py_encoding is None: - evaluation.message("General", "charcode", encoding) - return - - def convert_codepoint_list(l): - if encoding == "Unicode": - s = "" - for i, ni in enumerate(l): - pyni = ni.get_int_value() - if not (pyni is not None and 0 <= pyni <= 0xFFFF): - evaluation.message( - "FromCharacterCode", - "notunicode", - Expression(SymbolList, *l), - Integer(i + 1), - ) - raise _InvalidCodepointError - s += chr(pyni) - return s - else: - codes = [x.get_int_value() & 0xFF for x in l] - return pack_bytes(codes).decode(py_encoding) - - try: - if n.has_form("List", None): - if not n.get_leaves(): - return String("") - # Mathematica accepts FromCharacterCode[{{100}, 101}], - # so to match this, just check the first leaf to see - # if we're dealing with nested lists. 
- elif n.get_leaves()[0].has_form("List", None): - list_of_strings = [] - for leaf in n.get_leaves(): - if leaf.has_form("List", None): - stringi = convert_codepoint_list(leaf.get_leaves()) - else: - stringi = convert_codepoint_list([leaf]) - list_of_strings.append(String(stringi)) - return Expression(SymbolList, *list_of_strings) - else: - return String(convert_codepoint_list(n.get_leaves())) - else: - pyn = n.get_int_value() - if not (isinstance(pyn, int) and pyn > 0 and pyn < sys.maxsize): - return evaluation.message( - "FromCharacterCode", "intnm", exp, Integer1 - ) - return String(convert_codepoint_list([n])) - except _InvalidCodepointError: - return - except UnicodeDecodeError: - evaluation.message(self.get_name(), "utf8") - return - - assert False, "can't get here" - - def apply_default(self, n, evaluation): - "FromCharacterCode[n_]" - return self._decode(n, "Unicode", evaluation) - - def apply(self, n, encoding, evaluation): - "FromCharacterCode[n_, encoding_String]" - return self._decode(n, encoding.get_string_value(), evaluation) - - class StringQ(Test): """
diff --git a/setup.py b/setup.py index 6c478c359..3e2a18923 100644 --- a/setup.py +++ b/setup.py @@ -141,6 +141,7 @@ def subdirs(root, file="*.*", depth=10): "mathics.builtin.pymimesniffer", "mathics.builtin.pympler", "mathics.builtin.specialfns", + "mathics.builtin.string", "mathics.doc", "mathics.format", ], From d96a56fdac26f8d4a9890d6b2e1b4c7e67f9228a Mon Sep 17 00:00:00 2001 From: rocky Date: Tue, 29 Jun 2021 21:43:44 -0400 Subject: [PATCH 3/7] Revise strings; add regexp --- mathics/builtin/string/operations.py | 288 ++++++++++++- mathics/builtin/string/patterns.py | 280 ++++++++++++- mathics/builtin/string/regexp.py | 32 ++ mathics/builtin/strings.py | 582 --------------------------- 4 files changed, 595 insertions(+), 587 deletions(-) create mode 100644 mathics/builtin/string/regexp.py diff --git a/mathics/builtin/string/operations.py b/mathics/builtin/string/operations.py index 9a68b2bd9..b07db6247 100644 --- a/mathics/builtin/string/operations.py +++ b/mathics/builtin/string/operations.py @@ -17,14 +17,15 @@ ) from mathics.core.expression import ( Expression, + Integer, + Integer1, + String, Symbol, SymbolFalse, - SymbolTrue, SymbolList, - String, - Integer, - Integer1, + SymbolTrue, from_python, + string_list, ) from mathics.builtin.lists import python_seq, convert_seq from mathics.builtin.strings import ( @@ -673,6 +674,239 @@ def apply(self, string, evaluation): return String(string.get_string_value()[::-1]) +class StringRiffle(Builtin): + """ +
+
'StringRiffle[{s1, s2, s3, ...}]' +
returns a new string by concatenating all the $si$, with spaces inserted between them. +
'StringRiffle[list, sep]' +
inserts the separator $sep$ between all elements in $list$. +
'StringRiffle[list, {"left", "sep", "right"}]' +
uses $left$ and $right$ as delimiters after concatenation. + + ## These 2 forms are not currently implemented + ##
'StringRiffle[{{s11, s12, ...}, {s21, s22, ...}, ...}]' + ##
returns a new string by concatenating the $sij$, and inserting spaces at the lowest level and newlines at the higher level. + ##
'StringRiffle[list, sep1, sep2, ...]' + ##
inserts separator $sepi$ between elements of list at level i. +
+ + >> StringRiffle[{"a", "b", "c", "d", "e"}] + = a b c d e + + #> StringRiffle[{a, b, c, "d", e, "f"}] + = a b c d e f + + ## 1st is not a list + #> StringRiffle["abcdef"] + : List expected at position 1 in StringRiffle[abcdef]. + : StringRiffle called with 1 argument; 2 or more arguments are expected. + = StringRiffle[abcdef] + + #> StringRiffle[{"", "", ""}] // FullForm + = " " + + ## This form is not supported + #> StringRiffle[{{"a", "b"}, {"c", "d"}}] + : Sublist form in position 1 is is not implemented yet. + = StringRiffle[{{a, b}, {c, d}}] + + >> StringRiffle[{"a", "b", "c", "d", "e"}, ", "] + = a, b, c, d, e + + #> StringRiffle[{"a", "b", "c", "d", "e"}, sep] + : String expected at position 2 in StringRiffle[{a, b, c, d, e}, sep]. + = StringRiffle[{a, b, c, d, e}, sep] + + >> StringRiffle[{"a", "b", "c", "d", "e"}, {"(", " ", ")"}] + = (a b c d e) + + #> StringRiffle[{"a", "b", "c", "d", "e"}, {" ", ")"}] + : String expected at position 2 in StringRiffle[{a, b, c, d, e}, { , )}]. + = StringRiffle[{a, b, c, d, e}, { , )}] + #> StringRiffle[{"a", "b", "c", "d", "e"}, {left, " ", "."}] + : String expected at position 2 in StringRiffle[{a, b, c, d, e}, {left, , .}]. + = StringRiffle[{a, b, c, d, e}, {left, , .}] + + ## This form is not supported + #> StringRiffle[{"a", "b", "c"}, "+", "-"] + ## Mathematica result: a+b+c, but we are not support multiple separators + : Multiple separators form is not implemented yet. 
+ = StringRiffle[{a, b, c}, +, -] + """ + + attributes = ("ReadProtected",) + + messages = { + "list": "List expected at position `1` in `2`.", + "argmu": "StringRiffle called with 1 argument; 2 or more arguments are expected.", + "argm": "StringRiffle called with 0 arguments; 2 or more arguments are expected.", + "string": "String expected at position `1` in `2`.", + "sublist": "Sublist form in position 1 is is not implemented yet.", + "mulsep": "Multiple separators form is not implemented yet.", + } + + def apply(self, liststr, seps, evaluation): + "StringRiffle[liststr_, seps___]" + separators = seps.get_sequence() + exp = ( + Expression("StringRiffle", liststr, seps) + if separators + else Expression("StringRiffle", liststr) + ) + + # Validate separators + if len(separators) > 1: + return evaluation.message("StringRiffle", "mulsep") + elif len(separators) == 1: + if separators[0].has_form("List", None): + if len(separators[0].leaves) != 3 or any( + not isinstance(s, String) for s in separators[0].leaves + ): + return evaluation.message("StringRiffle", "string", Integer(2), exp) + elif not isinstance(separators[0], String): + return evaluation.message("StringRiffle", "string", Integer(2), exp) + + # Validate list of string + if not liststr.has_form("List", None): + evaluation.message("StringRiffle", "list", Integer1, exp) + return evaluation.message("StringRiffle", "argmu", exp) + elif any(leaf.has_form("List", None) for leaf in liststr.leaves): + return evaluation.message("StringRiffle", "sublist") + + # Determine the separation token + left, right = "", "" + if len(separators) == 0: + sep = " " + else: + if separators[0].has_form("List", None): + left = separators[0].leaves[0].value + sep = separators[0].leaves[1].value + right = separators[0].leaves[2].value + else: + sep = separators[0].get_string_value() + + # Getting all together + result = left + for i in range(len(liststr.leaves)): + text = ( + liststr.leaves[i] + .format(evaluation, "System`OutputForm") 
+ .boxes_to_text(evaluation=evaluation) + ) + if i == len(liststr.leaves) - 1: + result += text + right + else: + result += text + sep + + return String(result) + + +class StringSplit(Builtin): + """ +
+
'StringSplit["$s$"]' +
splits the string $s$ at whitespace, discarding the + whitespace and returning a list of strings. +
'StringSplit["$s$", "$d$"]' +
splits $s$ at the delimiter $d$. +
'StringSplit[$s$, {"$d1$", "$d2$", ...}]' +
splits $s$ using multiple delimiters. +
'StringSplit[{$s_1$, $s_2$, ...}, {"$d1$", "$d2$", ...}]' +
returns a list with the result of applying the function to + each element. +
+ + >> StringSplit["abc,123", ","] + = {abc, 123} + + >> StringSplit["abc 123"] + = {abc, 123} + + #> StringSplit[" abc 123 "] + = {abc, 123} + + >> StringSplit["abc,123.456", {",", "."}] + = {abc, 123, 456} + + >> StringSplit["a b c", RegularExpression[" +"]] + = {a, b, c} + + >> StringSplit[{"a b", "c d"}, RegularExpression[" +"]] + = {{a, b}, {c, d}} + + #> StringSplit["x", "x"] + = {} + + #> StringSplit[x] + : String or list of strings expected at position 1 in StringSplit[x]. + = StringSplit[x, Whitespace] + + #> StringSplit["x", x] + : Element x is not a valid string or pattern element in x. + = StringSplit[x, x] + + #> StringSplit["12312123", "12"..] + = {3, 3} + + #> StringSplit["abaBa", "b"] + = {a, aBa} + #> StringSplit["abaBa", "b", IgnoreCase -> True] + = {a, a, a} + """ + + rules = { + "StringSplit[s_]": "StringSplit[s, Whitespace]", + } + + options = { + "IgnoreCase": "False", + "MetaCharacters": "None", + } + + messages = { + "strse": "String or list of strings expected at position `1` in `2`.", + "pysplit": "As of Python 3.5 re.split does not handle empty pattern matches.", + } + + def apply(self, string, patt, evaluation, options): + "StringSplit[string_, patt_, OptionsPattern[%(name)s]]" + + if string.get_head_name() == "System`List": + leaves = [self.apply(s, patt, evaluation, options) for s in string._leaves] + return Expression(SymbolList, *leaves) + + py_string = string.get_string_value() + + if py_string is None: + return evaluation.message( + "StringSplit", "strse", Integer1, Expression("StringSplit", string) + ) + + if patt.has_form("List", None): + patts = patt.get_leaves() + else: + patts = [patt] + re_patts = [] + for p in patts: + py_p = to_regex(p, evaluation) + if py_p is None: + return evaluation.message("StringExpression", "invld", p, patt) + re_patts.append(py_p) + + flags = re.MULTILINE + if options["System`IgnoreCase"] == SymbolTrue: + flags = flags | re.IGNORECASE + + result = [py_string] + for re_patt in re_patts: + result = [t 
for s in result for t in mathics_split(re_patt, s, flags=flags)] + + return string_list( + SymbolList, [String(x) for x in result if x != ""], evaluation + ) + + class StringTake(Builtin): """
@@ -779,3 +1013,49 @@ def apply_strings(self, strings, spec, evaluation): return None result_list.append(result) return Expression("List", *result_list) + +class StringTrim(Builtin): + """ +
+
'StringTrim[$s$]' +
returns a version of $s$ with whitespace removed from start and end. +
+ + >> StringJoin["a", StringTrim[" \\tb\\n "], "c"] + = abc + + >> StringTrim["ababaxababyaabab", RegularExpression["(ab)+"]] + = axababya + """ + + def apply(self, s, evaluation): + "StringTrim[s_String]" + return String(s.get_string_value().strip(" \t\n")) + + def apply_pattern(self, s, patt, expression, evaluation): + "StringTrim[s_String, patt_]" + text = s.get_string_value() + if not text: + return s + + py_patt = to_regex(patt, evaluation) + if py_patt is None: + return evaluation.message("StringExpression", "invld", patt, expression) + + if not py_patt.startswith(r"\A"): + left_patt = r"\A" + py_patt + else: + left_patt = py_patt + + if not py_patt.endswith(r"\Z"): + right_patt = py_patt + r"\Z" + else: + right_patt = py_patt + + m = re.search(left_patt, text) + left = m.end(0) if m else 0 + + m = re.search(right_patt, text) + right = m.start(0) if m else len(text) + + return String(text[left:right]) diff --git a/mathics/builtin/string/patterns.py b/mathics/builtin/string/patterns.py index 1b94e308c..10a1c0468 100644 --- a/mathics/builtin/string/patterns.py +++ b/mathics/builtin/string/patterns.py @@ -7,7 +7,11 @@ from mathics.version import __version__ # noqa used in loading to check consistency. -from mathics.builtin.base import Builtin +from mathics.builtin.base import ( + BinaryOperator, + Builtin +) + from mathics.core.expression import ( Expression, Integer1, @@ -17,10 +21,245 @@ from mathics.builtin.strings import ( + _StringFind, anchor_pattern, to_regex, ) +class DigitCharacter(Builtin): + """ +
+
'DigitCharacter' +
represents the digits 0-9. +
+ + >> StringMatchQ["1", DigitCharacter] + = True + >> StringMatchQ["a", DigitCharacter] + = False + >> StringMatchQ["12", DigitCharacter] + = False + + >> StringMatchQ["123245", DigitCharacter..] + = True + + #> StringMatchQ["123245a6", DigitCharacter..] + = False + """ + + +class LetterCharacter(Builtin): + """ +
+
'LetterCharacter' +
represents letters. +
+ + >> StringMatchQ[#, LetterCharacter] & /@ {"a", "1", "A", " ", "."} + = {True, False, True, False, False} + + LetterCharacter also matches unicode characters. + >> StringMatchQ["\\[Lambda]", LetterCharacter] + = True + """ + + +class StringCases(_StringFind): + """ +
+
'StringCases["$string$", $pattern$]' +
gives all occurrences of $pattern$ in $string$. +
'StringCases["$string$", $pattern$ -> $form$]' +
gives all instances of $form$ that stem from occurrences of $pattern$ in $string$. +
'StringCases["$string$", {$pattern1$, $pattern2$, ...}]' +
gives all occurrences of $pattern1$, $pattern2$, .... +
'StringCases["$string$", $pattern$, $n$]' +
gives only the first $n$ occurrences. +
'StringCases[{"$string1$", "$string2$", ...}, $pattern$]' +
gives occurrences in $string1$, $string2$, ... +
+ + >> StringCases["axbaxxb", "a" ~~ x_ ~~ "b"] + = {axb} + + >> StringCases["axbaxxb", "a" ~~ x__ ~~ "b"] + = {axbaxxb} + + >> StringCases["axbaxxb", Shortest["a" ~~ x__ ~~ "b"]] + = {axb, axxb} + + >> StringCases["-abc- def -uvw- xyz", Shortest["-" ~~ x__ ~~ "-"] -> x] + = {abc, uvw} + + >> StringCases["-öhi- -abc- -.-", "-" ~~ x : WordCharacter .. ~~ "-" -> x] + = {öhi, abc} + + >> StringCases["abc-abc xyz-uvw", Shortest[x : WordCharacter .. ~~ "-" ~~ x_] -> x] + = {abc} + + #> StringCases["abc-abc xyz-uvw", Shortest[x : WordCharacter .. ~~ "-" ~~ x : LetterCharacter] -> x] + : Ignored restriction given for x in x : LetterCharacter as it does not match previous occurences of x. + = {abc} + + >> StringCases["abba", {"a" -> 10, "b" -> 20}, 2] + = {10, 20} + + >> StringCases["a#ä_123", WordCharacter] + = {a, ä, 1, 2, 3} + + >> StringCases["a#ä_123", LetterCharacter] + = {a, ä} + """ + + rules = { + "StringCases[rule_][string_]": "StringCases[string, rule]", + } + + def _find(self, py_stri, py_rules, py_n, flags, evaluation): + def cases(): + for match, form in _parallel_match(py_stri, py_rules, flags, py_n): + if form is None: + yield String(match.group(0)) + else: + yield _evaluate_match(form, match, evaluation) + + return Expression(SymbolList, *list(cases())) + + def apply(self, string, rule, n, evaluation, options): + "%(name)s[string_, rule_, OptionsPattern[%(name)s], n_:System`Private`Null]" + # this pattern is a slight hack to get around missing Shortest/Longest. + return self._apply(string, rule, n, evaluation, options, True) + + +class StringExpression(BinaryOperator): + """ +
+
'StringExpression[s_1, s_2, ...]' +
represents a sequence of strings and symbolic string objects $s_i$. +
+ + >> "a" ~~ "b" // FullForm + = "ab" + + #> "a" ~~ "b" ~~ "c" // FullForm + = "abc" + + #> a ~~ b + = a ~~ b + """ + + operator = "~~" + precedence = 135 + attributes = ("Flat", "OneIdentity", "Protected") + + messages = { + "invld": "Element `1` is not a valid string or pattern element in `2`.", + "cond": "Ignored restriction given for `1` in `2` as it does not match previous occurences of `1`.", + } + + def apply(self, args, evaluation): + "StringExpression[args__String]" + args = args.get_sequence() + args = [arg.get_string_value() for arg in args] + if None in args: + return + return String("".join(args)) + +class StringFreeQ(Builtin): + """ +
+
'StringFreeQ["$string$", $patt$]' +
returns True if no substring in $string$ matches the string expression $patt$, and returns False otherwise. +
'StringFreeQ[{"s1", "s2", ...}, patt]' +
returns the list of results for each element of the string list. +
'StringFreeQ["string", {p1, p2, ...}]' +
returns True if no substring matches any of the $pi$. +
'StringFreeQ[patt]' +
represents an operator form of StringFreeQ that can be applied to an expression. +
+ + >> StringFreeQ["mathics", "m" ~~ __ ~~ "s"] + = False + + >> StringFreeQ["mathics", "a" ~~ __ ~~ "m"] + = True + + #> StringFreeQ["Hello", "o"] + = False + + #> StringFreeQ["a"]["abcd"] + = False + + #> StringFreeQ["Mathics", "ma", IgnoreCase -> False] + = True + + >> StringFreeQ["Mathics", "MA" , IgnoreCase -> True] + = False + + #> StringFreeQ["", "Empty String"] + = True + + #> StringFreeQ["", ___] + = False + + #> StringFreeQ["Empty Pattern", ""] + = False + + #> StringFreeQ[notastring, "n"] + : String or list of strings expected at position 1 in StringFreeQ[notastring, n]. + = StringFreeQ[notastring, n] + + #> StringFreeQ["Welcome", notapattern] + : Element notapattern is not a valid string or pattern element in notapattern. + = StringFreeQ[Welcome, notapattern] + + >> StringFreeQ[{"g", "a", "laxy", "universe", "sun"}, "u"] + = {True, True, True, False, False} + + #> StringFreeQ[{}, "list of string is empty"] + = {} + + >> StringFreeQ["e" ~~ ___ ~~ "u"] /@ {"The Sun", "Mercury", "Venus", "Earth", "Mars", "Jupiter", "Saturn", "Uranus", "Neptune"} + = {False, False, False, True, True, True, True, True, False} + + #> StringFreeQ[{"A", "Galaxy", "Far", "Far", "Away"}, {"F" ~~ __ ~~ "r", "aw" ~~ ___}] + = {True, True, False, False, True} + + >> StringFreeQ[{"A", "Galaxy", "Far", "Far", "Away"}, {"F" ~~ __ ~~ "r", "aw" ~~ ___}, IgnoreCase -> True] + = {True, True, False, False, False} + + #> StringFreeQ[{"A", "Galaxy", "Far", "Far", "Away"}, {}] + = {True, True, True, True, True} + + #> StringFreeQ[{"A", Galaxy, "Far", "Far", Away}, {"F" ~~ __ ~~ "r", "aw" ~~ ___}] + : String or list of strings expected at position 1 in StringFreeQ[{A, Galaxy, Far, Far, Away}, {F ~~ __ ~~ r, aw ~~ ___}]. + = StringFreeQ[{A, Galaxy, Far, Far, Away}, {F ~~ __ ~~ r, aw ~~ ___}] + + #> StringFreeQ[{"A", "Galaxy", "Far", "Far", "Away"}, {F ~~ __ ~~ "r", aw ~~ ___}] + : Element F ~~ __ ~~ r is not a valid string or pattern element in {F ~~ __ ~~ r, aw ~~ ___}. 
+ = StringFreeQ[{A, Galaxy, Far, Far, Away}, {F ~~ __ ~~ r, aw ~~ ___}] + ## Mathematica can detemine correct invalid element in the pattern, it reports error: + ## Element F is not a valid string or pattern element in {F ~~ __ ~~ r, aw ~~ ___}. + """ + + options = { + "IgnoreCase": "False", + } + + rules = { + "StringFreeQ[patt_][expr_]": "StringFreeQ[expr, patt]", + } + + messages = { + "strse": "String or list of strings expected at position `1` in `2`.", + } + + def apply(self, string, patt, evaluation, options): + "StringFreeQ[string_, patt_, OptionsPattern[%(name)s]]" + return _pattern_search( + self.__class__.__name__, string, patt, evaluation, options, False + ) + class StringMatchQ(Builtin): r""" >> StringMatchQ["abc", "abc"] @@ -117,3 +356,42 @@ def apply(self, string, patt, evaluation, options): return SymbolFalse else: return SymbolTrue + + +class WhitespaceCharacter(Builtin): + r""" +
+
'WhitespaceCharacter' +
represents a single whitespace character. +
+ + >> StringMatchQ["\n", WhitespaceCharacter] + = True + + >> StringSplit["a\nb\r\nc\rd", WhitespaceCharacter] + = {a, b, c, d} + + For sequences of whitespace characters use 'Whitespace': + >> StringMatchQ[" \n", WhitespaceCharacter] + = False + >> StringMatchQ[" \n", Whitespace] + = True + """ + + +class WordCharacter(Builtin): + r""" +
+
'WordCharacter' +
represents a single letter or digit character. +
+ + >> StringMatchQ[#, WordCharacter] &/@ {"1", "a", "A", ",", " "} + = {True, True, True, False, False} + + Test whether a string is alphanumeric: + >> StringMatchQ["abc123DEF", WordCharacter..] + = True + >> StringMatchQ["$b;123", WordCharacter..] + = False + """ diff --git a/mathics/builtin/string/regexp.py b/mathics/builtin/string/regexp.py new file mode 100644 index 000000000..dfb3d8e0d --- /dev/null +++ b/mathics/builtin/string/regexp.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +""" +Regular Expressions +""" + +from mathics.version import __version__ # noqa used in loading to check consistency. + +from mathics.builtin.base import Builtin + +# builtin.strings.to_regex seems to have the implementation. +class RegularExpression(Builtin): + r""" +
+
'RegularExpression["regex"]' +
represents the regex specified by the string $"regex"$. +
+ + >> StringSplit["1.23, 4.56 7.89", RegularExpression["(\\s|,)+"]] + = {1.23, 4.56, 7.89} + + #> RegularExpression["[abc]"] + = RegularExpression[[abc]] + + ## Mathematica doesn't seem to verify the correctness of regex + #> StringSplit["ab23c", RegularExpression["[0-9]++"]] + : Element RegularExpression[[0-9]++] is not a valid string or pattern element in RegularExpression[[0-9]++]. + = StringSplit[ab23c, RegularExpression[[0-9]++]] + + #> StringSplit["ab23c", RegularExpression[2]] + : Element RegularExpression[2] is not a valid string or pattern element in RegularExpression[2]. + = StringSplit[ab23c, RegularExpression[2]] + """ diff --git a/mathics/builtin/strings.py b/mathics/builtin/strings.py index 78a5ddfd2..227c6c963 100644 --- a/mathics/builtin/strings.py +++ b/mathics/builtin/strings.py @@ -5,7 +5,6 @@ import io import re -import sys import unicodedata from binascii import hexlify, unhexlify from heapq import heappush, heappop @@ -14,7 +13,6 @@ from mathics.version import __version__ # noqa used in loading to check consistency. from mathics.builtin.base import ( - BinaryOperator, Builtin, Test, Predefined, @@ -31,7 +29,6 @@ Integer, Integer0, Integer1, - string_list, ) from mathics.core.parser import MathicsFileLineFeeder, parse from mathics.settings import SYSTEM_CHARACTER_ENCODING @@ -381,65 +378,6 @@ class CharacterEncodings(Predefined): } -class StringExpression(BinaryOperator): - """ -
-
'StringExpression[s_1, s_2, ...]' -
represents a sequence of strings and symbolic string objects $s_i$. -
- - >> "a" ~~ "b" // FullForm - = "ab" - - #> "a" ~~ "b" ~~ "c" // FullForm - = "abc" - - #> a ~~ b - = a ~~ b - """ - - operator = "~~" - precedence = 135 - attributes = ("Flat", "OneIdentity", "Protected") - - messages = { - "invld": "Element `1` is not a valid string or pattern element in `2`.", - "cond": "Ignored restriction given for `1` in `2` as it does not match previous occurences of `1`.", - } - - def apply(self, args, evaluation): - "StringExpression[args__String]" - args = args.get_sequence() - args = [arg.get_string_value() for arg in args] - if None in args: - return - return String("".join(args)) - - -class RegularExpression(Builtin): - r""" -
-
'RegularExpression["regex"]' -
represents the regex specified by the string $"regex"$. -
- - >> StringSplit["1.23, 4.56 7.89", RegularExpression["(\\s|,)+"]] - = {1.23, 4.56, 7.89} - - #> RegularExpression["[abc]"] - = RegularExpression[[abc]] - - ## Mathematica doesn't seem to verify the correctness of regex - #> StringSplit["ab23c", RegularExpression["[0-9]++"]] - : Element RegularExpression[[0-9]++] is not a valid string or pattern element in RegularExpression[[0-9]++]. - = StringSplit[ab23c, RegularExpression[[0-9]++]] - - #> StringSplit["ab23c", RegularExpression[2]] - : Element RegularExpression[2] is not a valid string or pattern element in RegularExpression[2]. - = StringSplit[ab23c, RegularExpression[2]] - """ - - class NumberString(Builtin): """
@@ -458,28 +396,6 @@ class NumberString(Builtin): """ -class DigitCharacter(Builtin): - """ -
-
'DigitCharacter' -
represents the digits 0-9. -
- - >> StringMatchQ["1", DigitCharacter] - = True - >> StringMatchQ["a", DigitCharacter] - = False - >> StringMatchQ["12", DigitCharacter] - = False - - >> StringMatchQ["123245", DigitCharacter..] - = True - - #> StringMatchQ["123245a6", DigitCharacter..] - = False - """ - - class Whitespace(Builtin): r"""
@@ -498,45 +414,6 @@ class Whitespace(Builtin): """ -class WhitespaceCharacter(Builtin): - r""" -
-
'WhitespaceCharacter' -
represents a single whitespace character. -
- - >> StringMatchQ["\n", WhitespaceCharacter] - = True - - >> StringSplit["a\nb\r\nc\rd", WhitespaceCharacter] - = {a, b, c, d} - - For sequences of whitespace characters use 'Whitespace': - >> StringMatchQ[" \n", WhitespaceCharacter] - = False - >> StringMatchQ[" \n", Whitespace] - = True - """ - - -class WordCharacter(Builtin): - r""" -
-
'WordCharacter' -
represents a single letter or digit character. -
- - >> StringMatchQ[#, WordCharacter] &/@ {"1", "a", "A", ",", " "} - = {True, True, True, False, False} - - Test whether a string is alphanumeric: - >> StringMatchQ["abc123DEF", WordCharacter..] - = True - >> StringMatchQ["$b;123", WordCharacter..] - = False - """ - - class StartOfString(Builtin): r"""
@@ -623,22 +500,6 @@ class WordBoundary(Builtin): """ -class LetterCharacter(Builtin): - """ -
-
'LetterCharacter' -
represents letters. -
- - >> StringMatchQ[#, LetterCharacter] & /@ {"a", "1", "A", " ", "."} - = {True, False, True, False, False} - - LetterCharacter also matches unicode characters. - >> StringMatchQ["\\[Lambda]", LetterCharacter] - = True - """ - - # FIXME: Generalize string.lower() and ord() def letter_number(chars: List[str], start_ord) -> List["Integer"]: # Note caller has verified that everything isalpha() and @@ -810,111 +671,6 @@ class HexidecimalCharacter(Builtin): """ -class StringSplit(Builtin): - """ -
-
'StringSplit["$s$"]' -
splits the string $s$ at whitespace, discarding the - whitespace and returning a list of strings. -
'StringSplit["$s$", "$d$"]' -
splits $s$ at the delimiter $d$. -
'StringSplit[$s$, {"$d1$", "$d2$", ...}]' -
splits $s$ using multiple delimiters. -
'StringSplit[{$s_1$, $s_2, ...}, {"$d1$", "$d2$", ...}]' -
returns a list with the result of applying the function to - each element. -
- - >> StringSplit["abc,123", ","] - = {abc, 123} - - >> StringSplit["abc 123"] - = {abc, 123} - - #> StringSplit[" abc 123 "] - = {abc, 123} - - >> StringSplit["abc,123.456", {",", "."}] - = {abc, 123, 456} - - >> StringSplit["a b c", RegularExpression[" +"]] - = {a, b, c} - - >> StringSplit[{"a b", "c d"}, RegularExpression[" +"]] - = {{a, b}, {c, d}} - - #> StringSplit["x", "x"] - = {} - - #> StringSplit[x] - : String or list of strings expected at position 1 in StringSplit[x]. - = StringSplit[x, Whitespace] - - #> StringSplit["x", x] - : Element x is not a valid string or pattern element in x. - = StringSplit[x, x] - - #> StringSplit["12312123", "12"..] - = {3, 3} - - #> StringSplit["abaBa", "b"] - = {a, aBa} - #> StringSplit["abaBa", "b", IgnoreCase -> True] - = {a, a, a} - """ - - rules = { - "StringSplit[s_]": "StringSplit[s, Whitespace]", - } - - options = { - "IgnoreCase": "False", - "MetaCharacters": "None", - } - - messages = { - "strse": "String or list of strings expected at position `1` in `2`.", - "pysplit": "As of Python 3.5 re.split does not handle empty pattern matches.", - } - - def apply(self, string, patt, evaluation, options): - "StringSplit[string_, patt_, OptionsPattern[%(name)s]]" - - if string.get_head_name() == "System`List": - leaves = [self.apply(s, patt, evaluation, options) for s in string._leaves] - return Expression(SymbolList, *leaves) - - py_string = string.get_string_value() - - if py_string is None: - return evaluation.message( - "StringSplit", "strse", Integer1, Expression("StringSplit", string) - ) - - if patt.has_form("List", None): - patts = patt.get_leaves() - else: - patts = [patt] - re_patts = [] - for p in patts: - py_p = to_regex(p, evaluation) - if py_p is None: - return evaluation.message("StringExpression", "invld", p, patt) - re_patts.append(py_p) - - flags = re.MULTILINE - if options["System`IgnoreCase"] == SymbolTrue: - flags = flags | re.IGNORECASE - - result = [py_string] - for re_patt in re_patts: - result = [t 
for s in result for t in mathics_split(re_patt, s, flags=flags)] - - return string_list( - SymbolList, [String(x) for x in result if x != ""], evaluation - ) - - class _StringFind(Builtin): attributes = "Protected" @@ -1003,73 +759,6 @@ def convert_rule(r): return self._find(py_strings, py_rules, py_n, flags, evaluation) -class StringCases(_StringFind): - """ -
-
'StringCases["$string$", $pattern$]' -
gives all occurences of $pattern$ in $string$. -
'StringReplace["$string$", $pattern$ -> $form$]' -
gives all instances of $form$ that stem from occurences of $pattern$ in $string$. -
'StringCases["$string$", {$pattern1$, $pattern2$, ...}]' -
gives all occurences of $pattern1$, $pattern2$, .... -
'StringReplace["$string$", $pattern$, $n$]' -
gives only the first $n$ occurences. -
'StringReplace[{"$string1$", "$string2$", ...}, $pattern$]' -
gives occurences in $string1$, $string2$, ... -
- - >> StringCases["axbaxxb", "a" ~~ x_ ~~ "b"] - = {axb} - - >> StringCases["axbaxxb", "a" ~~ x__ ~~ "b"] - = {axbaxxb} - - >> StringCases["axbaxxb", Shortest["a" ~~ x__ ~~ "b"]] - = {axb, axxb} - - >> StringCases["-abc- def -uvw- xyz", Shortest["-" ~~ x__ ~~ "-"] -> x] - = {abc, uvw} - - >> StringCases["-öhi- -abc- -.-", "-" ~~ x : WordCharacter .. ~~ "-" -> x] - = {öhi, abc} - - >> StringCases["abc-abc xyz-uvw", Shortest[x : WordCharacter .. ~~ "-" ~~ x_] -> x] - = {abc} - - #> StringCases["abc-abc xyz-uvw", Shortest[x : WordCharacter .. ~~ "-" ~~ x : LetterCharacter] -> x] - : Ignored restriction given for x in x : LetterCharacter as it does not match previous occurences of x. - = {abc} - - >> StringCases["abba", {"a" -> 10, "b" -> 20}, 2] - = {10, 20} - - >> StringCases["a#ä_123", WordCharacter] - = {a, ä, 1, 2, 3} - - >> StringCases["a#ä_123", LetterCharacter] - = {a, ä} - """ - - rules = { - "StringCases[rule_][string_]": "StringCases[string, rule]", - } - - def _find(self, py_stri, py_rules, py_n, flags, evaluation): - def cases(): - for match, form in _parallel_match(py_stri, py_rules, flags, py_n): - if form is None: - yield String(match.group(0)) - else: - yield _evaluate_match(form, match, evaluation) - - return Expression(SymbolList, *list(cases())) - - def apply(self, string, rule, n, evaluation, options): - "%(name)s[string_, rule_, OptionsPattern[%(name)s], n_:System`Private`Null]" - # this pattern is a slight hack to get around missing Shortest/Longest. - return self._apply(string, rule, n, evaluation, options, True) - - class StringRepeat(Builtin): """
@@ -1670,53 +1359,6 @@ def apply(self, s, evaluation): return String(unidecode(s.get_string_value())) -class StringTrim(Builtin): - """ -
-
'StringTrim[$s$]' -
returns a version of $s$ with whitespace removed from start and end. -
- - >> StringJoin["a", StringTrim[" \\tb\\n "], "c"] - = abc - - >> StringTrim["ababaxababyaabab", RegularExpression["(ab)+"]] - = axababya - """ - - def apply(self, s, evaluation): - "StringTrim[s_String]" - return String(s.get_string_value().strip(" \t\n")) - - def apply_pattern(self, s, patt, expression, evaluation): - "StringTrim[s_String, patt_]" - text = s.get_string_value() - if not text: - return s - - py_patt = to_regex(patt, evaluation) - if py_patt is None: - return evaluation.message("StringExpression", "invld", patt, expression) - - if not py_patt.startswith(r"\A"): - left_patt = r"\A" + py_patt - else: - left_patt = py_patt - - if not py_patt.endswith(r"\Z"): - right_patt = py_patt + r"\Z" - else: - right_patt = py_patt - - m = re.search(left_patt, text) - left = m.end(0) if m else 0 - - m = re.search(right_patt, text) - right = m.start(0) if m else len(text) - - return String(text[left:right]) - - def _pattern_search(name, string, patt, evaluation, options, matched): # Get the pattern list and check validity for each if patt.has_form("List", None): @@ -1851,227 +1493,3 @@ def apply(self, string, patt, evaluation, options): return _pattern_search( self.__class__.__name__, string, patt, evaluation, options, True ) - - -class StringFreeQ(Builtin): - """ -
-
'StringFreeQ["$string$", $patt$]' -
returns True if no substring in $string$ matches the string expression $patt$, and returns False otherwise. -
'StringFreeQ[{"s1", "s2", ...}, patt]' -
returns the list of results for each element of string list. -
'StringFreeQ["string", {p1, p2, ...}]' -
returns True if no substring matches any of the $pi$. -
'StringFreeQ[patt]' -
represents an operator form of StringFreeQ that can be applied to an expression. -
- - >> StringFreeQ["mathics", "m" ~~ __ ~~ "s"] - = False - - >> StringFreeQ["mathics", "a" ~~ __ ~~ "m"] - = True - - #> StringFreeQ["Hello", "o"] - = False - - #> StringFreeQ["a"]["abcd"] - = False - - #> StringFreeQ["Mathics", "ma", IgnoreCase -> False] - = True - - >> StringFreeQ["Mathics", "MA" , IgnoreCase -> True] - = False - - #> StringFreeQ["", "Empty String"] - = True - - #> StringFreeQ["", ___] - = False - - #> StringFreeQ["Empty Pattern", ""] - = False - - #> StringFreeQ[notastring, "n"] - : String or list of strings expected at position 1 in StringFreeQ[notastring, n]. - = StringFreeQ[notastring, n] - - #> StringFreeQ["Welcome", notapattern] - : Element notapattern is not a valid string or pattern element in notapattern. - = StringFreeQ[Welcome, notapattern] - - >> StringFreeQ[{"g", "a", "laxy", "universe", "sun"}, "u"] - = {True, True, True, False, False} - - #> StringFreeQ[{}, "list of string is empty"] - = {} - - >> StringFreeQ["e" ~~ ___ ~~ "u"] /@ {"The Sun", "Mercury", "Venus", "Earth", "Mars", "Jupiter", "Saturn", "Uranus", "Neptune"} - = {False, False, False, True, True, True, True, True, False} - - #> StringFreeQ[{"A", "Galaxy", "Far", "Far", "Away"}, {"F" ~~ __ ~~ "r", "aw" ~~ ___}] - = {True, True, False, False, True} - - >> StringFreeQ[{"A", "Galaxy", "Far", "Far", "Away"}, {"F" ~~ __ ~~ "r", "aw" ~~ ___}, IgnoreCase -> True] - = {True, True, False, False, False} - - #> StringFreeQ[{"A", "Galaxy", "Far", "Far", "Away"}, {}] - = {True, True, True, True, True} - - #> StringFreeQ[{"A", Galaxy, "Far", "Far", Away}, {"F" ~~ __ ~~ "r", "aw" ~~ ___}] - : String or list of strings expected at position 1 in StringFreeQ[{A, Galaxy, Far, Far, Away}, {F ~~ __ ~~ r, aw ~~ ___}]. - = StringFreeQ[{A, Galaxy, Far, Far, Away}, {F ~~ __ ~~ r, aw ~~ ___}] - - #> StringFreeQ[{"A", "Galaxy", "Far", "Far", "Away"}, {F ~~ __ ~~ "r", aw ~~ ___}] - : Element F ~~ __ ~~ r is not a valid string or pattern element in {F ~~ __ ~~ r, aw ~~ ___}. 
- = StringFreeQ[{A, Galaxy, Far, Far, Away}, {F ~~ __ ~~ r, aw ~~ ___}] - ## Mathematica can detemine correct invalid element in the pattern, it reports error: - ## Element F is not a valid string or pattern element in {F ~~ __ ~~ r, aw ~~ ___}. - """ - - options = { - "IgnoreCase": "False", - } - - rules = { - "StringFreeQ[patt_][expr_]": "StringFreeQ[expr, patt]", - } - - messages = { - "strse": "String or list of strings expected at position `1` in `2`.", - } - - def apply(self, string, patt, evaluation, options): - "StringFreeQ[string_, patt_, OptionsPattern[%(name)s]]" - return _pattern_search( - self.__class__.__name__, string, patt, evaluation, options, False - ) - - -class StringRiffle(Builtin): - """ -
-
'StringRiffle[{s1, s2, s3, ...}]' -
returns a new string by concatenating all the $si$, with spaces inserted between them. -
'StringRiffle[list, sep]' -
inserts the separator $sep$ between all elements in $list$. -
'StringRiffle[list, {"left", "sep", "right"}]' -
use $left$ and $right$ as delimiters after concatenation. - - ## These 2 forms are not currently implemented - ##
'StringRiffle[{{s11, s12, ...}, {s21, s22, ...}, ...}]' - ##
returns a new string by concatenating the $sij$, and inserting spaces at the lowest level and newlines at the higher level. - ##
'StringRiffle[list, sep1, sep2, ...]' - ##
inserts separator $sepi$ between elements of list at level i. -
- - >> StringRiffle[{"a", "b", "c", "d", "e"}] - = a b c d e - - #> StringRiffle[{a, b, c, "d", e, "f"}] - = a b c d e f - - ## 1st is not a list - #> StringRiffle["abcdef"] - : List expected at position 1 in StringRiffle[abcdef]. - : StringRiffle called with 1 argument; 2 or more arguments are expected. - = StringRiffle[abcdef] - - #> StringRiffle[{"", "", ""}] // FullForm - = " " - - ## This form is not supported - #> StringRiffle[{{"a", "b"}, {"c", "d"}}] - : Sublist form in position 1 is is not implemented yet. - = StringRiffle[{{a, b}, {c, d}}] - - >> StringRiffle[{"a", "b", "c", "d", "e"}, ", "] - = a, b, c, d, e - - #> StringRiffle[{"a", "b", "c", "d", "e"}, sep] - : String expected at position 2 in StringRiffle[{a, b, c, d, e}, sep]. - = StringRiffle[{a, b, c, d, e}, sep] - - >> StringRiffle[{"a", "b", "c", "d", "e"}, {"(", " ", ")"}] - = (a b c d e) - - #> StringRiffle[{"a", "b", "c", "d", "e"}, {" ", ")"}] - : String expected at position 2 in StringRiffle[{a, b, c, d, e}, { , )}]. - = StringRiffle[{a, b, c, d, e}, { , )}] - #> StringRiffle[{"a", "b", "c", "d", "e"}, {left, " ", "."}] - : String expected at position 2 in StringRiffle[{a, b, c, d, e}, {left, , .}]. - = StringRiffle[{a, b, c, d, e}, {left, , .}] - - ## This form is not supported - #> StringRiffle[{"a", "b", "c"}, "+", "-"] - ## Mathematica result: a+b+c, but we are not support multiple separators - : Multiple separators form is not implemented yet. 
- = StringRiffle[{a, b, c}, +, -] - """ - - attributes = ("ReadProtected",) - - messages = { - "list": "List expected at position `1` in `2`.", - "argmu": "StringRiffle called with 1 argument; 2 or more arguments are expected.", - "argm": "StringRiffle called with 0 arguments; 2 or more arguments are expected.", - "string": "String expected at position `1` in `2`.", - "sublist": "Sublist form in position 1 is is not implemented yet.", - "mulsep": "Multiple separators form is not implemented yet.", - } - - def apply(self, liststr, seps, evaluation): - "StringRiffle[liststr_, seps___]" - separators = seps.get_sequence() - exp = ( - Expression("StringRiffle", liststr, seps) - if separators - else Expression("StringRiffle", liststr) - ) - - # Validate separators - if len(separators) > 1: - return evaluation.message("StringRiffle", "mulsep") - elif len(separators) == 1: - if separators[0].has_form("List", None): - if len(separators[0].leaves) != 3 or any( - not isinstance(s, String) for s in separators[0].leaves - ): - return evaluation.message("StringRiffle", "string", Integer(2), exp) - elif not isinstance(separators[0], String): - return evaluation.message("StringRiffle", "string", Integer(2), exp) - - # Validate list of string - if not liststr.has_form("List", None): - evaluation.message("StringRiffle", "list", Integer1, exp) - return evaluation.message("StringRiffle", "argmu", exp) - elif any(leaf.has_form("List", None) for leaf in liststr.leaves): - return evaluation.message("StringRiffle", "sublist") - - # Determine the separation token - left, right = "", "" - if len(separators) == 0: - sep = " " - else: - if separators[0].has_form("List", None): - left = separators[0].leaves[0].value - sep = separators[0].leaves[1].value - right = separators[0].leaves[2].value - else: - sep = separators[0].get_string_value() - - # Getting all together - result = left - for i in range(len(liststr.leaves)): - text = ( - liststr.leaves[i] - .format(evaluation, "System`OutputForm") 
- .boxes_to_text(evaluation=evaluation) - ) - if i == len(liststr.leaves) - 1: - result += text + right - else: - result += text + sep - - return String(result) From 3b76785e96217d0865d1db6d743a720931e27279 Mon Sep 17 00:00:00 2001 From: autoblack Date: Wed, 30 Jun 2021 01:45:08 +0000 Subject: [PATCH 4/7] fixup: Format Python code with Black --- mathics/builtin/__init__.py | 10 +++++++++- mathics/builtin/string/charcodes.py | 8 ++++---- mathics/builtin/string/operations.py | 1 + mathics/builtin/string/patterns.py | 10 +++++----- 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/mathics/builtin/__init__.py b/mathics/builtin/__init__.py index dd271b8f5..bc1db2e7f 100755 --- a/mathics/builtin/__init__.py +++ b/mathics/builtin/__init__.py @@ -153,7 +153,15 @@ def is_builtin(var): [] if ENABLE_FILES_MODULE else ["files_io.files", "files_io.importexport"] ) -for subdir in ("colors", "drawing", "files_io", "numbers", "specialfns", "string", "fileformats"): +for subdir in ( + "colors", + "drawing", + "files_io", + "numbers", + "specialfns", + "string", + "fileformats", +): import_name = f"{__name__}.{subdir}" if import_name in disable_file_module_names: diff --git a/mathics/builtin/string/charcodes.py b/mathics/builtin/string/charcodes.py index 6010fa831..be6d1ec49 100644 --- a/mathics/builtin/string/charcodes.py +++ b/mathics/builtin/string/charcodes.py @@ -15,17 +15,17 @@ SymbolList, ) -from mathics.builtin.strings import ( - _encodings, - to_python_encoding - ) +from mathics.builtin.strings import _encodings, to_python_encoding + def pack_bytes(codes): return bytes(codes) + def unpack_bytes(codes): return [int(code) for code in codes] + class ToCharacterCode(Builtin): u"""
diff --git a/mathics/builtin/string/operations.py b/mathics/builtin/string/operations.py index b07db6247..e67499c80 100644 --- a/mathics/builtin/string/operations.py +++ b/mathics/builtin/string/operations.py @@ -1014,6 +1014,7 @@ def apply_strings(self, strings, spec, evaluation): result_list.append(result) return Expression("List", *result_list) + class StringTrim(Builtin): """
diff --git a/mathics/builtin/string/patterns.py b/mathics/builtin/string/patterns.py index 10a1c0468..7a96342e8 100644 --- a/mathics/builtin/string/patterns.py +++ b/mathics/builtin/string/patterns.py @@ -7,17 +7,14 @@ from mathics.version import __version__ # noqa used in loading to check consistency. -from mathics.builtin.base import ( - BinaryOperator, - Builtin -) +from mathics.builtin.base import BinaryOperator, Builtin from mathics.core.expression import ( Expression, Integer1, SymbolFalse, SymbolTrue, - ) +) from mathics.builtin.strings import ( @@ -26,6 +23,7 @@ to_regex, ) + class DigitCharacter(Builtin): """
@@ -165,6 +163,7 @@ def apply(self, args, evaluation): return return String("".join(args)) + class StringFreeQ(Builtin): """
@@ -260,6 +259,7 @@ def apply(self, string, patt, evaluation, options): self.__class__.__name__, string, patt, evaluation, options, False ) + class StringMatchQ(Builtin): r""" >> StringMatchQ["abc", "abc"] From 2cc98251481f3f194934e88dd810e33640b7989f Mon Sep 17 00:00:00 2001 From: rocky Date: Tue, 29 Jun 2021 22:29:12 -0400 Subject: [PATCH 5/7] Add distance and similarity --- mathics/builtin/__init__.py | 13 +- mathics/builtin/colors/color_directives.py | 31 ++- mathics/builtin/distance/stringdata.py | 270 +++++++++++++++++++++ mathics/builtin/strings.py | 257 +------------------- 4 files changed, 301 insertions(+), 270 deletions(-) create mode 100644 mathics/builtin/distance/stringdata.py diff --git a/mathics/builtin/__init__.py b/mathics/builtin/__init__.py index dd271b8f5..6305c8fad 100755 --- a/mathics/builtin/__init__.py +++ b/mathics/builtin/__init__.py @@ -40,7 +40,7 @@ def add_builtins(new_builtins): if isinstance(builtin, SympyObject): mathics_to_sympy[name] = builtin for sympy_name in builtin.get_sympy_names(): - ### print("XXX1", sympy_name) + # print("XXX1", sympy_name) sympy_to_mathics[sympy_name] = builtin if isinstance(builtin, Operator): builtins_precedence[name] = builtin.precedence @@ -153,7 +153,16 @@ def is_builtin(var): [] if ENABLE_FILES_MODULE else ["files_io.files", "files_io.importexport"] ) -for subdir in ("colors", "drawing", "files_io", "numbers", "specialfns", "string", "fileformats"): +for subdir in ( + "colors", + "distance", + "drawing", + "files_io", + "numbers", + "specialfns", + "string", + "fileformats", +): import_name = f"{__name__}.{subdir}" if import_name in disable_file_module_names: diff --git a/mathics/builtin/colors/color_directives.py b/mathics/builtin/colors/color_directives.py index 5c475a312..a24a66e4a 100644 --- a/mathics/builtin/colors/color_directives.py +++ b/mathics/builtin/colors/color_directives.py @@ -1,5 +1,7 @@ """ Color Directives + +There are many different ways to specify color; we support all of the
color formats below and will convert between the different color formats. """ from math import atan2, cos, exp, pi, radians, sin, sqrt @@ -225,24 +227,27 @@ class CMYKColor(_Color): class ColorDistance(Builtin): """
-
'ColorDistance[$c1$, $c2$]' -
returns a measure of color distance between the colors $c1$ and $c2$. -
'ColorDistance[$list$, $c2$]' -
returns a list of color distances between the colors in $list$ and $c2$. +
'ColorDistance[$c1$, $c2$]' +
returns a measure of color distance between the colors $c1$ and $c2$. + +
'ColorDistance[$list$, $c2$]' +
returns a list of color distances between the colors in $list$ and $c2$.
The option DistanceFunction specifies the method used to measure the color distance. Available options are: - CIE76: euclidean distance in the LABColor space - CIE94: euclidean distance in the LCHColor space - CIE2000 or CIEDE2000: CIE94 distance with corrections - CMC: Colour Measurement Committee metric (1984) - DeltaL: difference in the L component of LCHColor - DeltaC: difference in the C component of LCHColor - DeltaH: difference in the H component of LCHColor +
    +
  • CIE76: Euclidean distance in the LABColor space +
  • CIE94: Euclidean distance in the LCHColor space +
  • CIE2000 or CIEDE2000: CIE94 distance with corrections +
  • CMC: Color Measurement Committee metric (1984) +
  • DeltaL: difference in the L component of LCHColor +
  • DeltaC: difference in the C component of LCHColor +
  • DeltaH: difference in the H component of LCHColor +
 - It is also possible to specify a custom distance + It is also possible to specify a custom distance. >> ColorDistance[Magenta, Green] = 2.2507 @@ -374,7 +379,7 @@ def compute(a, b): ), ) - if compute == None: + if compute is None: evaluation.message("ColorDistance", "invdist", distance_function) return diff --git a/mathics/builtin/distance/stringdata.py b/mathics/builtin/distance/stringdata.py new file mode 100644 index 000000000..b4ac85ee7 --- /dev/null +++ b/mathics/builtin/distance/stringdata.py @@ -0,0 +1,270 @@ +# -*- coding: utf-8 -*- +""" +String Distances and Similarity Measures +""" + +import unicodedata + +from typing import Callable + +from mathics.version import __version__ # noqa used in loading to check consistency. + +from mathics.builtin.base import Builtin + +from mathics.core.expression import ( + Expression, + Integer, + String, + SymbolTrue, +) + + +# Levenshtein's algorithm is defined by the following construction: +# (adapted from https://de.wikipedia.org/wiki/Levenshtein-Distanz) +# +# given two strings s1, s2, we build a matrix D sized (len(s1) + 1, +# len(s2) + 1) and fill it using the following rules: +# +# (1) D(0, 0) = 0 +# (2) D(i, 0) = i, 1 <= i <= len(s1) +# (3) D(0, j) = j, 1 <= j <= len(s2) +# (4) D(i, j) = minimum of +# D(i - 1, j - 1) + 0 if s1(i) = s2(j) +# D(i - 1, j - 1) + 1 (substitution) +# D(i, j - 1) + 1 (insertion) +# D(i - 1, j) + 1 (deletion) +# +# The computed distance will be in D(len(s1), len(s2)). +# +# note: double brackets indicate 1-based indices below, e.g. s1[[1]] + +def _one_based(l): # makes an enumerated generator 1-based + return ((i + 1, x) for i, x in l) + + +def _prev_curr(l): # yields pairs of (x[i - 1], x[i]) for i in 1, 2, ... + prev = None + for curr in l: + yield prev, curr + prev = curr + + +def _levenshtein_d0(s2): # compute D(0, ...) 
+ return list(range(len(s2) + 1)) # see (1), (3) + + +def _levenshtein_di(c1, s2, i, d_prev, sameQ, cost): # compute one new row + # given c1 = s1[i], s2, i, d_prev = D(i - 1, ...), compute D(i, ...) + + yield i # start with D(i, 0) = i, see (2) + d_curr_prev_j = i # d_curr_prev_j stores D(i, j - 1) + + for j, c2 in _one_based(enumerate(s2)): # c2 = s2[[j]] + cond = 0 if sameQ(c1, c2) else cost + + d_curr_j = min( # see (4) + d_prev[j - 1] + cond, # D(i - 1, j - 1) + cond; substitution + d_curr_prev_j + 1, # D(i, j - 1) + 1; insertion + d_prev[j] + 1, + ) # D(i - 1, j) + 1; deletion + + yield d_curr_j + d_curr_prev_j = d_curr_j + + +def _levenshtein(s1, s2, sameQ: Callable[..., bool]): + d_prev = _levenshtein_d0(s2) + for i, c1 in _one_based(enumerate(s1)): # c1 = s1[[i]] + d_prev = list(_levenshtein_di(c1, s2, i, d_prev, sameQ, 1)) + return d_prev[-1] + + +def _damerau_levenshtein(s1, s2, sameQ: Callable[..., bool]): + # _damerau_levenshtein works like _levenshtein, except for one additional + # rule covering transposition: + # + # if i > 1 and j > 1 and a[i] == b[j - 1] and a[i - 1] == b[j] then + # D(i, j) = minimum(D(i, j), D(i - 2, j - 2) + transposition_cost) + + def row(d_prev_prev, d_prev, i, prev_c1, c1, cost): + # given c1 = s1[i], d_prev_prev = D(i - 2), d_prev = D(i - 1), + # prev_c1 = s1[[i - 1]], c1 = s1[[i]], compute D(i, ...) + for j, d_curr_j in enumerate(_levenshtein_di(c1, s2, i, d_prev, sameQ, cost)): + if i > 1 and j > 1: + if sameQ(c1, s2[j - 2]) and sameQ(prev_c1, s2[j - 1]): # transposition? + # i.e. 
if s1[[i]] = s2[[j-1]] and s1[[i-1]] = s2[[j]] + d_curr_j = min(d_curr_j, d_prev_prev[j - 2] + cost) + yield d_curr_j + + d_prev_prev = None + d_prev = _levenshtein_d0(s2) + for i, (prev_c1, c1) in _one_based(enumerate(_prev_curr(s1))): + d_curr = list(row(d_prev_prev, d_prev, i, prev_c1, c1, 1)) + d_prev_prev = d_prev + d_prev = d_curr + + return d_prev[-1] + + +def _levenshtein_like_or_border_cases(s1, s2, sameQ: Callable[..., bool], compute): + if len(s1) == len(s2) and all(sameQ(c1, c2) for c1, c2 in zip(s1, s2)): + return 0 + + if len(s1) < len(s2): + s1, s2 = s2, s1 + + if len(s2) == 0: + return len(s1) + + return compute(s1, s2, sameQ) + + +class _StringDistance(Builtin): + options = {"IgnoreCase": "False"} + + def apply(self, a, b, evaluation, options): + "%(name)s[a_, b_, OptionsPattern[%(name)s]]" + if isinstance(a, String) and isinstance(b, String): + py_a = a.get_string_value() + py_b = b.get_string_value() + if options["System`IgnoreCase"] == SymbolTrue: + if hasattr(str, "casefold"): + + def normalize(c): + return unicodedata.normalize("NFKD", c.casefold()) + + py_a = [normalize(c) for c in py_a] + py_b = [normalize(c) for c in py_b] + else: # python2, PyPy + py_a = py_a.lower() + py_b = py_b.lower() + return Integer(self._distance(py_a, py_b, lambda u, v: u == v)) + elif a.get_head_name() == "System`List" and b.get_head_name() == "System`List": + return Integer(self._distance(a.leaves, b.leaves, lambda u, v: u.sameQ(v))) + else: + return Expression("EditDistance", a, b) + + +class DamerauLevenshteinDistance(_StringDistance): + """ +
+
'DamerauLevenshteinDistance[$a$, $b$]' +
returns the Damerau-Levenshtein distance of $a$ and $b$, which is defined as the minimum number of + transpositions, insertions, deletions and substitutions needed to transform one into the other. + In contrast to EditDistance, DamerauLevenshteinDistance counts transposition of adjacent items (e.g. + "ab" into "ba") as one operation of change. +
+ + >> DamerauLevenshteinDistance["kitten", "kitchen"] + = 2 + + >> DamerauLevenshteinDistance["abc", "ac"] + = 1 + + >> DamerauLevenshteinDistance["abc", "acb"] + = 1 + + >> DamerauLevenshteinDistance["azbc", "abxyc"] + = 3 + + The IgnoreCase option makes DamerauLevenshteinDistance ignore the case of letters: + >> DamerauLevenshteinDistance["time", "Thyme"] + = 3 + + >> DamerauLevenshteinDistance["time", "Thyme", IgnoreCase -> True] + = 2 + + DamerauLevenshteinDistance also works on lists: + >> DamerauLevenshteinDistance[{1, E, 2, Pi}, {1, E, Pi, 2}] + = 1 + """ + + def _distance(self, s1, s2, sameQ: Callable[..., bool]): + return _levenshtein_like_or_border_cases(s1, s2, sameQ, _damerau_levenshtein) + +class EditDistance(_StringDistance): + """ +
+
'EditDistance[$a$, $b$]' +
returns the Levenshtein distance of $a$ and $b$, which is defined as the minimum number of + insertions, deletions and substitutions on the constituents of $a$ and $b$ needed to transform + one into the other. +
+ + >> EditDistance["kitten", "kitchen"] + = 2 + + >> EditDistance["abc", "ac"] + = 1 + + >> EditDistance["abc", "acb"] + = 2 + + >> EditDistance["azbc", "abxyc"] + = 3 + + The IgnoreCase option makes EditDistance ignore the case of letters: + >> EditDistance["time", "Thyme"] + = 3 + + >> EditDistance["time", "Thyme", IgnoreCase -> True] + = 2 + + EditDistance also works on lists: + >> EditDistance[{1, E, 2, Pi}, {1, E, Pi, 2}] + = 2 + """ + + def _distance(self, s1, s2, sameQ: Callable[..., bool]): + return _levenshtein_like_or_border_cases(s1, s2, sameQ, _levenshtein) + + +class HammingDistance(Builtin): + """ +
+
'HammingDistance[$u$, $v$]' +
returns the Hamming distance between $u$ and $v$, i.e. the number of different elements. + $u$ and $v$ may be lists or strings. +
+ + >> HammingDistance[{1, 0, 1, 0}, {1, 0, 0, 1}] + = 2 + + >> HammingDistance["time", "dime"] + = 1 + + >> HammingDistance["TIME", "dime", IgnoreCase -> True] + = 1 + """ + + messages = { + "idim": "`1` and `2` must be of same length.", + } + + options = { + "IgnoreCase": "False", + } + + @staticmethod + def _compute(u, v, sameQ, evaluation): + if len(u) != len(v): + evaluation.message("HammingDistance", "idim", u, v) + return None + else: + return Integer(sum(0 if sameQ(x, y) else 1 for x, y in zip(u, v))) + + def apply_list(self, u, v, evaluation): + "HammingDistance[u_List, v_List]" + return HammingDistance._compute( + u.leaves, v.leaves, lambda x, y: x.sameQ(y), evaluation + ) + + def apply_string(self, u, v, evaluation, options): + "HammingDistance[u_String, v_String, OptionsPattern[HammingDistance]]" + ignore_case = self.get_option(options, "IgnoreCase", evaluation) + py_u = u.get_string_value() + py_v = v.get_string_value() + if ignore_case and ignore_case.is_true(): + py_u = py_u.lower() + py_v = py_v.lower() + return HammingDistance._compute(py_u, py_v, lambda x, y: x == y, evaluation) diff --git a/mathics/builtin/strings.py b/mathics/builtin/strings.py index 227c6c963..db12c03fc 100644 --- a/mathics/builtin/strings.py +++ b/mathics/builtin/strings.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Unsorted Strings and Characters +Strings and Characters - Miscellaneous """ import io @@ -8,7 +8,7 @@ import unicodedata from binascii import hexlify, unhexlify from heapq import heappush, heappop -from typing import Any, Callable, List +from typing import Any, List from mathics.version import __version__ # noqa used in loading to check consistency. @@ -1052,259 +1052,6 @@ def test(self, expr): return isinstance(expr, String) -class HammingDistance(Builtin): - """ -
-
'HammingDistance[$u$, $v$]' -
returns the Hamming distance between $u$ and $v$, i.e. the number of different elements. - $u$ and $v$ may be lists or strings. -
- - >> HammingDistance[{1, 0, 1, 0}, {1, 0, 0, 1}] - = 2 - - >> HammingDistance["time", "dime"] - = 1 - - >> HammingDistance["TIME", "dime", IgnoreCase -> True] - = 1 - """ - - messages = { - "idim": "`1` and `2` must be of same length.", - } - - options = { - "IgnoreCase": "False", - } - - @staticmethod - def _compute(u, v, sameQ, evaluation): - if len(u) != len(v): - evaluation.message("HammingDistance", "idim", u, v) - return None - else: - return Integer(sum(0 if sameQ(x, y) else 1 for x, y in zip(u, v))) - - def apply_list(self, u, v, evaluation): - "HammingDistance[u_List, v_List]" - return HammingDistance._compute( - u.leaves, v.leaves, lambda x, y: x.sameQ(y), evaluation - ) - - def apply_string(self, u, v, evaluation, options): - "HammingDistance[u_String, v_String, OptionsPattern[HammingDistance]]" - ignore_case = self.get_option(options, "IgnoreCase", evaluation) - py_u = u.get_string_value() - py_v = v.get_string_value() - if ignore_case and ignore_case.is_true(): - py_u = py_u.lower() - py_v = py_v.lower() - return HammingDistance._compute(py_u, py_v, lambda x, y: x == y, evaluation) - - -class _StringDistance(Builtin): - options = {"IgnoreCase": "False"} - - def apply(self, a, b, evaluation, options): - "%(name)s[a_, b_, OptionsPattern[%(name)s]]" - if isinstance(a, String) and isinstance(b, String): - py_a = a.get_string_value() - py_b = b.get_string_value() - if options["System`IgnoreCase"] == SymbolTrue: - if hasattr(str, "casefold"): - - def normalize(c): - return unicodedata.normalize("NFKD", c.casefold()) - - py_a = [normalize(c) for c in py_a] - py_b = [normalize(c) for c in py_b] - else: # python2, PyPy - py_a = py_a.lower() - py_b = py_b.lower() - return Integer(self._distance(py_a, py_b, lambda u, v: u == v)) - elif a.get_head_name() == "System`List" and b.get_head_name() == "System`List": - return Integer(self._distance(a.leaves, b.leaves, lambda u, v: u.sameQ(v))) - else: - return Expression("EditDistance", a, b) - - -# Levenshtein's 
algorithm is defined by the following construction: -# (adapted from https://de.wikipedia.org/wiki/Levenshtein-Distanz) -# -# given two strings s1, s2, we build a matrix D sized (len(s1) + 1, -# len(s2) + 1) and fill it using the following rules: -# -# (1) D(0, 0) = 0 -# (2) D(i, 0) = i, 1 <= i <= len(s1) -# (3) D(0, j) = j, 1 <= j <= len(s2) -# (4) D(i, j) = minimum of -# D(i - 1, j - 1) + 0 if s1(j) = s2(j) -# D(i - 1, j - 1) + 1 (substitution) -# D(i, j - 1) + 1 (insertion) -# D(i - 1, j) + 1 (deletion) -# -# The computed distance will be in D(len(s1) + 1, len(s2) + 1). -# -# note: double brackets indicate 1-based indices below, e.g. s1[[1]] - - -def _one_based(l): # makes an enumerated generator 1-based - return ((i + 1, x) for i, x in l) - - -def _prev_curr(l): # yields pairs of (x[i - 1], x[i]) for i in 1, 2, ... - prev = None - for curr in l: - yield prev, curr - prev = curr - - -def _levenshtein_d0(s2): # compute D(0, ...) - return list(range(len(s2) + 1)) # see (1), (3) - - -def _levenshtein_di(c1, s2, i, d_prev, sameQ, cost): # compute one new row - # given c1 = s1[i], s2, i, d_prev = D(i - 1, ...), compute D(i, ...) 
- - yield i # start with D(i, 0) = i, see (2) - d_curr_prev_j = i # d_curr_prev_j stores D(i, j - 1) - - for j, c2 in _one_based(enumerate(s2)): # c2 = s2[[j]] - cond = 0 if sameQ(c1, c2) else cost - - d_curr_j = min( # see (4) - d_prev[j - 1] + cond, # D(i - 1, j - 1) + cond; substitution - d_curr_prev_j + 1, # D(i, j - 1) + 1; insertion - d_prev[j] + 1, - ) # D(i - 1, j) + 1; deletion - - yield d_curr_j - d_curr_prev_j = d_curr_j - - -def _levenshtein(s1, s2, sameQ: Callable[..., bool]): - d_prev = _levenshtein_d0(s2) - for i, c1 in _one_based(enumerate(s1)): # c1 = s1[[i]] - d_prev = list(_levenshtein_di(c1, s2, i, d_prev, sameQ, 1)) - return d_prev[-1] - - -def _damerau_levenshtein(s1, s2, sameQ: Callable[..., bool]): - # _damerau_levenshtein works like _levenshtein, except for one additional - # rule covering transposition: - # - # if i > 1 and j > 1 and a[i] == b[j - 1] and a[i - 1] == b[j] then - # D(i, j) = minimum(D(i, j), D(i - 2, j - 2) + transposition_cost) - - def row(d_prev_prev, d_prev, i, prev_c1, c1, cost): - # given c1 = s1[i], d_prev_prev = D(i - 2), d_prev = D(i - 1), - # prev_c1 = s1[[i - 1]], c1 = s1[[i]], compute D(i, ...) - for j, d_curr_j in enumerate(_levenshtein_di(c1, s2, i, d_prev, sameQ, cost)): - if i > 1 and j > 1: - if sameQ(c1, s2[j - 2]) and sameQ(prev_c1, s2[j - 1]): # transposition? - # i.e. 
if s1[[i]] = s2[[j-1]] and s1[[i-1]] = s2[[j]] - d_curr_j = min(d_curr_j, d_prev_prev[j - 2] + cost) - yield d_curr_j - - d_prev_prev = None - d_prev = _levenshtein_d0(s2) - for i, (prev_c1, c1) in _one_based(enumerate(_prev_curr(s1))): - d_curr = list(row(d_prev_prev, d_prev, i, prev_c1, c1, 1)) - d_prev_prev = d_prev - d_prev = d_curr - - return d_prev[-1] - - -def _levenshtein_like_or_border_cases(s1, s2, sameQ: Callable[..., bool], compute): - if len(s1) == len(s2) and all(sameQ(c1, c2) for c1, c2 in zip(s1, s2)): - return 0 - - if len(s1) < len(s2): - s1, s2 = s2, s1 - - if len(s2) == 0: - return len(s1) - - return compute(s1, s2, sameQ) - - -class EditDistance(_StringDistance): - """ -
-
'EditDistance[$a$, $b$]' -
returns the Levenshtein distance of $a$ and $b$, which is defined as the minimum number of - insertions, deletions and substitutions on the constituents of $a$ and $b$ needed to transform - one into the other. -
- - >> EditDistance["kitten", "kitchen"] - = 2 - - >> EditDistance["abc", "ac"] - = 1 - - >> EditDistance["abc", "acb"] - = 2 - - >> EditDistance["azbc", "abxyc"] - = 3 - - The IgnoreCase option makes EditDistance ignore the case of letters: - >> EditDistance["time", "Thyme"] - = 3 - - >> EditDistance["time", "Thyme", IgnoreCase -> True] - = 2 - - EditDistance also works on lists: - >> EditDistance[{1, E, 2, Pi}, {1, E, Pi, 2}] - = 2 - """ - - def _distance(self, s1, s2, sameQ: Callable[..., bool]): - return _levenshtein_like_or_border_cases(s1, s2, sameQ, _levenshtein) - - -class DamerauLevenshteinDistance(_StringDistance): - """ -
-
'DamerauLevenshteinDistance[$a$, $b$]' -
returns the Damerau-Levenshtein distance of $a$ and $b$, which is defined as the minimum number of - transpositions, insertions, deletions and substitutions needed to transform one into the other. - In contrast to EditDistance, DamerauLevenshteinDistance counts transposition of adjacent items (e.g. - "ab" into "ba") as one operation of change. -
- - >> DamerauLevenshteinDistance["kitten", "kitchen"] - = 2 - - >> DamerauLevenshteinDistance["abc", "ac"] - = 1 - - >> DamerauLevenshteinDistance["abc", "acb"] - = 1 - - >> DamerauLevenshteinDistance["azbc", "abxyc"] - = 3 - - The IgnoreCase option makes DamerauLevenshteinDistance ignore the case of letters: - >> DamerauLevenshteinDistance["time", "Thyme"] - = 3 - - >> DamerauLevenshteinDistance["time", "Thyme", IgnoreCase -> True] - = 2 - - DamerauLevenshteinDistance also works on lists: - >> DamerauLevenshteinDistance[{1, E, 2, Pi}, {1, E, Pi, 2}] - = 1 - """ - - def _distance(self, s1, s2, sameQ: Callable[..., bool]): - return _levenshtein_like_or_border_cases(s1, s2, sameQ, _damerau_levenshtein) - - class RemoveDiacritics(Builtin): """
From 58d1882bfb1eb52909c76bb98c85f01902fe145b Mon Sep 17 00:00:00 2001 From: autoblack Date: Wed, 30 Jun 2021 02:31:05 +0000 Subject: [PATCH 6/7] fixup: Format Python code with Black --- mathics/builtin/distance/stringdata.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mathics/builtin/distance/stringdata.py b/mathics/builtin/distance/stringdata.py index b4ac85ee7..d65c874ef 100644 --- a/mathics/builtin/distance/stringdata.py +++ b/mathics/builtin/distance/stringdata.py @@ -38,6 +38,7 @@ # # note: double brackets indicate 1-based indices below, e.g. s1[[1]] + def _one_based(l): # makes an enumerated generator 1-based return ((i + 1, x) for i, x in l) @@ -182,6 +183,7 @@ class DamerauLevenshteinDistance(_StringDistance): def _distance(self, s1, s2, sameQ: Callable[..., bool]): return _levenshtein_like_or_border_cases(s1, s2, sameQ, _damerau_levenshtein) + class EditDistance(_StringDistance): """
From 2965615c3ea7aba8ace471fe4cd2d0b95d6a7773 Mon Sep 17 00:00:00 2001 From: rocky Date: Tue, 29 Jun 2021 23:10:12 -0400 Subject: [PATCH 7/7] Fix some of the import bugs introduced --- mathics/builtin/colors/color_operations.py | 5 +- mathics/builtin/colors/named_colors.py | 3 +- mathics/builtin/string/charcodes.py | 3 +- mathics/builtin/string/operations.py | 6 +- mathics/builtin/string/patterns.py | 124 ++++++++++++++++++--- mathics/builtin/strings.py | 86 -------------- setup.py | 1 + 7 files changed, 119 insertions(+), 109 deletions(-) diff --git a/mathics/builtin/colors/color_operations.py b/mathics/builtin/colors/color_operations.py index aaf874044..94580a119 100644 --- a/mathics/builtin/colors/color_operations.py +++ b/mathics/builtin/colors/color_operations.py @@ -1,5 +1,8 @@ # -*- coding: utf-8 -*- -"""Color Operations""" +"""Color Operations + +Functions for manipulating colors and color images. +""" from mathics.version import __version__ # noqa used in loading to check consistency. diff --git a/mathics/builtin/colors/named_colors.py b/mathics/builtin/colors/named_colors.py index 518f93024..5d8cc3191 100644 --- a/mathics/builtin/colors/named_colors.py +++ b/mathics/builtin/colors/named_colors.py @@ -1,8 +1,7 @@ # -*- coding: utf-8 -*- """Named Colors -Mathics has definitions for the most common color names which can be -used in a graphics or style specification. +Mathics has definitions for the most common color names which can be used in a graphics or style specification. """ from mathics.builtin.base import Builtin diff --git a/mathics/builtin/string/charcodes.py b/mathics/builtin/string/charcodes.py index be6d1ec49..d62150d49 100644 --- a/mathics/builtin/string/charcodes.py +++ b/mathics/builtin/string/charcodes.py @@ -3,6 +3,7 @@ Character Codes """ +import sys from mathics.version import __version__ # noqa used in loading to check consistency. 
from mathics.builtin.base import Builtin @@ -15,7 +16,7 @@ SymbolList, ) -from mathics.builtin.strings import _encodings, to_python_encoding +from mathics.builtin.strings import to_python_encoding def pack_bytes(codes): diff --git a/mathics/builtin/string/operations.py b/mathics/builtin/string/operations.py index e67499c80..0b9b6bb87 100644 --- a/mathics/builtin/string/operations.py +++ b/mathics/builtin/string/operations.py @@ -5,9 +5,6 @@ """ import re -from sys import version_info -from binascii import hexlify, unhexlify -from heapq import heappush, heappop from mathics.version import __version__ # noqa used in loading to check consistency. @@ -30,10 +27,9 @@ from mathics.builtin.lists import python_seq, convert_seq from mathics.builtin.strings import ( _StringFind, - _decode_pname, - _encode_pname, _evaluate_match, _parallel_match, + mathics_split, to_regex, ) diff --git a/mathics/builtin/string/patterns.py b/mathics/builtin/string/patterns.py index 7a96342e8..5a7b811e5 100644 --- a/mathics/builtin/string/patterns.py +++ b/mathics/builtin/string/patterns.py @@ -12,13 +12,18 @@ from mathics.core.expression import ( Expression, Integer1, + String, SymbolFalse, + SymbolList, SymbolTrue, ) from mathics.builtin.strings import ( _StringFind, + _evaluate_match, + _pattern_search, + _parallel_match, anchor_pattern, to_regex, ) @@ -27,7 +32,7 @@ class DigitCharacter(Builtin): """
-
'DigitCharacter' +
'DigitCharacter'
represents the digits 0-9.
@@ -46,10 +51,47 @@ class DigitCharacter(Builtin): """ +class EndOfLine(Builtin): + r""" +
+
'EndOfLine' +
represents the end of a line in a string. +
+ + >> StringReplace["aba\nbba\na\nab", "a" ~~ EndOfLine -> "c"] + = abc + . bbc + . c + . ab + + >> StringSplit["abc\ndef\nhij", EndOfLine] + = {abc, + . def, + . hij} + """ + + +class EndOfString(Builtin): + r""" +
+
'EndOfString' +
represents the end of a string. +
+ + Test whether strings end with "e": + >> StringMatchQ[#, __ ~~ "e" ~~ EndOfString] &/@ {"apple", "banana", "artichoke"} + = {True, False, True} + + >> StringReplace["aab\nabb", "b" ~~ EndOfString -> "c"] + = aab + . abc + """ + + class LetterCharacter(Builtin): """
-
'LetterCharacter' +
'LetterCharacter'
represents letters.
@@ -62,19 +104,60 @@ class LetterCharacter(Builtin): """ +class StartOfLine(Builtin): + r""" +
+
'StartOfLine' +
represents the start of a line in a string. +
+ + >> StringReplace["aba\nbba\na\nab", StartOfLine ~~ "a" -> "c"] + = cba + . bba + . c + . cb + + >> StringSplit["abc\ndef\nhij", StartOfLine] + = {abc + . , def + . , hij} + """ + + +class StartOfString(Builtin): + r""" +
+
'StartOfString' +
represents the start of a string. +
+ + Test whether strings start with "a": + >> StringMatchQ[#, StartOfString ~~ "a" ~~ __] &/@ {"apple", "banana", "artichoke"} + = {True, False, True} + + >> StringReplace["aba\nabb", StartOfString ~~ "a" -> "c"] + = cba + . abb + """ + + class StringCases(_StringFind): """
-
'StringCases["$string$", $pattern$]' -
gives all occurences of $pattern$ in $string$. -
'StringReplace["$string$", $pattern$ -> $form$]' -
gives all instances of $form$ that stem from occurences of $pattern$ in $string$. -
'StringCases["$string$", {$pattern1$, $pattern2$, ...}]' -
gives all occurences of $pattern1$, $pattern2$, .... -
'StringReplace["$string$", $pattern$, $n$]' -
gives only the first $n$ occurences. -
'StringReplace[{"$string1$", "$string2$", ...}, $pattern$]' -
gives occurences in $string1$, $string2$, ... +
'StringCases["$string$", $pattern$]' +
gives all occurrences of $pattern$ in $string$. + +
'StringCases["$string$", $pattern$ -> $form$]' +
gives all instances of $form$ that stem from occurrences of $pattern$ in $string$. + +
'StringCases["$string$", {$pattern1$, $pattern2$, ...}]' +
gives all occurrences of $pattern1$, $pattern2$, .... + +
'StringCases["$string$", $pattern$, $n$]' +
gives only the first $n$ occurrences. + +
'StringCases[{"$string1$", "$string2$", ...}, $pattern$]' +
gives occurrences in $string1$, $string2$, ...
>> StringCases["axbaxxb", "a" ~~ x_ ~~ "b"] @@ -361,7 +444,7 @@ def apply(self, string, patt, evaluation, options): class WhitespaceCharacter(Builtin): r"""
-
'WhitespaceCharacter' +
'WhitespaceCharacter'
represents a single whitespace character.
@@ -379,10 +462,23 @@ class WhitespaceCharacter(Builtin): """ +# strings.to_regex() seems to have the implementation here. +class WordBoundary(Builtin): + """ +
+
'WordBoundary' +
represents the boundary between words. +
+ + >> StringReplace["apple banana orange artichoke", "e" ~~ WordBoundary -> "E"] + = applE banana orangE artichokE + """ + + class WordCharacter(Builtin): r"""
-
'WordCharacter' +
'WordCharacter'
represents a single letter or digit character.
diff --git a/mathics/builtin/strings.py b/mathics/builtin/strings.py index db12c03fc..8458477f5 100644 --- a/mathics/builtin/strings.py +++ b/mathics/builtin/strings.py @@ -414,92 +414,6 @@ class Whitespace(Builtin): """ -class StartOfString(Builtin): - r""" -
-
'StartOfString' -
represents the start of a string. -
- - Test whether strings start with "a": - >> StringMatchQ[#, StartOfString ~~ "a" ~~ __] &/@ {"apple", "banana", "artichoke"} - = {True, False, True} - - >> StringReplace["aba\nabb", StartOfString ~~ "a" -> "c"] - = cba - . abb - """ - - -class EndOfString(Builtin): - r""" -
-
'EndOfString' -
represents the end of a string. -
- - Test whether strings end with "e": - >> StringMatchQ[#, __ ~~ "e" ~~ EndOfString] &/@ {"apple", "banana", "artichoke"} - = {True, False, True} - - >> StringReplace["aab\nabb", "b" ~~ EndOfString -> "c"] - = aab - . abc - """ - - -class StartOfLine(Builtin): - r""" -
-
'StartOfString' -
represents the start of a line in a string. -
- - >> StringReplace["aba\nbba\na\nab", StartOfLine ~~ "a" -> "c"] - = cba - . bba - . c - . cb - - >> StringSplit["abc\ndef\nhij", StartOfLine] - = {abc - . , def - . , hij} - """ - - -class EndOfLine(Builtin): - r""" -
-
'EndOfString' -
represents the end of a line in a string. -
- - >> StringReplace["aba\nbba\na\nab", "a" ~~ EndOfLine -> "c"] - = abc - . bbc - . c - . ab - - >> StringSplit["abc\ndef\nhij", EndOfLine] - = {abc, - . def, - . hij} - """ - - -class WordBoundary(Builtin): - """ -
-
'WordBoundary' -
represents the boundary between words. -
- - >> StringReplace["apple banana orange artichoke", "e" ~~ WordBoundary -> "E"] - = applE banana orangE artichokE - """ - - # FIXME: Generalize string.lower() and ord() def letter_number(chars: List[str], start_ord) -> List["Integer"]: # Note caller has verified that everything isalpha() and diff --git a/setup.py b/setup.py index 3e2a18923..5c0c80989 100644 --- a/setup.py +++ b/setup.py @@ -133,6 +133,7 @@ def subdirs(root, file="*.*", depth=10): "mathics.builtin.box", "mathics.builtin.colors", "mathics.builtin.compile", + "mathics.builtin.distance", "mathics.builtin.drawing", "mathics.builtin.fileformats", "mathics.builtin.files_io",