" ingo/regexp/deconstruct.vim: Functions for taking apart regular expressions.
"
" DEPENDENCIES:
"
" Copyright: (C) 2018-2020 Ingo Karkat
"   The VIM LICENSE applies to this script; see ':help copyright'.
"
" Maintainer: Ingo Karkat
let s:save_cpo = &cpo
set cpo&vim

function! ingo#regexp#deconstruct#RemovePositionAtoms( pattern ) abort
"******************************************************************************
"* PURPOSE:
"   Remove atoms that assert a certain position of the pattern (like ^, $, \<,
"   \%l) from a:pattern.
"* ASSUMPTIONS / PRECONDITIONS:
"   Does not consider "very magic" (/\v)-style syntax. If you may have this,
"   convert via ingo#regexp#magic#Normalize() first.
"* EFFECTS / POSTCONDITIONS:
"   None.
"* INPUTS:
"   a:pattern   regular expression
"* RETURN VALUES:
"   Modified a:pattern with position atoms removed.
"******************************************************************************
    return substitute(a:pattern, ingo#regexp#parse#PositionAtomExpr(), '', 'g')
endfunction

function! ingo#regexp#deconstruct#RemoveMultis( pattern ) abort
"******************************************************************************
"* PURPOSE:
"   Remove multi items (*, \+, etc.) that signify the multiplicity of the
"   previous atom from a:pattern.
"* ASSUMPTIONS / PRECONDITIONS:
"   Does not consider "very magic" (/\v)-style syntax. If you may have this,
"   convert via ingo#regexp#magic#Normalize() first.
"* EFFECTS / POSTCONDITIONS:
"   None.
"* INPUTS:
"   a:pattern   regular expression
"* RETURN VALUES:
"   Modified a:pattern with multi items removed.
"******************************************************************************
    return substitute(a:pattern, ingo#regexp#parse#MultiExpr(), '', 'g')
endfunction

let s:specialLookup = {
\   'e': "\e",
\   't': "\t",
\   'r': "\r",
\   'b': "\b",
\   'n': "\n",
\}
function! ingo#regexp#deconstruct#UnescapeSpecialCharacters( pattern ) abort
"******************************************************************************
"* PURPOSE:
"   Remove the backslash in front of characters that have special regular
"   expression meaning without it, like [\.*~], and interpret special sequences
"   like \e \t \n.
"* ASSUMPTIONS / PRECONDITIONS:
"   Does not consider "very magic" (/\v)-style syntax. If you may have this,
"   convert via ingo#regexp#magic#Normalize() first.
"   The first capture group of ingo#regexp#parse#EscapedCharacterExpr() must
"   be one of the keys of s:specialLookup (e, t, r, b, n); any other key makes
"   the lookup fail.
"* EFFECTS / POSTCONDITIONS:
"   None.
"* INPUTS:
"   a:pattern   regular expression
"* RETURN VALUES:
"   Modified a:pattern with special characters turned into literal ones.
"******************************************************************************
    let l:result = a:pattern

    " First turn the special sequences into the control characters they stand
    " for; the first capture group selects the entry of s:specialLookup.
    let l:result = substitute(l:result, ingo#regexp#parse#EscapedCharacterExpr(), '\=s:specialLookup[submatch(1)]', 'g')

    " Then drop the escaping backslash in front of the listed characters.
    let l:result = ingo#escape#Unescape(l:result, '\^$.*~[]')

    return l:result
endfunction

function! ingo#regexp#deconstruct#TranslateSingleCharacterAtoms( pattern ) abort
"******************************************************************************
"* PURPOSE:
"   Translate every atom in a:pattern that is matched by
"   ingo#regexp#parse#SingleCharacterExpr() (any unspecific single character,
"   i.e. . or \_.) into the bullet character U+2022.
"* ASSUMPTIONS / PRECONDITIONS:
"   Does not consider "very magic" (/\v)-style syntax. If you may have this,
"   convert via ingo#regexp#magic#Normalize() first.
"* EFFECTS / POSTCONDITIONS:
"   None.
"* INPUTS:
"   a:pattern   regular expression
"* RETURN VALUES:
"   Modified a:pattern with single character atoms translated.
"******************************************************************************
    return substitute(a:pattern, ingo#regexp#parse#SingleCharacterExpr(), "\u2022", 'g')
endfunction

function! ingo#regexp#deconstruct#TranslateCharacterClasses( pattern, ... ) abort
"******************************************************************************
"* PURPOSE:
"   Translate character classes (e.g. \d, \k), collections ([...]; unless they
"   only contain a single literal character), and optionally matched atoms from
"   a:pattern with the passed a:replacements or default ones.
"* ASSUMPTIONS / PRECONDITIONS:
"   Does not consider "very magic" (/\v)-style syntax. If you may have this,
"   convert via ingo#regexp#magic#Normalize() first.
"* EFFECTS / POSTCONDITIONS:
"   None.
"* INPUTS:
"   a:pattern       regular expression
"   a:replacements  Optional Dict that maps each character class / collection to
"                   a replacement. Keys are the class name (e.g. "d", "alpha");
"                   the upper-cased name (e.g. "ALPHA") is looked up for an
"                   inverted [^[:name:]] collection, and "[]" is used for any
"                   other collection. A missing key translates to the empty
"                   String, i.e. the item is removed.
"* RETURN VALUES:
"   Modified a:pattern with character classes translated.
"******************************************************************************
    let l:pattern = a:pattern
    let l:replacements = (a:0 ? a:1 : {
    \   'blank': ' ',
    \   'return': "\r",
    \   'tab': "\t",
    \   'escape': "\e",
    \   'backspace': "\b",
    \   'cntrl': "\uFF3E",
    \   'punct': "\u203D",
    \   'i': "\U1D456",
    \   'I': "\U1D43C",
    \   'k': "\U1D458",
    \   'K': "\U1D43E",
    \   'f': "\U1D453",
    \   'F': "\U1D439",
    \   'p': "\U1D45D",
    \   'print': "\U1D45D",
    \   'graph': "\U1D45D",
    \   'P': "\U1D443",
    \   'PRINT': "\U1D443",
    \   'GRAPH': "\U1D443",
    \   's': "\U1D460",
    \   'space': "\U1D460",
    \   'S': "\U1D446",
    \   'SPACE': "\U1D446",
    \   'd': "\U1D451",
    \   'digit': "\U1D451",
    \   'D': "\U1D437",
    \   'DIGIT': "\U1D437",
    \   'x': "\U1D465",
    \   'xdigit': "\U1D465",
    \   'X': "\U1D44B",
    \   'XDIGIT': "\U1D44B",
    \   'o': "\U1D45C",
    \   'O': "\U1D442",
    \   'w': "\U1D464",
    \   'W': "\U1D44A",
    \   'h': "\U1D455",
    \   'H': "\U1D43B",
    \   'a': "\U1D44E",
    \   'alpha': "\U1D44E",
    \   'alnum': "\U1D44E",
    \   'A': "\U1D434",
    \   'ALPHA': "\U1D434",
    \   'ALNUM': "\U1D434",
    \   'l': "\U1D459",
    \   'lower': "\U1D459",
    \   'L': "\U1D43F",
    \   'LOWER': "\U1D43F",
    \   'u': "\U1D462",
    \   'upper': "\U1D462",
    \   'U': "\U1D448",
    \   'UPPER': "\U1D448",
    \   '[]': "\u2026",
    \})

    " Match case-sensitively (\C) so that a class and its upper-case
    " counterpart (e.g. \d vs. \D) are looked up under different keys.
    let l:pattern = substitute(l:pattern, '\C' . ingo#regexp#parse#CharacterClassesExpr(), '\=get(l:replacements, submatch(1), "")', 'g')

    " Optional sequence of atoms \%[]. Note: Because these can contain
    " collection-like stuff, it has to be processed before collections.
    let l:pattern = substitute(l:pattern, ingo#regexp#parse#OptionalSequenceExpr(), '\1', 'g')

    let l:pattern = substitute(l:pattern, ingo#regexp#collection#Expr({'isCapture': 1}), '\=s:TransformCollection(l:replacements, submatch(1))', 'g')

    return l:pattern
endfunction
" Determine the replacement for a single collection.
"   a:replacements  Dict of replacements, as used by
"                   ingo#regexp#deconstruct#TranslateCharacterClasses().
"   a:characters    First capture of the collection expression (presumably the
"                   text between the collection's brackets).
" Returns the character itself for a single (optionally backslash-escaped)
" character, the replacement for "name" for a sole [:name:] class, the
" replacement for the upper-cased name for a sole inverted ^[:name:] class,
" and the "[]" replacement otherwise; a missing key yields an empty String.
function! s:TransformCollection( replacements, characters ) abort
    let l:literalCharacter = matchstr(a:characters, '^\\\?\zs.$')
    if ! empty(l:literalCharacter)
        return l:literalCharacter
    endif

    let l:characterClass = matchstr(a:characters, '^\[:\zs\a\+\ze:\]$')
    if ! empty(l:characterClass)
        return get(a:replacements, l:characterClass, '')
    endif

    let l:invertedCharacterClass = matchstr(a:characters, '^\^\[:\zs\a\+\ze:\]$')
    if ! empty(l:invertedCharacterClass)
        return get(a:replacements, toupper(l:invertedCharacterClass), '')
    endif

    return get(a:replacements, '[]', '')
endfunction

function! ingo#regexp#deconstruct#RemoveCharacterClasses( pattern ) abort
"******************************************************************************
"* PURPOSE:
"   Remove character classes (e.g. \d, \k), collections ([...]; unless they only
"   contain a single literal character), and optionally matched atoms from
"   a:pattern.
"* ASSUMPTIONS / PRECONDITIONS:
"   Does not consider "very magic" (/\v)-style syntax. If you may have this,
"   convert via ingo#regexp#magic#Normalize() first.
"* EFFECTS / POSTCONDITIONS:
"   None.
"* INPUTS:
"   a:pattern   regular expression
"* RETURN VALUES:
"   Modified a:pattern with character classes removed.
"******************************************************************************
    " An empty replacements Dict makes every lookup fall back to the empty
    " String, which removes the matched item.
    return ingo#regexp#deconstruct#TranslateCharacterClasses(a:pattern, {})
endfunction

function! ingo#regexp#deconstruct#TranslateNumberEscapes( pattern ) abort
"******************************************************************************
"* PURPOSE:
"   Convert characters escaped as numbers from a:pattern.
"* ASSUMPTIONS / PRECONDITIONS: " Does not consider "very magic" (/\v)-style syntax. If you may have this, " convert via ingo#regexp#magic#Normalize() first. "* EFFECTS / POSTCONDITIONS: " None. "* INPUTS: " a:pattern regular expression "* RETURN VALUES: " Modified a:pattern with numbered escapes translated to literal characters. "****************************************************************************** let l:pattern = a:pattern let l:pattern = substitute(l:pattern, '\C\%(\%(^\|[^\\]\)\%(\\\\\)*\\\)\@