Как найти индексы групп в регулярных выражениях JavaScript?

Когда я пишу регулярное выражение, например:

// Three capture groups: a run of "s", a single "l", and a run of "o".
var m = /(s+).*?(l)[^l]*?(o+)/.exec("this is hello to you");
console.log(m);

Я получаю объект соответствия, содержащий следующее:

{
  0: "s is hello",
  1: "s",
  2: "l",
  3: "o",
  index: 3,
  input: "this is hello to you"
}

Я знаю индекс всего соответствия из свойства index, но мне также нужно знать начало и конец сопоставленных групп. Использование простого поиска не будет работать. В этом примере он найдет первый "l" вместо найденного в группе.

Есть ли способ получить смещение согласованной группы?

Ответ 1

Вы не можете напрямую получить индекс группы соответствия. Вместо этого нужно сначала поместить каждый символ совпадения в какую-либо группу захвата, включая те части, которые вам не нужны:

// Same pattern as in the question, but the filler parts (.*? and [^l]*?) are captured as well.
var m= /(s+)(.*?)(l)([^l]*?)(o+)/.exec('this is hello to you');

Теперь всё совпадение разбито на части:

['s is hello', 's', ' is hel', 'l', '', 'o']

Итак, вы можете добавить длины строк перед своей группой, чтобы получить смещение от индекса соответствия к индексу группы:

/**
 * Returns the absolute index (within the searched string) at which capture
 * group `n` of `match` starts.
 *
 * The result is only meaningful when the regex is written so that every
 * character of the match is covered by consecutive, non-nested capture
 * groups: the offset is the match index plus the lengths of groups 1..n-1.
 *
 * @param {RegExpExecArray} match - Result of `RegExp.prototype.exec`.
 * @param {number} n - 1-based number of the capture group to locate.
 * @returns {number} Start index of group `n` in the input string.
 */
function indexOfGroup(match, n) {
    let ix = match.index;
    for (let i = 1; i < n; i++) {
        // A group that did not participate in the match is `undefined`
        // and therefore contributes no characters to the offset.
        if (match[i] !== undefined) {
            ix += match[i].length;
        }
    }
    return ix;
}

console.log(indexOfGroup(m, 3)); // 11 (match index 3 + "s".length + " is hel".length)

Ответ 2

Я написал простой (ну, инициализация получилась немного раздутой) JavaScript-объект для решения этой проблемы в проекте, над которым я недавно работал. Он работает так же, как и принятый ответ, но генерирует новое регулярное выражение и автоматически извлекает запрошенные вами данные.

// MultiRegExp is the answer author's helper class; it is not defined in this snippet.
var exp = new MultiRegExp(/(firstBit\w+)this text is ignored(optionalBit)?/i);
var value = exp.exec("firstbitWithMorethis text is ignored");

// Illustrative result as given by the author: one entry per capture group,
// with null for the optional group that did not match.
value = {0: {index: 0, text: 'firstbitWithMore'},
         1: null};

Git Репо: My MultiRegExp. Надеюсь, это поможет кому-то там.

edit Aug, 2015:

Попробуйте: MultiRegExp Live.

Ответ 3

Другой класс javascript, который также может анализировать вложенные группы, доступен под: https://github.com/valorize/MultiRegExp2

Использование:

// MultiRegExp2 comes from the repository linked above; it is not defined in this snippet.
let regex = /a(?: )bc(def(ghi)xyz)/g;
let regex2 = new MultiRegExp2(regex);

// The stray closing parenthesis after the call was removed (it was a syntax error).
let matches = regex2.execForAllGroups('ababa bcdefghixyzXXXX');

Will output:
[ { match: 'defghixyz', start: 8, end: 17 },
  { match: 'ghi', start: 11, end: 14 } ]

Ответ 4

Я поиграл с добавлением вложенных групп захвата и именованных групп с информацией о местоположении. Вы можете поиграть с некоторыми регулярными выражениями в jsfiddle... https://jsfiddle.net/smuchow1962/z5dj9gL0/

/*
Copyright (c) 2019 Steven A Muchow
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Enhanced RegEx JS processing
Adds position information for capture groups (nested ones too) AND named group items.
*/
/**
 * Enhanced RegExp execution that reports start/end positions for every
 * capture group of a match, including nested and named groups.
 *
 * The regex source is scanned once to learn how its capture groups nest.
 * Each group's position is then found by searching for the group's text
 * inside the text of its parent group, so positions are heuristic when that
 * text occurs more than once inside the parent.
 *
 * NOTE: the source scanner does not model character classes, and a
 * lookaround token is consumed up to its first ")", so parentheses inside
 * `[...]` or nested inside lookarounds are not tracked correctly.
 */
class RegexContainer {

    /**
     * Walks the tokens that `re` finds in a regex source string and records
     * every capturing group together with its enclosing capturing group.
     *
     * @param {RegExp} re - Global tokenizer regex exposing a `name` named group
     *   (see `mapCaptureAndNameGroups`). Its `lastIndex` is reset to 0.
     * @param {string} input - Source text of the regex being analysed.
     * @returns {Array<Object>} Entry 0 describes the whole pattern
     *   (`{start, end, source}`). Entry k (k >= 1) describes capture group k as
     *   `{parent, refCount, start, name?, end, source}`, where `parent` is the
     *   index of the enclosing capture group (0 = whole match) and `refCount`
     *   is the nesting depth among capturing groups.
     */
    static _findCaptureGroupsInRegexTemplate(re, input) {
        // Marker pushed on the stack for "(?:" so that its ")" is paired with it
        // instead of closing the surrounding capturing group.
        const NON_CAPTURING = -1;
        const matches = [];
        // Stack of currently open parentheses: an index into `matches` for a
        // capturing group, or NON_CAPTURING.
        const openGroups = [];
        let depth = 0;
        let res;
        re.lastIndex = 0;
        while ((res = re.exec(input)) !== null) {
            const token = res[0];
            if (token === '(' && input.startsWith('?:', res.index + 1)) {
                // The tokenizer reports "(?:" as a bare "("; it must not be
                // numbered as a capture group.
                openGroups.push(NON_CAPTURING);
            } else if (isCapturingStartItem(token)) {
                depth++;
                const data = {parent: 0, refCount: depth, start: res.index};
                if (res.groups.name) { data.name = res.groups.name; }
                openGroups.push(matches.length);
                matches.push(data);
            } else if (input.charAt(res.index) === ')') {
                const closedIdx = openGroups.pop();
                // Unbalanced ")" or the end of a non-capturing group: nothing to record.
                if (closedIdx === undefined || closedIdx === NON_CAPTURING) { continue; }
                const closed = matches[closedIdx];
                // The token includes any quantifier following ")", so `end` covers it too.
                closed.end = re.lastIndex;
                closed.source = input.substring(closed.start, closed.end);
                depth--;
                // The nearest capturing group that is still open is the parent.
                // "+ 1" accounts for the whole-pattern entry unshifted below.
                for (let i = openGroups.length - 1; i >= 0; i--) {
                    if (openGroups[i] !== NON_CAPTURING) {
                        closed.parent = openGroups[i] + 1;
                        break;
                    }
                }
            }
        }
        matches.unshift({start: 0, end: input.length, source: input});
        return matches;

        // True for a bare "(" and for the start of a named group "(?<name".
        function isCapturingStartItem(str) {
            if (str !== '(') { return (str.search(/\(\?<\w/)!==-1); }
            return true;
        }
    }

    /**
     * Runs `re` against `input` and annotates every match and capture group
     * with `startPos`/`endPos` (absolute offsets into `input`).
     *
     * A non-global, non-sticky regex yields at most one match. A global or
     * sticky regex is iterated from its current `lastIndex` until it fails.
     *
     * @param {RegExp} re - Regex to execute.
     * @param {string} input - Text to search.
     * @param {Array<Object>} foundCaptureItems - Group map produced by
     *   `mapCaptureAndNameGroups(re.source)`.
     * @returns {Array<Array<Object>>} One array per match. Element 0 is the
     *   whole match (`{data, startPos, endPos}`); element k is capture group k
     *   (`{data, parent, startPos, endPos, groupName?}`). Groups that did not
     *   participate have `data === ''`.
     */
    static execFull(re, input, foundCaptureItems) {
        let result; let foundIdx; let groupName;  const matches = [];
        // Without the g or y flag exec() always restarts from the beginning,
        // so looping would return the same match forever.
        const iterates = re.global || re.sticky;
        while ((result = re.exec(input)) !== null) {
            let array = createCustomResultArray(result);
            array.forEach((match, idx) => {
                if (!idx) {
                    match.startPos = match.endPos = result.index;
                    match.endPos += result[0].length;
                    delete match.parent;
                    return;
                }
                let parentStr = array[match.parent].data;
                // Heuristic: when the parent is not the immediately preceding
                // group, earlier siblings exist, so search from the end.
                foundIdx = (match.parent < idx - 1) ? parentStr.lastIndexOf(match.data) : parentStr.indexOf(match.data);
                match.startPos = match.endPos = foundIdx + array[match.parent].startPos;
                match.endPos += match.data.length;
                if ((groupName = foundCaptureItems[idx].name)) { match.groupName = groupName; }
            });
            matches.push(array);
            if (!iterates) { break; }
            if (result[0] === '') {
                // A zero-length match leaves lastIndex unchanged; step over one
                // character (a whole code point with the u flag) to avoid
                // matching the same position forever.
                const codePoint = re.unicode ? input.codePointAt(re.lastIndex) : undefined;
                re.lastIndex += (codePoint !== undefined && codePoint > 0xFFFF) ? 2 : 1;
            }
        }
        return matches;

        // Pairs each exec() entry with the parent index recorded for that group.
        function createCustomResultArray(result) {
            let captureVar = 0;
            return Array.from(result, (data) => {
                return {data: data || '', parent: foundCaptureItems[captureVar++].parent,};
            });
        }
    }

    /**
     * Builds the capture-group map for a regex source string.
     *
     * @param {string} inputRegexSourceString - Typically `someRegExp.source`.
     * @returns {Array<Object>} See `_findCaptureGroupsInRegexTemplate`.
     */
    static mapCaptureAndNameGroups(inputRegexSourceString) {
        // Tokenizer: unescaped "(" (optionally a named-group start or a whole
        // lookaround), unescaped ")" with an optional quantifier, or "{n,m}".
        let REGEX_CAPTURE_GROUPS_ANALYZER = /((((?<!\\)|^)\((\?((<(?<name>\w+)))|(\?<=.*?\))|(\?<!.*?\))|(\?!.*?\))|(\?=.*?\)))?)|((?<!\\)\)(([*+?](\?)?))?|({\d+(,)?(\d+)?})))/gm;
        return RegexContainer._findCaptureGroupsInRegexTemplate(REGEX_CAPTURE_GROUPS_ANALYZER, inputRegexSourceString);
    }

    /**
     * Convenience entry point: analyses `re.source`, then executes `re`.
     *
     * @param {RegExp} re - Regex to execute.
     * @param {string} input - Text to search.
     * @returns {{captureItems: Array<Object>, results: Array<Array<Object>>}}
     *   The group map and the position-annotated matches.
     */
    static exec(re, input) {
        let foundCaptureItems = RegexContainer.mapCaptureAndNameGroups(re.source);
        let res = RegexContainer.execFull(re, input, foundCaptureItems);
        return {captureItems: foundCaptureItems, results: res};
    }

}

// Demo driver: runs each sample pattern against `input` with RegexContainer
// and prints every named capture group together with the text at its position.
let answers = [];
let regex = [
    { re: "[ \\t]*?\\[\\[(?<inner>\\s*(?<core>\\w(.|\\s)*?)\\s*?)]]", label: "NESTED Regex"},
    { re: "(?<context>((\\w)(\\w|-)*))((?<separator>( - ))?(?<type>(-|\\w)+)?\\s*(?<opt>(\\{.*}))?)?[\\t ]*", label: "simpler regex" },
]

let input = "[[ context1 ]]  [[ context2 - with-style { andOpts : {data: 'some info'} } ]]";

// Collect one labelled result object per sample pattern.
for (const item of regex) {
    const outcome = RegexContainer.exec(new RegExp(item.re, 'gm'), input);
    outcome.label = item.label;
    answers.push(outcome);
}

answers.forEach((answer, index) => {
    const rule = '==========================================================';
    console.log(rule);
    console.log('==== Item ' + index + ' label: ' + answer.label + ' regex: ' + answer.captureItems[0].source );
    console.log(rule + '\n\n');
    for (const groups of answer.results) {
        const whole = groups[0];
        const matchedText = whole.data;
        // Sanity check: the reported positions must reproduce the matched text.
        if (matchedText !== input.substring(whole.startPos, whole.endPos)) {
            console.log('error in the parsing if you get here');
            continue;
        }
        console.log('==== Checking ' + matchedText);
        // Only named groups are printed; element 0 (the whole match) is skipped.
        groups.slice(1)
            .filter((capture) => capture.groupName)
            .forEach((capture) => {
                console.log(' ' + capture.groupName + ': ' + "'''" + input.substring(capture.startPos,capture.endPos) + "'''");
            });
        console.log('');
    }
});

Архитектура

Возьмите шаблон Regex и определите группы захвата, которые он сгенерирует. Сохраните его как массив групповых элементов и вложенной информации для подачи в расширенный вызов exec().
- используйте регулярные выражения, чтобы найти начала групп захвата, группы без захвата, имена групп и окончания групп. Правильно обрабатывайте коварные экранированные \( и \).
- нерекурсивный осмотр предметов захвата и их родителей (с использованием подсчета ссылок).
запустите exec() с информацией о группе захвата, полученной выше.
- использовать функции подстроки для извлечения данных для каждой группы захвата
- поместите все в массив для каждого найденного результата и отправьте массив обратно.

Ответ 5

На основе синтаксиса регулярных выражений ECMA я написал парсер и расширение класса RegExp, которое решает не только эту проблему (полностью индексированный exec), но и другие ограничения реализации RegExp в JavaScript, например поиск и замену на основе групп. Вы можете протестировать и загрузить реализацию здесь (также доступна как модуль NPM).

Реализация работает следующим образом (малый пример):

//Retrieve content and position of: opening-, closing tags and body content for: non-nested html-tags.
// Group 1: opening tag, group 2: tag name (reused via \2), group 3: body, group 4: closing tag.
var pattern = '(<([^ >]+)[^>]*>)([^<]*)(<\\/\\2>)';
var str = '<html><code class="html plain">first</code><div class="content">second</div></html>';
// Regex is the answer author's RegExp extension; it is not defined in this snippet.
var regex = new Regex(pattern, 'g');
var result = regex.exec(str);

// Each line below logs true when the stated expectation holds.
// Matched texts: result[n] is the text of group n (0 = whole match).
console.log(5 === result.length);
console.log('<code class="html plain">first</code>'=== result[0]);
console.log('<code class="html plain">'=== result[1]);
console.log('first'=== result[3]);
console.log('</code>'=== result[4]);
// Positions: here result.index is an array with one start offset per group.
console.log(5=== result.index.length);
console.log(6=== result.index[0]);
console.log(6=== result.index[1]);
console.log(31=== result.index[3]);
console.log(36=== result.index[4]);

Я также попробовал реализацию от @velop, но в ней, похоже, есть ошибки: например, она неправильно обрабатывает обратные ссылки, такие как "/a(?: )bc(def(\1ghi)xyz)/g" — при добавлении скобок спереди обратную ссылку \1 необходимо соответственно увеличить (чего его реализация не делает).

Ответ 6

Для глобального регулярного выражения нужно сопоставлять только фрагменты и выполнять итерацию, поэтому первое решение не сработает. Вот 30-минутное решение, основанное на indexOf и суммировании, которое работает для этого случая:

https://codepen.io/cancerberoSgx/pen/qYwjjz?editors=0012#code-area

// Self-running demo: finds every /***@ ... @***/ fragment in a sample text and
// records the start/end offsets of each capture group.
!function () {
  const regex = /\/\*\*\*@\s*([^@]+)\s*(@\*\*\*\/)/gim
  // Template literal: the sample spans several lines and contains single
  // quotes itself, so it cannot be written as a single-quoted string.
  const exampleThatMatch = `
    /***@
    debug.print('hello editor, simpleNode kind is ' +
    arg.simpleNode.getKindName())
    @***/

    const a = 1 //user

    /***@
    debug.print(arg.simpleNode.getParent().getKindName())
    @***/
    `
  const text = exampleThatMatch

  /**
   * Collects capture-group offsets for every match of `r` in `s`.
   *
   * Offsets are found with indexOf, searching from the start of the current
   * match and then from the start of the previously located group, so groups
   * are assumed to appear in order within the match.
   *
   * @param {RegExp} r - Regex to run; a regex without the g flag yields one match.
   * @param {string} s - Text to search.
   * @returns {Array<Array<{value: (string|undefined), start: number, end: number}>>}
   *   One array per match with one entry per capture group. A group that did
   *   not participate has value undefined and start/end of -1.
   */
  function exec(r, s) {
    let result
    const matches = []
    while ((result = r.exec(s))) {
      const match = []
      // Start at the match itself so identical text earlier in `s` is not picked up.
      let relIndex = result.index
      for (let i = 1; i < result.length; i++) {
        const value = result[i]
        if (value === undefined) {
          match.push({ value, start: -1, end: -1 })
          continue
        }
        relIndex = s.indexOf(value, relIndex)
        match.push({ value, start: relIndex, end: relIndex + value.length })
      }
      matches.push(match)
      // Without the g flag exec() would return the same match forever.
      if (!r.global) break
      // A zero-length match does not move lastIndex; step forward manually.
      if (result[0] === '') r.lastIndex++
    }
    return matches
  }
  const groupsWithIndex = exec(regex, text)
  console.log({RESULT: groupsWithIndex })
  // now test - let remove everything else but matched groups 
  let frag = '' , sep = '\n#######\n'
  groupsWithIndex.forEach(match => match.forEach(group => {
    frag += text.substring(group.start, group.end) + sep
  }))
  console.log('The following are only the matched groups usign the result and text.substring just to verify it works OK:', '\n'+sep)
  console.log(frag)
}()

И на всякий случай вот версия на TypeScript:

https://codepen.io/cancerberoSgx/pen/yjrXxx?editors=0012

Наслаждайтесь!