123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605 |
- #include "Regex.h"
- using namespace Framework;
- using namespace Regex;
- int findEndPos(Text* regex, int startPos, char endChar, char startChar)
- {
- int length = regex->getLength();
- int i = startPos;
- int opened = 0;
- while (i < length)
- {
- if (regex->getText()[i] == '\\')
- {
- i++;
- }
- else if (regex->getText()[i] == startChar)
- {
- opened++;
- }
- else if (regex->getText()[i] == endChar)
- {
- if (opened == 0)
- {
- return i;
- }
- opened--;
- }
- i++;
- }
- return -1;
- }
- Automata<char>* parseCharacterList(Text* regex, RegexConfig& config)
- {
- bool negated = regex->hatAt(0, "^");
- Automata<char>* result = new Automata<char>();
- State<char>* start = result->addState();
- State<char>* end = result->addState();
- end->setFinal(true);
- int length = regex->getLength();
- bool escaped = false;
- bool minusValid = false;
- Text characterList;
- std::function<bool(char, int)> negatedFunction
- = [](char c, int flags) { return true; };
- for (int i = 0; i < length; i++)
- {
- switch (regex->getText()[i])
- {
- case '\\':
- {
- escaped = true;
- minusValid = false;
- break;
- }
- case '-':
- if (escaped || !minusValid)
- {
- characterList += '-';
- }
- else if (minusValid)
- {
- if (i == length - 1)
- {
- characterList += '-';
- }
- else
- {
- unsigned char before = (unsigned char)
- characterList[characterList.getLength() - 1];
- characterList.remove(characterList.getLength() - 1, 1);
- unsigned char after
- = (unsigned char)regex->getText()[i + 1];
- if (before > after)
- {
- unsigned char temp = before;
- before = after;
- after = temp;
- }
- for (unsigned char c = before; c <= after; c++)
- {
- characterList += c;
- }
- i++;
- }
- break;
- }
- default:
- {
- if (escaped)
- {
- if (regex->getText()[i] == 'w')
- {
- if (negated)
- {
- negatedFunction
- = [config, negatedFunction](char c, int flags) {
- return negatedFunction(c, flags)
- && !config.getWordChars().hat(c);
- };
- }
- else
- {
- characterList += config.getWordChars();
- }
- minusValid = false;
- }
- else if (regex->getText()[i] == 'W')
- {
- if (negated)
- {
- characterList += config.getWordChars();
- }
- else
- {
- start->addTransition(Transition<char>(
- [config](char c, int flags) {
- return !config.getWordChars().hat(c);
- },
- end,
- 0));
- }
- minusValid = false;
- }
- else if (regex->getText()[i] == 'd')
- {
- if (negated)
- {
- negatedFunction
- = [config, negatedFunction](char c, int flags) {
- return negatedFunction(c, flags)
- && (c < '0' || c > '9');
- };
- }
- else
- {
- characterList.append("0123456789");
- }
- minusValid = false;
- }
- else if (regex->getText()[i] == 's')
- {
- if (negated)
- {
- negatedFunction = [config, negatedFunction](
- char c, int flags) {
- return negatedFunction(c, flags)
- && !config.getWhitespaceChars().hat(c);
- };
- }
- else
- {
- characterList.append(config.getWhitespaceChars());
- }
- minusValid = false;
- }
- else if (regex->getText()[i] == 'D')
- {
- if (negated)
- {
- negatedFunction
- = [negatedFunction](char c, int flags) {
- return negatedFunction(c, flags)
- && (c >= '0' && c <= '9');
- };
- }
- else
- {
- start->addTransition(Transition<char>(
- [](char c, int flags) {
- return c < '0' || c > '9';
- },
- end,
- 0));
- }
- minusValid = false;
- }
- else if (regex->getText()[i] == 'S')
- {
- if (negated)
- {
- negatedFunction
- = [config, negatedFunction](char c, int flags) {
- return negatedFunction(c, flags)
- && config.getWhitespaceChars().hat(c);
- };
- }
- else
- {
- start->addTransition(Transition<char>(
- [config](char c, int flags) {
- return !config.getWhitespaceChars().hat(c);
- },
- end,
- 0));
- }
- minusValid = false;
- }
- else if (regex->getText()[i] == 'n')
- {
- characterList.append('\n');
- }
- else if (regex->getText()[i] == 'r')
- {
- characterList.append('\r');
- }
- else if (regex->getText()[i] == 't')
- {
- characterList.append('\t');
- }
- else
- {
- characterList.append(regex->getText()[i]);
- minusValid = true;
- }
- }
- else
- {
- characterList.append(regex->getText()[i]);
- minusValid = true;
- }
- break;
- }
- }
- }
- if (negated)
- {
- negatedFunction = [characterList, negatedFunction](char c, int flags) {
- return negatedFunction(c, flags) && !characterList.hat(c);
- };
- start->addTransition(Transition<char>(negatedFunction, end, 0));
- }
- else
- {
- start->addTransition(Transition<char>(
- [characterList](char c, int flags) { return characterList.hat(c); },
- end,
- 0));
- }
- regex->release();
- return result;
- }
- Automata<char>* concat(Array<Automata<char>*> parsers)
- {
- if (parsers.getEintragAnzahl() == 0)
- {
- Automata<char>* result = new Automata<char>();
- State<char>* start = result->addState();
- State<char>* end = result->addState();
- end->setFinal(true);
- start->addTransition(Transition<char>(0, end, 0));
- return result;
- }
- Automata<char>* result = 0;
- for (Automata<char>* parser : parsers)
- {
- if (result == 0)
- {
- result = parser;
- }
- else
- {
- result = concat(result, parser);
- }
- }
- return result;
- }
- Automata<char>* parse(Text* regex, int& nextGroupId, RegexConfig& config)
- {
- Array<Automata<char>*> results;
- int length = regex->getLength();
- for (int i = 0; i < length; i++)
- {
- bool escaped = false;
- switch (regex->getText()[i])
- {
- case '\\':
- {
- escaped = true;
- i++;
- break;
- }
- case '(':
- {
- int end = findEndPos(regex, i + 1, ')', '(');
- Text* subRegex = regex->getTeilText(i + 1, end);
- Automata<char>* subAutomata = 0;
- if (subRegex->hatAt(0, "?:"))
- {
- subRegex->remove(0, 2);
- subAutomata = parse(subRegex, nextGroupId, config);
- }
- else
- {
- int groupId = nextGroupId++;
- subAutomata
- = group(parse(subRegex, nextGroupId, config), groupId);
- }
- results.add(subAutomata);
- i = end;
- break;
- }
- case '[':
- {
- int end = findEndPos(regex, i + 1, ']', 0);
- Text* subRegex = regex->getTeilText(i + 1, end);
- Automata<char>* subAutomata
- = parseCharacterList(subRegex, config);
- results.add(subAutomata);
- i = end;
- break;
- }
- case '?':
- {
- Automata<char>* last
- = results.get(results.getEintragAnzahl() - 1);
- results.remove(results.getEintragAnzahl() - 1);
- results.add(maybe(last));
- break;
- }
- case '*':
- {
- bool lazy = false;
- if (i < length - 1 && regex->getText()[i + 1] == '?')
- {
- lazy = true;
- i++;
- }
- Automata<char>* last
- = results.get(results.getEintragAnzahl() - 1);
- results.remove(results.getEintragAnzahl() - 1);
- results.add(many(last, lazy));
- break;
- }
- case '+':
- {
- bool lazy = false;
- if (i < length - 1 && regex->getText()[i + 1] == '?')
- {
- lazy = true;
- i++;
- }
- Automata<char>* last
- = results.get(results.getEintragAnzahl() - 1);
- results.remove(results.getEintragAnzahl() - 1);
- results.add(atLeastOnce(last, lazy));
- break;
- }
- case '.':
- {
- Automata<char>* any = new Automata<char>();
- State<char>* start = any->addState();
- State<char>* end = any->addState();
- end->setFinal(true);
- start->addTransition(Transition<char>(
- [](char c, int flags) { return true; }, end, 0));
- results.add(any);
- break;
- }
- case '|':
- {
- Automata<char>* result = oneOf(concat(results),
- parse(regex->getTeilText(i + 1), nextGroupId, config));
- regex->release();
- return result;
- }
- case '{':
- {
- int end = findEndPos(regex, i + 1, '}', 0);
- Text* subRegex = regex->getTeilText(i + 1, end);
- int min = 0;
- int max = 0;
- if (subRegex->hat(","))
- {
- Text* minText
- = subRegex->getTeilText(0, subRegex->positionVon(","));
- Text* maxText
- = subRegex->getTeilText(subRegex->positionVon(",") + 1);
- min = (int)*minText;
- max = (int)*maxText;
- minText->release();
- maxText->release();
- }
- else
- {
- min = (int)*subRegex;
- max = min;
- }
- Automata<char>* last
- = results.get(results.getEintragAnzahl() - 1);
- results.remove(results.getEintragAnzahl() - 1);
- results.add(repeat(last, min, max));
- i = end;
- subRegex->release();
- break;
- }
- case '^':
- {
- Automata<char>* qutomata = new Automata<char>();
- State<char>* start = qutomata->addState();
- State<char>* end = qutomata->addState();
- end->setFinal(true);
- start->addTransition(Transition<char>(0, end, Flags::START));
- results.add(qutomata);
- break;
- }
- case '$':
- {
- Automata<char>* qutomata = new Automata<char>();
- State<char>* start = qutomata->addState();
- State<char>* end = qutomata->addState();
- end->setFinal(true);
- start->addTransition(Transition<char>(0, end, Flags::END));
- results.add(qutomata);
- break;
- }
- default:
- {
- Automata<char>* automata = new Automata<char>();
- State<char>* start = automata->addState();
- State<char>* end = automata->addState();
- end->setFinal(true);
- char current = regex->getText()[i];
- start->addTransition(Transition<char>(
- [current](char c, int flags) { return c == current; },
- end,
- 0));
- results.add(automata);
- break;
- }
- }
- if (escaped)
- {
- std::function<bool(char, int)> function;
- switch (regex->getText()[i])
- {
- case 'n':
- {
- function = [](char c, int flags) { return c == '\n'; };
- break;
- }
- case 'r':
- {
- function = [](char c, int flags) { return c == '\r'; };
- break;
- }
- case 't':
- {
- function = [](char c, int flags) { return c == '\t'; };
- break;
- }
- case 'w':
- {
- function = [config](char c, int flags) {
- return config.getWordChars().hat(c);
- };
- break;
- }
- case 'W':
- {
- function = [config](char c, int flags) {
- return !config.getWordChars().hat(c);
- };
- break;
- }
- case 'd':
- {
- function = [](char c, int flags) {
- return c >= '0' && c <= '9';
- };
- break;
- }
- case 'D':
- {
- function
- = [](char c, int flags) { return c < '0' || c > '9'; };
- break;
- }
- case 's':
- {
- function = [config](char c, int flags) {
- return config.getWhitespaceChars().hat(c);
- };
- break;
- }
- case 'S':
- {
- function = [config](char c, int flags) {
- return !config.getWhitespaceChars().hat(c);
- };
- break;
- }
- default:
- {
- char current = regex->getText()[i];
- function
- = [current](char c, int flags) { return c == current; };
- break;
- }
- }
- Automata<char>* automata = new Automata<char>();
- State<char>* start = automata->addState();
- State<char>* end = automata->addState();
- end->setFinal(true);
- start->addTransition(Transition<char>(function, end, 0));
- results.add(automata);
- }
- }
- regex->release();
- return concat(results);
- }
- Framework::Regex::Group::Group(int start, int end, int id)
- : start(start),
- end(end),
- id(id)
- {}
- Framework::Regex::Group::Group()
- : start(-1),
- end(-1),
- id(-1)
- {}
- int Framework::Regex::Group::getStart()
- {
- return start;
- }
- int Framework::Regex::Group::getEnd()
- {
- return end;
- }
- int Framework::Regex::Group::getId()
- {
- return id;
- }
- Framework::Regex::Result::Result(Array<Group> groups)
- : ReferenceCounter(),
- groups(groups)
- {}
- int Framework::Regex::Result::getGroupCount()
- {
- return groups.getEintragAnzahl();
- }
- Group Framework::Regex::Result::getGroup(int index)
- {
- return groups.get(index);
- }
- int Framework::Regex::Result::getStart()
- {
- return groups.get(0).getStart();
- }
- int Framework::Regex::Result::getEnd()
- {
- return groups.get(0).getEnd();
- }
- RegexConfig::RegexConfig()
- {
- whitespaceChars = " \n\r\t";
- wordChars
- = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";
- }
- void RegexConfig::setWhitespaceChars(Text whitespaceChars)
- {
- this->whitespaceChars = whitespaceChars;
- }
- void RegexConfig::setWordChars(Text wordChars)
- {
- this->wordChars = wordChars;
- }
- Text RegexConfig::getWhitespaceChars() const
- {
- return whitespaceChars;
- }
- Text RegexConfig::getWordChars() const
- {
- return wordChars;
- }
- Automata<char>* Framework::Regex::parse(Text regex)
- {
- RegexConfig config;
- return parse(regex, config);
- }
- Automata<char>* Framework::Regex::parse(Text regex, RegexConfig& config)
- {
- int nextGroupId = 1;
- return ::parse(new Text(regex), nextGroupId, config);
- }
|