Program Listing for File path_match.h#

Return to documentation for file (path_match.h)

#ifndef WILDCARD_MATCH_H
#define WILDCARD_MATCH_H

#include <filesystem>
#include <string>
#include <vector>
#include <regex>
#include <stdexcept>
#include <stack>
#include <list>

inline bool CheckWildcardPathMatch(const std::filesystem::path& rpathRel, const std::string& rssPattern)
{
    std::filesystem::path pathPattern = std::filesystem::u8path(rssPattern);
    auto itPathRel = rpathRel.begin();
    auto itPathPattern = pathPattern.begin();

    // Iterate through the paths
    bool bLastPartAsterisk = false; // Special case when the last checked part is a single asterisk.
    while (itPathRel != rpathRel.end() && itPathPattern != pathPattern.end())
    {
        // Get the parts and increase
        std::string ssPathPart = itPathRel->generic_u8string();
        std::string ssPatternPart = itPathPattern->generic_u8string();
        itPathRel++;
        itPathPattern++;

        // Compose a new path from the rest of the path starting at the provided iterator.
        auto fnBuildNewPath = [](std::filesystem::path::iterator& ritPath, const std::filesystem::path& rpath)
        {
            std::filesystem::path pathNew;
            while (ritPath != rpath.end())
            {
                pathNew /= *ritPath;
                ++ritPath;
            }
            return pathNew;
        };

        // Special case... skipping zero or more path parts.
        if (ssPatternPart == "**")
        {
            // The pattern "**/.." or even multiple higher directories like "**/../.." would indicate that any number of directories
            // can be skipped, but at least the amount of directories need to be present to be able to go higher. By building
            // potential relative paths (below), skip the amount of paths going higher.
            size_t nSkipParent = 0;
            while (itPathPattern != pathPattern.end() && itPathPattern->generic_u8string() == "..")
            {
                itPathPattern++;
                nSkipParent++;
            }

            // Build a new pattern from the rest of the pattern.
            std::filesystem::path pathNewPattern = fnBuildNewPath(itPathPattern, pathPattern);

            // Build a list of new potential relative paths.
            // NOTE: More efficient would be to execute the matching algorithm for each path-part until the end of the path.
            // Since the path iterator doesn't allow copying like it does with the container classes, it doesn't work to reuse the
            // ierator to build each new path on the fly. Instead, first create a list of potential paths and then execute the
            // matching process.
            std::vector<std::filesystem::path> vecNewRelPaths;
            itPathRel--;    // Include the current path part as well.
            while (itPathRel != rpathRel.end())
            {
                // Skip the amount of parent parts if ".." was detected.
                if (nSkipParent)
                {
                    itPathRel++;
                    nSkipParent--;
                    continue;
                }

                // Add to all existing paths in the vector
                for (auto& rpath : vecNewRelPaths)
                    rpath /= *itPathRel;

                // Add as a separate part to the vector
                vecNewRelPaths.push_back(*itPathRel);

                // Next part
                itPathRel++;
            }

            // When the amount of parents to be skipped is higher than 0, there is no parent to skip anymore.
            if (nSkipParent) return false;

            // Iterate through the list of potential relative paths and check for at least one fit...
            for (const auto& rpathNewRel : vecNewRelPaths)
            {
                // Check for this path. If it matches... it is valid.
                if (CheckWildcardPathMatch(rpathNewRel, pathNewPattern.generic_u8string()))
                    return true;
            }

            // If none fit, there is no match.
            return false;
        }

        // Special case... ignore current test.
        if (ssPatternPart == ".")
        {
            // Redo the last path iteration and continue with the matching process.
            itPathRel--;
            continue;
        }

        // Special case... use parent path.
        if (ssPatternPart == "..")
        {
            // If the path also has a higher directory, everything is good
            if (ssPathPart == "..") continue;

            // Redo the last path iteration.
            itPathRel--;

            // Does a higher part path exists?
            if (itPathRel == rpathRel.begin())
            {
                // A higher directory doesn't exist. Create a new pattern starting with "..".
                std::filesystem::path pathNewPattern = fnBuildNewPath(itPathPattern, pathPattern);

                //  Create a new path from the path starting with "..".
                std::filesystem::path pathNew = ".." / rpathRel;

                // Redo the matching process.
                // NOTE: Since ".." was prepended, only generic patterns with "*" and "**" can match.
                return CheckWildcardPathMatch(pathNew, pathNewPattern.generic_u8string());
            }
            else
            {
                // Redo the path before the last path iteration as well.
                itPathRel--;
                continue;
            }
        }

        // Special case... skipping the current part
        if (ssPatternPart == "*")
        {
            // Are there any pattern parts following; if not, the last pattern part is an asterisk.
            if (itPathPattern == pathPattern.end())
                bLastPartAsterisk = true;
            else if (itPathPattern->generic_u8string() == "..")
            {
                // The combination "*/.." cancels itself.
                itPathRel--;
                itPathPattern++;
                continue;
            }
        }

        // Iterate through the path parts
        size_t nPathPartPos = 0;
        size_t nPatternPartPos = 0;
        while (nPathPartPos != ssPathPart.size() && nPatternPartPos != ssPatternPart.size())
        {
            // Get the characters and increase
            char cPathPart = ssPathPart[nPathPartPos];
            char cPatternPart = ssPatternPart[nPatternPartPos];
            nPathPartPos++;
            nPatternPartPos++;

            // Classify and check with the pattern.
            switch (cPatternPart)
            {
            case '?':
                // Any character is allowed.
                break;
            case '*':
                // Skip zero or more wildcard-characters following the asterisk.
                while (nPatternPartPos != ssPatternPart.size() &&
                    (ssPatternPart[nPatternPartPos] == '*' || ssPatternPart[nPatternPartPos] == '?'))
                    nPatternPartPos++;

                // When there are no more characters following, the rest of the path part fits.
                if (nPatternPartPos == ssPatternPart.size())
                {
                    nPathPartPos = ssPathPart.size();
                    break;
                }

                // Skip zero or more characters in the path part until the next fitting characters.
                cPatternPart = ssPatternPart[nPatternPartPos];
                nPathPartPos--; // Start with the current path
                while (nPathPartPos != ssPathPart.size() && ssPathPart[nPathPartPos] != cPatternPart)
                    nPathPartPos++;

                // When the path part pos has reached the end, there is no match.
                if (nPathPartPos == ssPathPart.size())
                    return false;

                // Go to the next character
                nPathPartPos++;
                nPatternPartPos++;
                break;
            default:
                // Character needs to fit
                if (cPathPart != cPatternPart) return false;
                break;
            }
        }

        // If the path part has reached the end, but the pattern still has space, this might indicate a mismatch. But only when the
        // rest of the pattern doesn't contain one or more asterisk '*' until the end of the path. Simply skip these characters.
        while (nPatternPartPos != ssPatternPart.size() && ssPatternPart[nPatternPartPos] == '*')
            nPatternPartPos++;

        // When both have reached the end, the path parts are identical.
        if (nPathPartPos == ssPathPart.size() && nPatternPartPos == ssPatternPart.size())
            continue;

        // No match
        return false;
    }

    // When both have reached the end of iteration, we have a match of files.
    if (itPathRel == rpathRel.end() && itPathPattern == pathPattern.end())
        return true;

    // When only itPathPattern has reached the end of iteration, we have a match of directories... All files in the directory (but
    // not in any sub-directories) are valid. The exception would be if the pattern was "*", which is forced here to match files
    // only. In that case bLastPartAsterisk would be true.
    if (itPathRel != rpathRel.end() && itPathPattern == pathPattern.end() && !bLastPartAsterisk)
    {
        itPathRel++;
        return itPathRel == rpathRel.end();
    }

    // No match...
    return false;
}

inline std::string NormalizeRegexPattern(const std::string& rssPattern)
{
    // Throw regex group mismatch exception
    auto fnThrowGroupException = [](char c)
    {
        switch (c)
        {
        case '(':
        case ')':
            throw std::regex_error(std::regex_constants::error_paren);
        case '[':
        case ']':
            throw std::regex_error(std::regex_constants::error_brack);
        case '{':
        case '}':
            throw std::regex_error(std::regex_constants::error_brace);
        default:
            break;
        }
    };

    // Return the matching group character (parenthesis, brakcets or braces)
    auto fnGroupMatch = [](char cOpenGroup) -> char
    {
        switch (cOpenGroup)
        {
        case '(': return ')';
        case ')': return '(';
        case '[': return ']';
        case ']': return '[';
        case '{': return '}';
        case '}': return '{';
        default: return cOpenGroup;
        }
    };

    // Split the pattern in chunks
    std::list<std::string> lstPattern;
    std::string ssPart;
    size_t nPos = 0;
    std::stack<char> stackGroup;
    while (nPos < rssPattern.size())
    {
        char c = rssPattern[nPos];
        nPos++;

        switch (c)
        {
        case '(': // Group (...)
        case '[': // Group [...]
        case '{': // Group {...}
            stackGroup.push(c);
            ssPart += c;
            break;
        case ')': // Group (...)
        case ']': // Group [...]
        case '}': // Group {...}
            if (stackGroup.empty()) fnThrowGroupException(c);
            if (fnGroupMatch(stackGroup.top()) != c) fnThrowGroupException(stackGroup.top());
            stackGroup.pop();
            ssPart += c;
            break;
        case '\\': // Escape
            ssPart += c;
            if (nPos >= rssPattern.size()) throw std::regex_error(std::regex_constants::error_escape);
            c = rssPattern[nPos];
            ssPart += c;
            nPos++;
            break;
        case '/': // Separator
            ssPart += c;
            if (stackGroup.empty())
            {
                lstPattern.push_back(std::move(ssPart));
                ssPart.clear();
            }
            break;
        default:
            ssPart += c;
            break;
        }
    }
    if (!stackGroup.empty()) fnThrowGroupException(stackGroup.top());
    if (!ssPart.empty())
        lstPattern.push_back(std::move(ssPart));

    // Erase all entries to the current directory
    lstPattern.erase(std::remove(lstPattern.begin(), lstPattern.end(), "\\./"), lstPattern.end());

    // Handle parent directories
    auto itPart = lstPattern.begin();
    while (itPart != lstPattern.end())
    {
        // Next
        auto itCurrent = itPart;
        itPart++;

        // Check for any directory followed by a parent directory - change to any directory: ".*\\.\\./"
        if (*itCurrent == ".*\\.\\./")
        {
            *itCurrent = ".*/";
            continue;
        }

        // Check for parent directory
        if (*itCurrent == "\\.\\./")
        {
            // Check whether there is a directory before
            if (itCurrent != lstPattern.begin())
            {
                // Get the directory before
                auto itBefore = itCurrent;
                itBefore--;

                // Anything with .* in it will be truncated: "abc.*/", ".*jojo/", ".*/"
                nPos = itBefore->rfind(".*");
                if (nPos != std::string::npos)
                {
                    *itBefore = itBefore->substr(nPos, 2) + '/';
                    lstPattern.erase(itCurrent);
                    continue;
                }

                // All others are considered a single directory and will be removed
                lstPattern.erase(itBefore);
                lstPattern.erase(itCurrent);
                continue;
            }
            else if (itPart != lstPattern.end())
            {
                // Get the directory behind
                auto itBehind = itPart;
                itPart++;

                // Get the directory after. This can only work if the directory after can be anything: starting with ".*/", ".*anything" or "[^/]+/"
                if (*itBehind == ".*/")
                {
                    lstPattern.erase(itCurrent);
                    *itBehind = ".*";   // Remove trailing slash
                    continue;
                }
                if (itBehind->substr(0, 2) == ".*")
                {
                    lstPattern.erase(itCurrent);
                    continue;
                }
                else if (*itBehind == "[^/]+/")
                {
                    lstPattern.erase(itCurrent);
                    lstPattern.erase(itBehind);
                    continue;
                }

                // Invalid use of "\\.\\."
                throw std::regex_error(std::regex_constants::error_complexity);
            }
            else
            {
                // Invalid use of "\\.\\."
                throw std::regex_error(std::regex_constants::error_complexity);
            }
        }
    }

    // Rebuild the pattern
    std::string ssPatternNew;
    for (const std::string& rssPart : lstPattern)
        ssPatternNew += rssPart;

    return ssPatternNew;
}

inline bool CheckRegexPathMatch(const std::filesystem::path& rpathRel, const std::string& rssPattern)
{
    try
    {
        // Convert paths to generic, portable string format (with '/' separators)
        const std::string ssPath = rpathRel.generic_u8string();

        // Use the ECMAScript grammar, which is the default and most common.
        const std::regex rgx(NormalizeRegexPattern(rssPattern), std::regex::ECMAScript);

        // std::regex_match checks if the entire string is a match for the pattern.
        return std::regex_match(ssPath, rgx);
    }
    catch (const std::regex_error& e)
    {
        // In case of an invalid regex pattern, log the error and return false.
        std::cerr << "Regex error: " << e.what() << " for pattern: " << rssPattern << std::endl;
        return false;
    }
}

enum class EPathMatchAlgorithm
{
    wildcards,
    regex,
};

inline std::vector<std::filesystem::path> CollectWildcardPath(const std::filesystem::path& rpathBase,
    const std::string& rssPattern, EPathMatchAlgorithm eAlgorithm = EPathMatchAlgorithm::wildcards)
{
    std::vector<std::filesystem::path> pathMatches;
    try
    {
        std::filesystem::path pathBaseCopy = rpathBase;
        std::string ssPatternCopy = rssPattern;
        bool bUseAbsolutePath = false;
        if (pathBaseCopy.empty())
        {
            // Get the first position of a path not including wildcard and/or regex special characters
            size_t nPos = rssPattern.find_first_of("[](){}*?.\\+");

            // Get the directory separator before that
            nPos = rssPattern.substr(0, nPos).find_last_of('/');

            // The base pattern is everything before the '/'.
            if (nPos != std::string::npos)
            {
                pathBaseCopy = rssPattern.substr(0, nPos);
                ssPatternCopy = rssPattern.substr(nPos + 1);
                bUseAbsolutePath = true;
            }
        }
        else if (std::filesystem::path(rssPattern).is_absolute())
        {
            // Separator processing function.
            size_t nSearchPos = 0;
            auto fnProcessSeparator = [&rssPattern, &nSearchPos]() -> bool
            {
#ifdef _WIN32
                if (nSearchPos >= rssPattern.size() || (rssPattern[nSearchPos] != '/' && rssPattern[nSearchPos] != '\\'))
#else
                if (nSearchPos >= rssPattern.size() || rssPattern[nSearchPos] != '/')
#endif
                    return false;
                nSearchPos++;
                return true;
            };

            // Remove the base from the pattern.
            bool bExplicitSeparator = false;
            for (const auto& rpathBasePart : rpathBase)
            {
                // Skip separator in base path
#ifdef _WIN32
                if (rpathBasePart == "/" || rpathBasePart == "\\")
#else
                if (rpathBasePart == "/")
#endif
                {
                    fnProcessSeparator();
                    bExplicitSeparator = true;
                    continue;
                }

                // Separator needed?
                if (!bExplicitSeparator && nSearchPos && !fnProcessSeparator())
                    return {};
                bExplicitSeparator = false;

                // Check for the base part inside the pattern
                std::string ssPathPart = rpathBasePart.generic_u8string();
                if (rssPattern.substr(nSearchPos, ssPathPart.size()) != ssPathPart)
                    return {};
                nSearchPos += ssPathPart.size();
            }

            // Skip trailing slash if existing
#ifdef _WIN32
            if (nSearchPos < rssPattern.size() &&
                (rssPattern[nSearchPos] == '/' || (rssPattern[nSearchPos] == '\\' && eAlgorithm == EPathMatchAlgorithm::wildcards)))
#else
            if (nSearchPos < rssPattern.size() && rssPattern[nSearchPos] == '/')
#endif
                nSearchPos++;

            // Copy the pattern following the parg
            ssPatternCopy = rssPattern.substr(nSearchPos);

            // If the pattern was absolute and the represents exactly the base path, search for all files in the top most directory.
            if (ssPatternCopy.empty())
            {
                // Add wildcards or regex structure for all files in all directories
                switch (eAlgorithm)
                {
                case EPathMatchAlgorithm::regex:
                    ssPatternCopy = "[^/]+";
                    break;
                case EPathMatchAlgorithm::wildcards:
                default:
                    ssPatternCopy = "*";
                    break;
                }
            }
        }

        // If there is no pattern, then all content of the directory of the base path is required.
        if (ssPatternCopy.empty())
        {
            // Add wildcards or regex structure for all files in all directories
            switch (eAlgorithm)
            {
            case EPathMatchAlgorithm::regex:
                ssPatternCopy = ".*";
                break;
            case EPathMatchAlgorithm::wildcards:
            default:
                ssPatternCopy = "**";
                break;
            }
        }

        // Iterate through the path files of the base path.
        for (auto const& rDirEntry : std::filesystem::recursive_directory_iterator(pathBaseCopy))
        {
            if (!rDirEntry.is_regular_file()) continue;
            std::filesystem::path pathRel = rDirEntry.path().lexically_relative(pathBaseCopy);
            std::filesystem::path pathToAdd = bUseAbsolutePath ? rDirEntry.path() : pathRel;
            switch (eAlgorithm)
            {
            case EPathMatchAlgorithm::wildcards:
                if (CheckWildcardPathMatch(pathRel, ssPatternCopy))
                    pathMatches.push_back(pathToAdd);
                break;
            case EPathMatchAlgorithm::regex:
                if (CheckRegexPathMatch(pathRel, ssPatternCopy))
                    pathMatches.push_back(pathToAdd);
                break;
            default:
                break;
            }
        }
    }
    catch (std::filesystem::filesystem_error& rexception)
    {
        std::cerr << rexception.what();
    }
    return pathMatches;
}

#endif // WILDCARD_MATCH_H