diff options
| author | Carlos Maiolino <[email protected]> | 2026-02-20 16:17:14 +0100 |
|---|---|---|
| committer | Carlos Maiolino <[email protected]> | 2026-02-20 16:17:14 +0100 |
| commit | 4ff0e42f65d8bba3d21bed53bfe1251d8db5c13f (patch) | |
| tree | 9ac61873e2676c3f392bd26063afeb16897c3902 /CPP/cpp_book/chap6/find_url/furl.cpp | |
| parent | fd313dd5ad9ac067a31f2b1760b85bd305567131 (diff) | |
Diffstat (limited to 'CPP/cpp_book/chap6/find_url/furl.cpp')
| -rw-r--r-- | CPP/cpp_book/chap6/find_url/furl.cpp | 88 |
1 files changed, 88 insertions, 0 deletions
// furl.cpp - locate URLs of the form "protocol://resource" inside a string.

#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

/*
 * Return true when 'c' can NOT appear inside a URL, i.e. it is neither
 * alphanumeric nor one of the punctuation characters listed in url_ch.
 */
static bool
not_url_char(char c)
{
    // NOTE(review): the set contains a backtick; RFC 1738 lists an
    // apostrophe (') among its "extra" characters instead - confirm which
    // one was intended before changing it.
    static const std::string url_ch = "~;/?:@=&$-_.+!*`(),";

    // The <cctype> classifiers require a value representable as
    // unsigned char; passing a negative plain char is undefined behaviour.
    const unsigned char uc = static_cast<unsigned char>(c);

    return !(std::isalnum(uc) ||
             std::find(url_ch.begin(), url_ch.end(), c) != url_ch.end());
}

/*
 * Find the start of the first URL in [b, e).
 *
 * A URL is recognised as one or more alphabetic characters (the protocol
 * name) followed by "://" and at least one valid URL character.
 *
 * Returns an iterator to the first character of the protocol name, or 'e'
 * when the range contains no URL.
 */
static std::string::const_iterator
url_beg(
    std::string::const_iterator b,
    std::string::const_iterator e)
{
    static const std::string sep = "://";
    const std::string::difference_type sep_len =
        static_cast<std::string::difference_type>(sep.size());

    std::string::const_iterator i = b;

    while ((i = std::search(i, e, sep.begin(), sep.end())) != e) {

        // The separator must have something before it and after it.
        if (i != b && i + sep_len != e) {

            // Walk backwards over the letters that form the protocol name.
            std::string::const_iterator beg = i;
            while (beg != b &&
                   std::isalpha(static_cast<unsigned char>(beg[-1])))
                --beg;

            // Need at least one protocol letter before the separator and
            // one valid URL character right after it.
            if (beg != i && !not_url_char(i[sep_len]))
                return beg;
        }

        // This separator was not part of a URL; resume the search after it.
        i += sep_len;
    }

    return e;
}

/*
 * Return an iterator to the first character in [b, e) that cannot be part
 * of a URL, or 'e' when every character in the range is a URL character.
 */
static std::string::const_iterator
url_end(
    std::string::const_iterator b,
    std::string::const_iterator e)
{
    return std::find_if(b, e, not_url_char);
}

/*
 * Find every URL in the string 's' and return them, in order of
 * appearance, as a vector of strings.
 *
 * Each returned string starts at the protocol name (e.g. "http://...")
 * and runs up to the first character that is not valid inside a URL.
 * An input without URLs yields an empty vector.
 */
std::vector<std::string>
find_urls(const std::string& s)
{
    std::vector<std::string> ret;
    std::string::const_iterator b = s.begin();
    std::string::const_iterator e = s.end();

    while (b != e) {

        /* Look for one or more letters followed by "://" */
        b = url_beg(b, e);

        /* Found one? */
        if (b != e) {

            /* Locate the end of this URL */
            std::string::const_iterator after = url_end(b, e);

            /* Save the URL, protocol included */
            ret.push_back(std::string(b, after));

            /* Continue scanning right after the URL just stored */
            b = after;
        }
    }

    return ret;
}
