// NOTE(review): the include targets were missing from this file as received.
// These are the standard headers the code below requires; if a project header
// (for example one declaring find_urls) was originally listed, re-add it here.
#include <algorithm>
#include <cctype>
#include <string>
#include <vector>

/*
 * Return true when 'c' is NOT a character that may appear in a URL.
 *
 * A URL character is any alphanumeric character or one of the punctuation
 * characters listed in 'url_ch'.
 */
static bool not_url_char(char c)
{
    // Punctuation allowed in a URL besides letters and digits. The apostrophe
    // (not the backtick) is the character RFC 1738 lists among its "extra"
    // characters; the backtick is classed as unsafe there.
    static const std::string url_ch = "~;/?:@=&$-_.+!*'(),";

    // The <cctype> classifiers are undefined for negative arguments, so the
    // char is converted to unsigned char before being classified.
    const unsigned char uc = static_cast<unsigned char>(c);

    return !(std::isalnum(uc) || url_ch.find(c) != std::string::npos);
}

/*
 * Find the start of the first URL in the range [b, e).
 *
 * A URL is recognised as one or more alphabetic characters (the protocol
 * name) followed by "://" followed by at least one URL character.
 *
 * Returns an iterator to the first character of the protocol name, or 'e'
 * when the range holds no URL.
 */
static std::string::const_iterator url_beg(
    std::string::const_iterator b, std::string::const_iterator e)
{
    static const std::string sep = "://";
    const std::string::difference_type sep_len =
        static_cast<std::string::difference_type>(sep.size());

    std::string::const_iterator i = b;
    while ((i = std::search(i, e, sep.begin(), sep.end())) != e) {
        // The separator must be neither the first nor the last thing in the
        // range: something has to precede it and something has to follow it.
        if (i != b && i + sep_len != e) {
            // Walk backwards over the letters that form the protocol name.
            std::string::const_iterator beg = i;
            while (beg != b &&
                   std::isalpha(static_cast<unsigned char>(beg[-1]))) {
                --beg;
            }

            // Accept only if the protocol name is non-empty and the character
            // right after the separator can belong to a URL.
            if (beg != i && !not_url_char(i[sep_len])) {
                return beg;
            }
        }

        // This separator was not part of a URL; resume the search after it.
        i += sep_len;
    }
    return e;
}

/*
 * Return an iterator to the first character in [b, e) that cannot be part of
 * a URL, or 'e' when every character in the range is a URL character.
 */
static std::string::const_iterator url_end(
    std::string::const_iterator b, std::string::const_iterator e)
{
    return std::find_if(b, e, not_url_char);
}

/*
 * Find the URLs in the string 's' and return a vector containing all of
 * them, in order of appearance.
 *
 * Each returned string is the complete URL as it appears in 's', starting at
 * the protocol name (e.g. "http://example.com/path"). The result is empty
 * when 's' contains no URL.
 */
std::vector<std::string> find_urls(const std::string& s)
{
    std::vector<std::string> ret;

    std::string::const_iterator b = s.begin();
    const std::string::const_iterator e = s.end();

    // Look for one or more letters followed by "://"; stop when none is left.
    while ((b = url_beg(b, e)) != e) {
        // Locate the end of the URL that starts at b.
        const std::string::const_iterator after = url_end(b, e);

        ret.push_back(std::string(b, after));

        // Continue scanning right after the URL just stored. 'after' is
        // always past 'b' because the protocol letters are URL characters.
        b = after;
    }
    return ret;
}