1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
|
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <vector>
/*
 * Return true when 'c' is a character that cannot appear inside a URL.
 *
 * A character is accepted (the function returns false) when it is
 * alphanumeric or is one of the punctuation characters in url_ch.
 */
static bool
not_url_char(char c)
{
    // Punctuation that may appear in a URL in addition to alphanumerics.
    // The quote character in this set is the apostrophe from RFC 1738's
    // "extra" set; a backtick is not a legal URL character.
    static const std::string url_ch = "~;/?:@=&$-_.+!*'(),";

    // The <cctype> classifiers are only defined for values representable
    // as unsigned char (or EOF). Passing a negative plain char, e.g. a
    // non-ASCII byte where char is signed, is undefined behaviour.
    if (std::isalnum(static_cast<unsigned char>(c)))
        return false;

    return url_ch.find(c) == std::string::npos;
}
/*
 * Locate the start of the first URL in the range [b, e).
 *
 * A URL is recognised as a run of one or more letters (the protocol
 * name) immediately followed by "://" and then at least one character
 * that not_url_char() accepts.
 *
 * Returns an iterator to the first letter of the protocol name, or 'e'
 * when the range contains no URL.
 */
static std::string::const_iterator
url_beg(
    std::string::const_iterator b,
    std::string::const_iterator e)
{
    static const std::string sep = "://";

    std::string::const_iterator i = b;
    while ((i = std::search(i, e, sep.begin(), sep.end())) != e) {
        // A separator at the very start of the range has no protocol
        // name in front of it, and one at the very end has nothing
        // after it; neither can be part of a URL.
        if (i != b && i + sep.size() != e) {
            // Walk backwards over the letters that form the protocol name.
            // The cast keeps isalpha() well defined for bytes that are
            // negative when char is signed.
            std::string::const_iterator beg = i;
            while (beg != b &&
                   std::isalpha(static_cast<unsigned char>(beg[-1])))
                --beg;

            // Require at least one letter before the separator and a
            // valid URL character right after it.
            if (beg != i && !not_url_char(i[sep.size()]))
                return beg;
        }

        // This separator was not part of a URL; resume the search
        // just past it.
        i += sep.size();
    }
    return e;
}
/*
 * Return an iterator to the first character in [b, e) that
 * not_url_char() rejects, or 'e' when every character in the range
 * is a valid URL character.
 */
static std::string::const_iterator
url_end(
    std::string::const_iterator b,
    std::string::const_iterator e)
{
    std::string::const_iterator pos = b;
    // Advance while we are still inside the range and on a URL character.
    while (pos != e && !not_url_char(*pos))
        ++pos;
    return pos;
}
/*
 * Collect every URL that occurs in the string 's'.
 *
 * Each element of the returned vector runs from the first letter of the
 * protocol name (as located by url_beg) up to, but not including, the
 * first character that url_end reports as not belonging to a URL.
 * The protocol name is not checked against any list of known protocols.
 * URLs are returned in the order in which they appear in 's'.
 */
std::vector<std::string>
find_urls(const std::string& s)
{
    std::vector<std::string> urls;
    const std::string::const_iterator last = s.end();
    std::string::const_iterator cursor = s.begin();

    while (cursor != last) {
        // Find the next "<letters>://<url-char>" occurrence, if any.
        const std::string::const_iterator start = url_beg(cursor, last);
        if (start == last)
            break;

        // Extend the match to the end of the URL and record it.
        const std::string::const_iterator stop = url_end(start, last);
        urls.push_back(std::string(start, stop));

        // Continue scanning right after the URL just stored.
        cursor = stop;
    }
    return urls;
}
|