/*
 * furl.cpp - find URLs embedded in a string.
 *
 * path: root/CPP/cpp_book/chap6/find_url/furl.cpp
 * blob: 17e9822d5d190721abab8ca9441cc098575bfbdd
 */
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

/*
 * Return true if 'c' is a character that cannot be part of a URL,
 * i.e. it is neither alphanumeric nor one of the punctuation
 * characters listed in url_ch.
 */
static bool
not_url_char(char c)
{
	/*
	 * Punctuation that may appear in a URL in addition to letters
	 * and digits.  The quote character in this set is the apostrophe
	 * (RFC 1738 lists ! * ' ( ) , as the "extra" characters), not
	 * the backtick.
	 */
	static const std::string url_ch = "~;/?:@=&$-_.+!*'(),";

	/*
	 * The <cctype> classifiers are only defined for EOF and for
	 * values representable as unsigned char, so convert first:
	 * passing a negative plain char is undefined behaviour.
	 */
	const unsigned char uc = static_cast<unsigned char>(c);

	if (std::isalnum(uc))
		return false;

	return url_ch.find(c) == std::string::npos;
}

/*
 * Find the beginning of the first URL in the range [b, e).
 *
 * A URL is recognised as one or more letters (the protocol name)
 * followed by "://" followed by at least one valid URL character.
 *
 * Returns an iterator to the first letter of the protocol name,
 * or 'e' if the range contains no URL.
 */
static std::string::const_iterator
url_beg(
	std::string::const_iterator b,
	std::string::const_iterator e)
{
	static const std::string sep = "://";

	std::string::const_iterator i = b;

	// i denotes the start of each "://" occurrence in turn
	while ((i = std::search(i, e, sep.begin(), sep.end())) != e) {

		// the separator must not sit at the very beginning or the
		// very end of the range: a URL needs text on both sides
		if (i != b && i + sep.size() != e) {

			// beg walks backwards over the letters that make up
			// the protocol name, never going before b
			std::string::const_iterator beg = i;
			while (beg != b &&
			       std::isalpha(static_cast<unsigned char>(beg[-1])))
				--beg;	// cast above: isalpha on a negative char is undefined

			// Is there at least one letter before the separator and
			// one valid URL character right after it?
			if (beg != i && !not_url_char(i[sep.size()]))
				return beg;
		}

		// The separator we found wasn't part of a URL;
		// advance i past it and keep looking
		i += sep.size();
	}

	return e;
}

/*
 * Return an iterator to the first character in [b, e) that cannot
 * be part of a URL, or 'e' if every character in the range can.
 */
static std::string::const_iterator
url_end(
	std::string::const_iterator b,
	std::string::const_iterator e)
{
	std::string::const_iterator cur = b;

	/* Step over characters for as long as they are valid in a URL */
	while (cur != e && !not_url_char(*cur))
		++cur;

	return cur;
}

/*
 * Scan the string 's' for URLs and return them, in order of
 * appearance, as a vector of strings.
 * Each stored string starts at the position reported by url_beg
 * and runs up to the position reported by url_end.
 */
std::vector<std::string>
find_urls(const std::string& s)
{
	typedef std::string::const_iterator iter;

	std::vector<std::string> urls;
	const iter last = s.end();

	for (iter pos = s.begin(); pos != last; ) {

		/* Jump to the start of the next URL, if there is one */
		pos = url_beg(pos, last);
		if (pos == last)
			break;

		/* Locate the first character past the URL */
		const iter stop = url_end(pos, last);

		/* Record the URL text */
		urls.push_back(std::string(pos, stop));

		/* Resume the search right after the URL just stored */
		pos = stop;
	}

	return urls;
}