1 package com.ljs.ootp.extract.html;
2
3 import com.github.rholder.retry.RetryException;
4 import com.github.rholder.retry.Retryer;
5 import com.github.rholder.retry.RetryerBuilder;
6 import com.github.rholder.retry.StopStrategies;
7 import com.github.rholder.retry.WaitStrategies;
8 import com.google.common.base.Charsets;
9 import com.google.common.base.Throwables;
10 import java.io.IOException;
11 import java.io.InputStream;
12 import java.net.URL;
13 import java.util.concurrent.Callable;
14 import java.util.concurrent.ExecutionException;
15 import java.util.logging.Level;
16 import java.util.logging.Logger;
17 import org.jsoup.Jsoup;
18 import org.jsoup.nodes.Document;
19
20
21
22
23
24 public final class Documents {
25
26 private static final Integer NUMBER_OF_RETRIES = 3;
27
28 private static final Logger LOGGER =
29 Logger.getLogger(Documents.class.getName());
30
31 private Documents() { }
32
33 public static Document load(final String url) {
34 LOGGER.log(Level.INFO, "Loading page {0}...", url);
35
36 Retryer<Document> retryer = RetryerBuilder
37 .<Document>newBuilder()
38 .retryIfException()
39 .withStopStrategy(StopStrategies.stopAfterAttempt(NUMBER_OF_RETRIES))
40 .withWaitStrategy(WaitStrategies.exponentialWait())
41 .build();
42
43 try {
44 return retryer.call(new Callable<Document>() {
45 @Override
46 public Document call() throws IOException {
47 try (
48 InputStream in = new URL(url).openStream()) {
49
50 return Jsoup.parse(in, Charsets.ISO_8859_1.name(), "");
51 }
52 }
53 });
54 } catch (RetryException | ExecutionException e) {
55 throw Throwables.propagate(e);
56 }
57 }
58
59 }