View Javadoc
1   package com.ljs.ootp.extract.html;
2   
3   import com.github.rholder.retry.RetryException;
4   import com.github.rholder.retry.Retryer;
5   import com.github.rholder.retry.RetryerBuilder;
6   import com.github.rholder.retry.StopStrategies;
7   import com.github.rholder.retry.WaitStrategies;
8   import com.google.common.base.Charsets;
9   import com.google.common.base.Throwables;
10  import java.io.IOException;
11  import java.io.InputStream;
12  import java.net.URL;
13  import java.util.concurrent.Callable;
14  import java.util.concurrent.ExecutionException;
15  import java.util.logging.Level;
16  import java.util.logging.Logger;
17  import org.jsoup.Jsoup;
18  import org.jsoup.nodes.Document;
19  
20  /**
21   *
22   * @author lstephen
23   */
24  public final class Documents {
25  
26      private static final Integer NUMBER_OF_RETRIES = 3;
27  
28      private static final Logger LOGGER =
29          Logger.getLogger(Documents.class.getName());
30  
31      private Documents() { }
32  
33      public static Document load(final String url) {
34          LOGGER.log(Level.INFO, "Loading page {0}...", url);
35  
36          Retryer<Document> retryer = RetryerBuilder
37              .<Document>newBuilder()
38              .retryIfException()
39              .withStopStrategy(StopStrategies.stopAfterAttempt(NUMBER_OF_RETRIES))
40              .withWaitStrategy(WaitStrategies.exponentialWait())
41              .build();
42  
43          try {
44              return retryer.call(new Callable<Document>() {
45                  @Override
46                  public Document call() throws IOException {
47                      try (
48                          InputStream in = new URL(url).openStream()) {
49  
50                          return Jsoup.parse(in, Charsets.ISO_8859_1.name(), "");
51                      }
52                  }
53              });
54          } catch (RetryException | ExecutionException e) {
55              throw Throwables.propagate(e);
56          }
57      }
58  
59  }