Skip to content

Commit

Permalink
Merge pull request karussell#34 from skyshard/abhishek/empty_extracti…
Browse files Browse the repository at this point in the history
…on_apnews

Fixed content extraction for apnews.com
  • Loading branch information
andresp99999 authored Jun 12, 2017
2 parents 8a3fab2 + 712eea5 commit 93e3fc4
Show file tree
Hide file tree
Showing 8 changed files with 1,280 additions and 19 deletions.
6 changes: 6 additions & 0 deletions src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -2183,6 +2183,12 @@ protected String extractAuthorName(Document doc) {
if(DEBUG_AUTHOR_EXTRACTION && matches!=null && matches.size()>0) System.out.println("AUTHOR: span[itemprop=author]");
}

// a hack for http://apnews.com/
if(matches == null || matches.size() == 0){
matches = doc.select("[class=mobile] h6");
if(DEBUG_AUTHOR_EXTRACTION && matches!=null && matches.size()>0) System.out.println("AUTHOR: [class=mobile] h6");
}

// select the best element from them
if(matches != null){
Element bestMatch = getBestMatchElement(matches);
Expand Down
39 changes: 21 additions & 18 deletions src/main/java/de/jetwick/snacktory/HtmlFetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,29 +15,26 @@
*/
package de.jetwick.snacktory;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.FileNotFoundException;
import de.jetwick.snacktory.utils.SSLConnectionSocketFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.net.ssl.*;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.Proxy;
import java.net.URL;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.net.ssl.*;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;

/**
* Class to fetch articles. This class is thread safe.
Expand Down Expand Up @@ -495,12 +492,18 @@ protected HttpURLConnection createUrlConnection(String urlAsStr, int timeout,
if(DISABLE_SSL_VERIFICATION){
if (urlAsStr.toLowerCase().startsWith("https://")){
try {
SSLContext sslc = SSLContext.getInstance("TLS");
TrustManager[] trustManagerArray = { new NullX509TrustManager() };
sslc.init(null, trustManagerArray, null);
List sniHostNames = new ArrayList() {{
add(new SNIHostName(url.getHost()));
}};
SSLParameters sslParameters = new SSLParameters();
sslParameters.setServerNames(sniHostNames);

SSLContext sslContext = SSLContext.getInstance("TLS");
sslContext.init(null, new TrustManager[]{new NullX509TrustManager()}, null);

HttpsURLConnection hConnSecure = (HttpsURLConnection) hConn;
hConnSecure.setDefaultSSLSocketFactory(sslc.getSocketFactory());
hConnSecure.setDefaultHostnameVerifier(new NullHostnameVerifier());
hConnSecure.setSSLSocketFactory(new SSLConnectionSocketFactory(sslContext.getSocketFactory(), sslParameters));
hConnSecure.setHostnameVerifier(new NullHostnameVerifier());
} catch(Exception e) {
e.printStackTrace();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package de.jetwick.snacktory.utils;


import javax.net.ssl.SSLParameters;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.SSLSocketFactory;
import java.io.IOException;
import java.net.InetAddress;
import java.net.Socket;
import java.net.UnknownHostException;
import java.util.Objects;

/**
 * An {@link SSLSocketFactory} decorator that applies a fixed set of {@link SSLParameters}
 * to every socket it creates.
 * <p>
 * In Java 8, bypassing hostname verification has the side effect that the default
 * {@code SSLSocketFactory} also skips the SNI (Server Name Indication) header. Wrapping the
 * real factory in this class, with parameters that carry the server names, puts the SNI
 * header back into the SSL requests.
 * <p>
 * Ref: http://javabreaks.blogspot.in/2015/12/java-ssl-handshake-with-server-name.html
 */
public class SSLConnectionSocketFactory extends SSLSocketFactory {

    /** Factory that actually creates the sockets. */
    private final SSLSocketFactory sslSocketFactory;

    /** Parameters applied to each socket right after it has been created. */
    private final SSLParameters sslParameters;

    /**
     * Creates a factory that delegates socket creation to {@code sslSocketFactory} and then
     * configures each new socket with {@code sslParameters}.
     *
     * @param sslSocketFactory the factory to delegate to; must not be {@code null}
     * @param sslParameters    the parameters to apply to every created socket; must not be
     *                         {@code null}
     * @throws NullPointerException if either argument is {@code null}
     */
    public SSLConnectionSocketFactory(SSLSocketFactory sslSocketFactory, SSLParameters sslParameters) {
        // Fail at construction time rather than with an anonymous NPE in the middle of a handshake.
        this.sslSocketFactory = Objects.requireNonNull(sslSocketFactory, "sslSocketFactory");
        this.sslParameters = Objects.requireNonNull(sslParameters, "sslParameters");
    }

    /** {@inheritDoc} Answered by the wrapped factory. */
    @Override
    public String[] getDefaultCipherSuites() {
        return sslSocketFactory.getDefaultCipherSuites();
    }

    /** {@inheritDoc} Answered by the wrapped factory. */
    @Override
    public String[] getSupportedCipherSuites() {
        return sslSocketFactory.getSupportedCipherSuites();
    }

    /**
     * Layers an SSL socket over {@code socket} using the wrapped factory and applies the
     * configured parameters to it.
     */
    @Override
    public Socket createSocket(Socket socket, String host, int port, boolean autoClose) throws IOException {
        return configure(sslSocketFactory.createSocket(socket, host, port, autoClose));
    }

    /** Creates a socket via the wrapped factory and applies the configured parameters to it. */
    @Override
    public Socket createSocket(String host, int port) throws IOException, UnknownHostException {
        return configure(sslSocketFactory.createSocket(host, port));
    }

    /** Creates a socket via the wrapped factory and applies the configured parameters to it. */
    @Override
    public Socket createSocket(String host, int port, InetAddress localHost, int localPort)
            throws IOException {
        return configure(sslSocketFactory.createSocket(host, port, localHost, localPort));
    }

    /** Creates a socket via the wrapped factory and applies the configured parameters to it. */
    @Override
    public Socket createSocket(InetAddress host, int port) throws IOException {
        return configure(sslSocketFactory.createSocket(host, port));
    }

    /** Creates a socket via the wrapped factory and applies the configured parameters to it. */
    @Override
    public Socket createSocket(InetAddress address, int port, InetAddress localAddress, int localPort)
            throws IOException {
        return configure(sslSocketFactory.createSocket(address, port, localAddress, localPort));
    }

    /**
     * Applies the configured {@link SSLParameters} to a socket produced by the wrapped factory.
     *
     * @param socket the freshly created socket; expected to be an {@link SSLSocket}
     * @return the same socket, after the parameters have been set on it
     * @throws ClassCastException if the wrapped factory returned something other than an
     *                            {@code SSLSocket}
     */
    private SSLSocket configure(Socket socket) {
        SSLSocket sslSocket = (SSLSocket) socket;
        sslSocket.setSSLParameters(sslParameters);
        return sslSocket;
    }
}
2 changes: 1 addition & 1 deletion src/main/resources/config.yml
Original file line number Diff line number Diff line change
@@ -1 +1 @@
defaultTimezone: UTC
defaultTimezone: UTC
30 changes: 30 additions & 0 deletions src/test/java/de/jetwick/snacktory/ArticleTextExtractorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3082,6 +3082,36 @@ public void testPublicNet() throws Exception {
compareDates("2017-05-12 00:00:00", res.getDate());
}

@Test
public void testApnews() throws Exception {
    // Fixture: saved copy of the AMP page at the URL below. The test expects the article
    // body, title, canonical URL and date to be extracted, and no author to be found.
    final String url = "https://www.apnews.com/amp/0290dd1b2048498783f3d7d0ea28a10d";
    final String html = c.streamToString(getClass().getResourceAsStream("apnews.html"));

    JResult article = new JResult();
    article.setUrl(url);
    article = extractor.extractContent(article, html);

    assertEquals("https://www.apnews.com/0290dd1b2048498783f3d7d0ea28a10d", article.getCanonicalUrl());
    assertEquals("Deputies fatally shoot man who reportedly shot at neighbor", article.getTitle());

    // The extracted text doubles as the failure message so a mismatch shows what was extracted.
    final String text = article.getText();
    assertTrue(text, text.startsWith("KNOXVILLE, Tenn. (AP) — Knox County Sheriff's Office deputies shot"));
    assertTrue(text, text.endsWith("Information from: Knoxville News Sentinel, http://www.knoxnews.com"));

    assertEquals(StringUtils.EMPTY, article.getAuthorName());
    assertEquals(StringUtils.EMPTY, article.getAuthorDescription());
    compareDates("2017-05-24 21:50:34", article.getDate());
}

@Test
public void testApnews1() throws Exception {
    // Fixture: saved copy of the regular (non-AMP) article page at the URL below. Unlike
    // testApnews, this one expects an author name and description to be extracted.
    final String url =
            "https://www.apnews.com/b823ddc6905d454dacc65fd107730f95/Playmate-tastes-shame,-ordered-to-clean-up-urban-grit";
    final String html = c.streamToString(getClass().getResourceAsStream("apnews1.html"));

    JResult article = new JResult();
    article.setUrl(url);
    article = extractor.extractContent(article, html);

    assertEquals("https://www.apnews.com/b823ddc6905d454dacc65fd107730f95", article.getCanonicalUrl());
    assertEquals("Playmate tastes shame, ordered to clean up urban grit", article.getTitle());

    // The extracted text doubles as the failure message so a mismatch shows what was extracted.
    final String text = article.getText();
    assertTrue(text, text.startsWith("LOS ANGELES (AP) — Dani Mathers earned fame posing as a nude model."));
    assertTrue(text, text.endsWith("where people might be naked or expect privacy."));

    assertEquals("BRIAN MELLEY", article.getAuthorName());
    assertEquals("By BRIAN MELLEY", article.getAuthorDescription());
    compareDates("2017-05-25 00:00:00", article.getDate());
}

@Test
public void testMorningStar() throws Exception {
// http://www.morningstar.com/news/associated-press/urn:publicid:ap.org:f8d53c4370434744a864d4afa5fa8d36/hackers-break-into-centralized-password-manager-onelogin.html
Expand Down
Loading

0 comments on commit 93e3fc4

Please sign in to comment.