Write a program to read the content of any of the below websites and all of their sub-pages, and perform the following actions:
Parse all the pages and sub-pages of the News, Sports and Business sections
Extract the content, Image and Links
Dump the Content, Image and Links into the respective mongo collections
Websites
https://timesofindia.indiatimes.com/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import javax.net.ssl.HttpsURLConnection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class Main {

    /**
     * Fetches the Times of India "News" section page over HTTPS, parses the
     * HTML with jsoup, and collects the image URLs ({@code data-src}
     * attributes of {@code <img>} tags) and hyperlinks ({@code href}
     * attributes) found in the document.
     *
     * <p>NOTE(review): the surrounding assignment also asks for Sports and
     * Business sections, recursive sub-page crawling, and persistence to
     * MongoDB — none of that is implemented here yet.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Keep the URL in one place: it is both the fetch target and the
        // base URI for resolving relative links during parsing.
        String pageUrl = "https://timesofindia.indiatimes.com/news";
        try {
            URL newsURL = new URL(pageUrl);
            HttpsURLConnection connection = (HttpsURLConnection) newsURL.openConnection();
            StringBuilder builder = new StringBuilder();
            // try-with-resources closes the reader (and the underlying
            // connection stream) even on error — the original leaked it.
            // UTF-8 is specified explicitly; the no-charset InputStreamReader
            // constructor silently uses the platform default encoding.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String data;
                while ((data = reader.readLine()) != null) {
                    builder.append(data).append('\n');
                }
            }
            // Supplying the base URI lets callers resolve relative href/src
            // values to absolute URLs later via element.absUrl(...).
            Document document = Jsoup.parse(builder.toString(), pageUrl);

            List<String> imgURLs = new ArrayList<>();
            for (Element element : document.getElementsByTag("img")) {
                String src = element.attr("data-src");
                // attr() returns "" when the attribute is missing; skip those
                // so the list holds only real image URLs.
                if (!src.isEmpty()) {
                    imgURLs.add(src);
                }
            }
            // imgURLs.forEach(System.out::println);

            List<String> hrefURLs = new ArrayList<>();
            for (Element element : document.getElementsByAttribute("href")) {
                hrefURLs.add(element.attr("href"));
            }
            // hrefURLs.forEach(System.out::println);
        } catch (IOException e) {
            // Network or read failure; a production crawler should log and
            // retry rather than just print the trace.
            e.printStackTrace();
        }
    }
}
Reference: https://jsoup.org/ (jsoup — the Java HTML parser library used above)