1. Write a program to read the content of any one of the websites below, including all of its sub-pages, and perform the following actions:
1. Parse all the pages and sub-pages of the
News, Sports, and Business sections
2. Extract the content, images, and links
3. Store the content, images, and links in their respective MongoDB collections
import java.util.Scanner;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
/**
 * Fetches a single web page over HTTP and prints it twice: first the raw
 * response body, then the same text with every {@code <...>} tag removed.
 */
public class HttpClientExample {

    /** Address of the page downloaded by {@link #main(String[])}. */
    private static final String TARGET_URL = "http://www.example.com/";

    /**
     * Downloads {@link #TARGET_URL}, prints the raw body to standard output,
     * then prints the body again with all markup tags stripped.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the request fails or the response body cannot be read
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources guarantees the client (and its pooled connections)
        // is released even when the request or the read throws.
        try (CloseableHttpClient ht_client = HttpClients.createDefault()) {
            HttpGet ht_get = new HttpGet(TARGET_URL);
            HttpResponse ht_response = ht_client.execute(ht_get);

            String res = "";
            // Some responses carry no body at all; treat that as empty text
            // instead of dereferencing a null entity.
            if (ht_response.getEntity() != null) {
                // NOTE(review): the body is decoded as UTF-8; the charset announced
                // in the response's Content-Type header is not consulted.
                try (Scanner in = new Scanner(ht_response.getEntity().getContent(), "UTF-8")) {
                    // "\\A" matches only the start of input, so the first token is
                    // the whole stream. The default whitespace delimiter would drop
                    // every space and newline and glue adjacent words together.
                    in.useDelimiter("\\A");
                    if (in.hasNext()) {
                        res = in.next();
                    }
                }
            }

            System.out.println(res);
            System.out.println(stripTags(res));
        }
    }

    /**
     * Removes every substring that starts with {@code <} and runs to the next
     * {@code >}, leaving the text between such spans untouched.
     *
     * @param html text to strip
     * @return {@code html} without those spans
     */
    private static String stripTags(String html) {
        return html.replaceAll("<[^>]*>", "");
    }
}
Comments
Leave a comment