清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 | package com.zzger.model; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.concurrent.CountDownLatch; import com.zzger.module.queue.UrlQueue; import com.zzger.util.HttpUtils; import com.zzger.util.RegexUtils; public class WebSite { /** * 站点url */ private String url; /** * 需要爬行的url队列 */ private UrlQueue<String> urls = new UrlQueue<>(); /** * 已爬行过的页面url */ private List<String> exitUrls = Collections.synchronizedList( new ArrayList<>()); private static final int TOTAL_THREADS = 12; private final CountDownLatch mStartSignal = new CountDownLatch(1); private final CountDownLatch mDoneSignal = new CountDownLatch(TOTAL_THREADS); public WebSite(String url){ this.url = url; urls.offer(url); //把网站首页加入需要爬行的队列中 } public void guangDu(){ new Thread( new Runnable() { @Override public void run() { paxing(HttpUtils.httpGet(url)); } }).start(); } public void paxing(String html){ if (html.lastIndexOf( "下一页</a></li></ul></div>" )<0) return ; String strList = html.substring(html.indexOf( "<li class=\\" next-page\\ ">" ), html.lastIndexOf( "下一页</a></li></ul></div>" )); String url = RegexUtils.RegexString( "<a href=\\" (.+?)\\ "" , strList); if (url.equals( "Nothing" )) return ; urls.put(url); //把url存储到队列中 paxing(HttpUtils.httpGet(url)); } public void dxcPx(){ Page<DuanZi> page = new Gxpage(urls.take()); List<Section<DuanZi>> list = page.ybhqSection().getSections(); for (Section<DuanZi> section : list){ new Thread( new Runnable() { @Override public void run() { mStartSignal.countDown(); // 计数减一为0,工作线程真正启动具体操作 try { mStartSignal.await(); // 阻塞,等待mStartSignal计数为0运行后面的代码 // 所有的工作线程都在等待同一个启动的命令 } catch (InterruptedException e) { e.printStackTrace(); } DuanZi duanzi = section.select().getModel(); System.out.println(duanzi.getTitle()); mDoneSignal.countDown(); // 完成以后计数减一 } } ).start(); } try { mDoneSignal.await(); // 等待所有工作线程结束 } catch (InterruptedException e) { e.printStackTrace(); } dxcPx(); //线程任务执行完后,再次获取url队列进行任务 } public static void main(String[] args) { WebSite web = new WebSite( "http://duanziwang.com" ); web.guangDu(); for (int i = 0; i<10;i++){ new Thread( new Runnable() { @Override public void run() { web.dxcPx(); } }).start(); } } } |