用Java解析sitemap.xml中的URL

发布时间:2020-03-22 21:29:48 作者:Mos 阅读量:3144

package util;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

/**
 * Command-line utility that downloads a sitemap, prints every {@code <loc>} URL whose text
 * ends with "html", and inserts a separator after every {@code GROUP_SIZE} printed links.
 *
 * <p>URLs of the form {@code http://host/...} whose host does not start with {@code www.}
 * are rewritten to {@code http://www.host/...} before being printed.
 */
public class DOMResolver {
    /** Sitemap parsed when no command-line argument is supplied. */
    private static final String DEFAULT_SITEMAP_URL = "https://www.i847.cn/sitemap.xml";

    /** Baidu accepts at most 20 links per submission, so links are printed in groups of 20. */
    private static final int GROUP_SIZE = 20;

    private static final String HTTP_PREFIX = "http://";
    private static final String WWW_PREFIX = "www.";

    /**
     * Parses the sitemap and prints the matching links to standard output.
     *
     * @param args optional; {@code args[0]} overrides the default sitemap URL
     */
    public static void main(String[] args) {
        String sitemapUrl = (args != null && args.length > 0) ? args[0] : DEFAULT_SITEMAP_URL;
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        try {
            // The document comes from the network: refuse DOCTYPE declarations so that
            // external entities (XXE) can never be resolved.
            dbf.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
            DocumentBuilder db = dbf.newDocumentBuilder();
            Document document = db.parse(sitemapUrl);
            NodeList urlList = document.getElementsByTagName("url");
            int urlCnt = urlList.getLength();
            System.out.println("总的链接数:" + urlCnt);
            int groupIndex = 0;
            for (int i = 0; i < urlCnt; i++) {
                NodeList childNodes = urlList.item(i).getChildNodes();
                for (int k = 0; k < childNodes.getLength(); k++) {
                    Node child = childNodes.item(k);
                    // Only <loc> children carry the page address; skip everything else
                    // (whitespace text nodes, <lastmod>, ...) without reading their text.
                    if (!"loc".equals(child.getNodeName())) {
                        continue;
                    }
                    String loc = child.getTextContent().trim();
                    if (!loc.endsWith("html")) {
                        continue;
                    }
                    System.out.println(addWwwPrefix(loc));
                    groupIndex++;
                    // A group can only become full right after a link was counted.
                    if (groupIndex == GROUP_SIZE) {
                        System.out.println("\n\n===================" + groupIndex + "============================\n");
                        groupIndex = 0;
                    }
                }
            }
        } catch (Exception e) {
            // Top-level boundary of a command-line tool: report which sitemap failed and why.
            System.err.println("Failed to process sitemap: " + sitemapUrl);
            e.printStackTrace();
        }
    }

    /**
     * Returns {@code url} with {@code www.} inserted after {@code http://} when the host part
     * does not already start with it; any other URL is returned unchanged.
     *
     * <p>Only the beginning of the host is inspected, so a "www" that merely appears later in
     * the URL (for example in the path) does not suppress the rewrite.
     *
     * @param url the URL text taken from a {@code <loc>} element; must not be {@code null}
     * @return the URL to print
     */
    static String addWwwPrefix(String url) {
        if (!url.startsWith(HTTP_PREFIX)) {
            return url;
        }
        String afterScheme = url.substring(HTTP_PREFIX.length());
        if (afterScheme.startsWith(WWW_PREFIX)) {
            return url;
        }
        return HTTP_PREFIX + WWW_PREFIX + afterScheme;
    }
}

转载请注明出处:文思齐远博客,如果对您有帮助,请喝杯咖啡,谢谢您了!
支付宝打赏 微信打赏

我要评论

©2021 i847.cn
部分内容转自网络,如有损害您的权益,致邮联系:jiang2008wen#126.com,一经证实,立即删除!   
备案号:蜀ICP备18020563号-1