当前位置:Gxlcms > PHP教程 > xml文件读写实例

xml文件读写实例

时间:2021-07-01 10:21:17 帮助过:29人阅读

这是我目前在做的项目中用到的 XML 文件读写实现。记录下来以备日后查阅,也供有需要的同学学习参考。

xml文件读写类:

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.lt.cj.config.entities.ConfigModel;
import org.lt.cj.config.entities.TMallConfigModel;
import org.lt.cj.core.Seed;
public class XMLConfigWriter {
    /*创建淘宝商城的配置文件*/
    public Document buildUpMallDocument(TMallConfigModel missionConfig) throws MissionConfigException, EnterUrlsException {
        if (missionConfig == null) {
            throw new MissionConfigException();
        } else if (missionConfig.getSeeds().isEmpty()) {
            return null;
        }
        // Create the root element
        Element rootElement = new Element("website");
        /* 设置网站属性 */
        /* 设置网站名称 */
        rootElement.setAttribute("name", missionConfig.getWebsiteName());
        /*设置网站地址*/
        rootElement.setAttribute("url", missionConfig.getWebsiteUrl());
        //添加任务名称
        Element taskElement = new Element("taskName");
        taskElement.addContent(missionConfig.getTaskName());
        rootElement.addContent(taskElement);
        //构造种子列表节点
        Element seeds = new Element("seeds");
        for (int i = 0; i < missionConfig.getSeeds().size(); i++) {
            Element seedElement = new Element("seed");
            Element seedNameElement = new Element("seedName");
            seedNameElement.addContent(missionConfig.getSeeds().get(i).getSeedName());
            Element seedUrlElement = new Element("seedUrl");
            seedUrlElement.addContent(missionConfig.getSeeds().get(i).getUrl());
            Element seedSortNameElement = new Element("sortName");
            seedSortNameElement.addContent(missionConfig.getSeeds().get(i).getSortName());
            seedElement.addContent(seedSortNameElement);
            seedElement.addContent(seedNameElement);
            seedElement.addContent(seedUrlElement);
            seeds.addContent(seedElement);
        }
        rootElement.addContent(seeds);
        //定义匹配的要采集的URL链接fitUrl的节点
        Element fiturls = new Element("fitUrls");
        for (int i = 0; i < missionConfig.getFitUrlRegs().size(); i++) {
            Element fitUrl = new Element("fit_url");
            fitUrl.addContent(missionConfig.getFitUrlRegs().get(i));
            fiturls.addContent(fitUrl);
        }
        rootElement.addContent(fiturls);//添加到根节点
        //并发工作线程数
        Element workingThreadsElement = new Element("workingThreads");
        workingThreadsElement.addContent("" + missionConfig.getWorkingThreads());
        rootElement.addContent(workingThreadsElement);//添加到根节点
        //定义页面编码节点
        Element pageEncodingElement = new Element("pageEncoding");
        pageEncodingElement.addContent(missionConfig.getPageEncoding());
        rootElement.addContent(pageEncodingElement);//添加到根节点
        //定义下载图片控制标志节点
        Element dwdPhoFlagElement = new Element("dwdPhoFlag");
        dwdPhoFlagElement.addContent(missionConfig.getDwdPhoFlag());
        rootElement.addContent(dwdPhoFlagElement);
        //定义原语言节点
        Element oriLan = new Element("orien_lan");
        oriLan.addContent(missionConfig.getOrigLanguage());
        Element transLan = new Element("trans_lan");
        transLan.addContent(missionConfig.getTranLanguage());
        rootElement.addContent(oriLan);//添加到根节点
        rootElement.addContent(transLan);//添加到根节点
        //定义匹配抓取信息的产品页面Url节点
        Element pageUrlRegs = new Element("pageUrlRegs");
        for (int i = 0; i < missionConfig.getPageReg().size(); i++) {
            Element pageUrl = new Element("pageUrl");
            pageUrl.addContent(missionConfig.getFitUrlRegs().get(i));
            pageUrlRegs.addContent(pageUrl);
        }
        rootElement.addContent(pageUrlRegs);//添加到根节点
        Map<String, List<String>> map = missionConfig.getEntityReg();
        List<String> list = null;
        Element pathElements = new Element("pathElements");
        //直接循环算啦
        //=====================================
        Iterator iter = map.entrySet().iterator();
        while (iter.hasNext()) {
            Map.Entry e = (Map.Entry) iter.next();
            Element element = new Element(e.getKey() + "");
            map = missionConfig.getEntityReg();
            list = map.get(e.getKey() + "");
            for (int i = 0; i < list.size(); i++) {
                Element path = new Element("path");
                path.addContent(list.get(i));
                element.addContent(path);
            }
            pathElements.addContent(element);
        }
        rootElement.addContent(pathElements);
        /*   =====================================================   */
        Document myDocument = new Document(rootElement);
        return myDocument;
    }
    /* 创建文档文件 */
    public void createConfigFile(Document document, String filepath) {
        try {
            /* 定义XML
输出器 */ XMLOutputter xmlOutPutter = new XMLOutputter(); xmlOutPutter.setFormat(Format.getPrettyFormat()); File file = new File(filepath); if (!file.exists()) { if (file.createNewFile()) { FileOutputStream fileOutputStream = new FileOutputStream(filepath); xmlOutPutter.output(document, fileOutputStream); return; } } FileOutputStream fileOutputStream = new FileOutputStream(filepath); xmlOutPutter.output(document, fileOutputStream); } catch (java.io.IOException e) { e.printStackTrace(); } } /* 重写文件 */ public void saveTask(String filePath, ConfigModel configModel) { try { TMallConfigModel tMallConfigModel = (TMallConfigModel) configModel; Document document = buildUpMallDocument(tMallConfigModel); if (document != null) { createConfigFile(document, filePath); } } catch (MissionConfigException ex) { Logger.getLogger(XMLConfigWriter.class.getName()).log(Level.SEVERE, null, ex); } catch (EnterUrlsException ex) { Logger.getLogger(XMLConfigWriter.class.getName()).log(Level.SEVERE, null, ex); } } //* xml文件读取方法 */ public TMallConfigModel readMallDocument(String filePath) { TMallConfigModel model = new TMallConfigModel(); SAXBuilder sb = new SAXBuilder(); try { //读取基本配置信息 Document doc = sb.build(filePath); //构造文档对象 Element root = doc.getRootElement(); //获取根元素 String websiteName = root.getAttributeValue("name"); //获取网站名称 String websiteAddr = root.getAttributeValue("url"); //获取网站地址 model.setWebsiteName(websiteName); //设置网站名称 model.setWebsiteUrl(websiteAddr); //设置网站地址 Element taskNameElement = root.getChild("taskName"); //获取任务名内容 String taskName = taskNameElement.getText(); model.setTaskName(taskName); //获取入口种子列表 List<Seed> seedList = new ArrayList(); Element seedsElement = root.getChild("seeds"); List list = seedsElement.getChildren(); for (int i = 0; i < list.size(); i++) { Element element = (Element) seedsElement.getChildren().get(i); Seed seed = new Seed(); Element seedNameElement = element.getChild("seedName"); Element seedUrlElement = element.getChild("seedUrl"); Element 
seedSortNameElement = element.getChild("sortName"); seed.setSeedName(seedNameElement.getTextTrim()); seed.setUrl(seedUrlElement.getTextTrim()); seed.setSortName(seedSortNameElement.getTextTrim()); Element parentSeedElement = element.getChild("parentSeed"); if (parentSeedElement != null) { Seed parentSeed = new Seed(); Element parentSeedNameElement = parentSeedElement.getChild("seedName"); Element parentSeedUrlElement = parentSeedElement.getChild("seedUrl"); Element parentSeedSortNameElement = parentSeedElement.getChild("sortName"); parentSeed.setSeedName(parentSeedNameElement.getText()); parentSeed.setUrl(parentSeedUrlElement.getTextTrim()); parentSeed.setSortName(parentSeedSortNameElement.getTextTrim()); } seedList.add(seed); } model.setSeeds(seedList); //获取匹配的要抽取的页面的特定部分内容 list = new ArrayList(); Element extractHtmlElement = root.getChild("extractHtml"); if (extractHtmlElement != null) { for (int i = 0; i < extractHtmlElement.getChildren().size(); i++) { Element element = (Element) extractHtmlElement.getChildren().get(i); list.add(element.getText()); } } model.setExtractHtmlReg(list); //获取匹配URLs list = new ArrayList(); Element fitUrlsElement = root.getChild("fitUrls"); for (int i = 0; i < fitUrlsElement.getChildren().size(); i++) { Element element = (Element) fitUrlsElement.getChildren().get(i); list.add(element.getText()); } model.setFitUrlRegs(list); //获取线程数量 Element workingThreadsElement = root.getChild("workingThreads"); String workingCount = workingThreadsElement.getText(); model.setWorkingThreads(Integer.valueOf(workingCount)); //获取解析编码 Element pageEncodingElement = root.getChild("pageEncoding"); String pageEncoding = pageEncodingElement.getText(); model.setPageEncoding(pageEncoding); //获取是否下载图片的标志 Element dwdPhoFlagElement = root.getChild("dwdPhoFlag"); String dphoFlag = dwdPhoFlagElement.getText(); model.setDwdPhoFlag(dphoFlag); //获取语言 Element orien_lanElement = root.getChild("orien_lan"); String orien = orien_lanElement.getText(); 
model.setOrigLanguage(orien); Element trans_lanElement = root.getChild("trans_lan"); String trans_lan = trans_lanElement.getText(); model.setTranLanguage(trans_lan); //获取URL正则匹配 Element pageUrlRegsElement = root.getChild("pageUrlRegs"); list = new ArrayList(); for (int i = 0; i < pageUrlRegsElement.getChildren().size(); i++) { Element element = (Element) pageUrlRegsElement.getChildren().get(i); list.add(element.getText()); } model.setPageReg(list); //获取余下的匹配规则 Map<String, List<String>> entityReg = new HashMap(); Element pathElements = root.getChild("pathElements"); for (int i = 0; i < pathElements.getChildren().size(); i++) { Element element = (Element) pathElements.getChildren().get(i); List<String> pathList = new ArrayList(); String mapName = element.getName(); for (int j = 0; j < element.getChildren().size(); j++) { Element childElement = (Element) element.getChildren().get(j); pathList.add(childElement.getText()); } entityReg.put(mapName, pathList); } model.setEntityReg(entityReg); } catch (JDOMException ex) { Logger.getLogger(XMLConfigWriter.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException ex) { Logger.getLogger(XMLConfigWriter.class.getName()).log(Level.SEVERE, null, ex); } return model; } }

xml文件内容:

<?xml version="1.0" encoding="UTF-8"?>
<website name="taobao_mall" url="http://www.tmall.com/?ver=2011b">
  <taskName>caiji_tmall_精品男装_T恤</taskName>
  <seeds>
    <seed>
      <sortName>精品男装/T恤</sortName>
      <seedName>精品男装/T恤</seedName>
      <seedUrl>http://item.tmall.com/item.htm?id=9351702393</seedUrl>
    </seed>
  </seeds>
  <extractHtml>
      <path>div class="list item-view item-miniView"</path>
  </extractHtml>
  <fitUrls>
    <fit_url>http://www\.tmall\.com/go/act/tmall/iwanttobuy\.php.*</fit_url>
    <fit_url>http://list\.tmall\.com/.*</fit_url>
    <fit_url>http://item\.tmall\.com/item\.htm.*</fit_url>
  </fitUrls>
  <workingThreads>1</workingThreads>
  <pageEncoding>UTF-8</pageEncoding>
  <orien_lan>zh</orien_lan>
  <trans_lan>en</trans_lan>
  <pageUrlRegs>
    <pageUrl>http://www\.tmall\.com/go/act/tmall/iwanttobuy\.php.*</pageUrl>
  </pageUrlRegs>
  <pathElements>
    <commnents>
      <path>div class="tb-box tshop-psm tshop-psm-bdetailtabl" id="J_Detail"</path>
      <path>div id="reviews" class="J_DetailSection" data-reviewApi</path>
    </commnents>
    <shopAddr>
      <path>div class="clearfix tb-header-nav"</path>
      <path>div class="nav"</path>
      <path>a href</path>
    </shopAddr>
    <productDetail>
      <path>div id="attributes" class="attributes</path>
      <path>ul class="attributes-list</path>
      <path>li</path>
    </productDetail>
    <photosPath>
      <path>div class="tb-detail-bd tb-clear"</path>
      <path>div class="tb-gallery"</path>
      <path>div class="tb-booth tb-pic tb-s310"</path>
      <path>img id="J_ImgBooth" src</path>
    </photosPath>
    <category>
      <path>ul class="mallCrumbs-nav" id="J_crumbs"</path>
      <path>li class="mallCrumbs-nav-item"</path>
    </category>
    <countSold>
      <path>div class="tb-detail-bd tb-clear"</path>
      <path>ul class="tb-meta"</path>
      <path>li class="tb-sold-out tb-clear"</path>
    </countSold>
    <shopInfo>
      <path>div class="shop-intro"</path>
      <path>div class="extend"</path>
      <path>li</path>
    </shopInfo>
    <despPhos>
      <path>script</path>
    </despPhos>
    <thumbPhosPath>
      <path>div class="tb-detail-bd tb-clear"</path>
      <path>div class="tb-gallery"</path>
      <path>ul id="J_UlThumb" class="tb-thumb tb-clearfix"</path>
      <path>img src=</path>
    </thumbPhosPath>
    <productName>
      <path>div class="layout grid-s5m0 "</path>
      <path>div class="tb-detail-hd"</path>
      <path>a target="_blank" href=</path>
    </productName>
    <productPrice>
      <path>div class="tb-detail-bd tb-clear"</path>
      <path>ul class="tb-meta"</path>
      <path>li id="J_StrPriceModBox" class="tb-detail-price tb-clearfix"</path>
    </productPrice>
  </pathElements>
</website>

人气教程排行