// C# / XML / HTML / SgmlReader / sitemap        (source: www.shengfang.org)
//
// Author's notes (translated from the original post):
//   Yesterday I was still wrestling with HTML page parsing. Driving the
//   WebBrowser control from VB used to feel simpler; now even simulating a form
//   submit means sniffing the IP packets to see whether anything else is being
//   posted implicitly. Frustrating.
//   After a long while, and within my current skill level, I put together a
//   small tool that extracts the URL links from my blog so that I can build a
//   sitemap. I did not generate the final HTML page with code; there are only a
//   few pages, so I finished that part by hand.
//
//   Steps:
//     1. Use WebRequest / WebResponse to fetch the page and read its HTML,
//        decoding it as UTF-8.
//     2. Use SgmlReader to turn the HTML into XML.
//     3. Use XPath to pick out each link's title text and hyperlink and
//        combine them.
//
// The handlers below read and write three controls that are declared outside
// this snippet: `html`, `xml` and `xpath` (only their Text property is used).

// Number of the listing page fetched most recently; 0 until the first fetch.
private int ipage=0;

/// <summary>
/// Downloads the next listing page of the blog, puts the raw HTML into
/// <c>html.Text</c> and the SgmlReader-converted XML into <c>xml.Text</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, System.EventArgs e)
{
    int nextPage = ipage + 1;
    string sPageUrl = "http://www.shengfang.org/blog/index.php?job=listall&golist=&page=" + nextPage.ToString();

    // Dispose the response, its stream and the reader even when the download
    // or the decoding throws.
    WebRequest request = WebRequest.Create(sPageUrl);
    using (WebResponse response = request.GetResponse())
    using (Stream resStream = response.GetResponseStream())
    using (StreamReader sr = new StreamReader(resStream, Encoding.GetEncoding("utf-8")))
    {
        html.Text = sr.ReadToEnd();
    }

    // Commit the page counter only once the page was actually downloaded, so a
    // failed request does not silently skip a page on the next click.
    ipage = nextPage;

    // Let SgmlReader present the HTML as XML.
    SgmlReader reader = new SgmlReader();
    reader.DocType = "HTML";
    reader.InputStream = new StringReader(html.Text);

    StringWriter sw = new StringWriter();
    XmlTextWriter writer = new XmlTextWriter(sw);
    writer.Formatting = Formatting.Indented;

    // Copy every non-whitespace node (with its subtree) into the writer.
    while (reader.Read())
    {
        if (reader.NodeType != XmlNodeType.Whitespace)
        {
            writer.WriteNode(reader, true);
        }
    }

    writer.Flush();
    xml.Text = sw.ToString();
}

/// <summary>
/// Extracts the post links from the XML in <c>xml.Text</c> and writes one
/// <c>· &lt;a href="..."&gt;title&lt;/a&gt;&lt;br/&gt;&lt;br/&gt;</c> line per
/// link into <c>xpath.Text</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button2_Click(object sender, System.EventArgs e)
{
    // Nothing has been converted yet (button1 not clicked, or an empty page):
    // there is no XML document to query.
    string xmlText = xml.Text;
    if (xmlText == null || xmlText.Trim().Length == 0)
    {
        xpath.Text = "";
        return;
    }

    // Anchors of the post list. Only anchors that carry an href are selected,
    // and the href is read from the same node as the title, so a link title can
    // never be paired with another link's URL.
    string sXPath = "/html/body/div/table/tr/td/table/table/tr/td/div/table/tr/td/span/a[@href]";

    System.Text.StringBuilder sb = new System.Text.StringBuilder();
    System.Xml.XPath.XPathDocument doc = new System.Xml.XPath.XPathDocument(new StringReader(xmlText));
    System.Xml.XPath.XPathNavigator nav = doc.CreateNavigator();
    System.Xml.XPath.XPathNodeIterator nodes = nav.Select(sXPath);

    while (nodes.MoveNext())
    {
        string href = nodes.Current.GetAttribute("href", "");

        sb.Append("· <a href=\"");
        sb.Append(href + "\">");
        sb.Append(nodes.Current.Value + "</a><br/><br/>\r\n");
    }

    xpath.Text = sb.ToString();
}
· <a href="p/xiyangxinxiang.php">西洋 星象 处女座</a><br/><br/> · <a href="p/vsnetwinceenv.php">WINCE VS.NET 开发 环境 EN</a><br/><br/> · <a href="p/wincevsnetcn.php">WINCE VS.NET 开发 环境 中文</a><br/><br/> · <a href="p/vsnetwinceback.php">WINCE VS.NET 开发 背景知识</a><br/><br/> · <a href="p/gupaitianjiu1.php">骨牌 牌九 天九</a><br/><br/> · <a href="p/liuyongliyu.php">柳永 李煜 填词</a><br/><br/> · <a href="p/perlbijixuexi1.php">perl 语法 函数 学习 笔记</a><br/><br/> WebRequest 模拟 SUBMIT POST 资料 http://www.shengfang.org/blog/p/websubmitpostinfo.php 字体:大 中 小 |