// (Document-viewer header, not part of the program: "You are on page 1 of 16".)

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Windows.Forms;

using HtmlAgilityPack; // HTML DOM parsing
using iTextSharp.text;
using iTextSharp.text.pdf;
namespace Miki_GrabWebPageArticle
{
public partial class Form1 : Form
{
/// <summary>
/// Creates the form. All set-up is delegated to InitializeComponent, which is
/// declared in another part of this partial class.
/// </summary>
public Form1()
{
    this.InitializeComponent();
}
//===================== Form-level state =====================
// Address of the page currently being processed; assigned by btnGetHref_Click and btnGetArticle_Click.
string site = "";
// Links returned by the latest ParseArticle call in btnGetArticle_Click; read by SavePDF and btnSave_Click.
List<HtmlContentElement> Article;
//=========== END =======
//
/// <summary>
/// Click handler: reads the address typed into txtSite, collects every link found on that
/// page, drops duplicates, keeps only the http/https links and shows them in listView1.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnGetHref_Click(object sender, EventArgs e)
{
    listView1.Items.Clear();

    site = txtSite.Text.Trim();

    // Add a scheme only when the address does not already START with one.
    // A Contains("http") test would wrongly leave hosts such as "httpbin.org" without a scheme.
    bool hasScheme =
        site.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
        site.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        site = "http://" + site;
    }

    // Every <a href> on the page, with its link text.
    List<HtmlContentElement> contentList = ParseLinksText2(site);

    // NOTE(review): Distinct() uses HtmlContentElement's Equals/GetHashCode, which are not
    // visible here. If the type does not override them, no duplicates are removed - confirm.
    List<HtmlContentElement> contentListNonRepeat = contentList.Distinct().ToList();

    // Keep only entries that contain an http:// or https:// address.
    List<HtmlContentElement> contentListDiff = PureHTTPLink(contentListNonRepeat);

    ShowListView(contentListDiff);
}

//
/// <summary>
/// Click handler: walks result pages 1..2 of a hard-coded blog address, extracts the article
/// links of each page with a hard-coded XPath, lists them and saves them through SavePDF.
/// Shows a message box (currently with empty text) when all pages are done.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnGetArticle_Click(object sender, EventArgs e)
{
    const int FirstPage = 1;
    const int LastPage = 2;
    // Upper bound for the "retry while the parser returns null" loop, so a page that
    // never parses cannot freeze the UI thread in an endless busy loop.
    const int MaxAttemptsPerPage = 5;

    string webSite = "http://blog.yam.com/fern724&page=";
    string Xpath = @"//div[@class='post_titlediv']/a[@href]";

    for (int j = FirstPage; j <= LastPage; j++)
    {
        listView1.Items.Clear();
        label4.Text = j.ToString();   // show the page number being processed
        Application.DoEvents();       // let the label repaint before the blocking download

        site = webSite + j.ToString();
        txtSite.Text = site;
        Application.DoEvents();

        Article = null;
        for (int attempt = 0; attempt < MaxAttemptsPerPage && Article == null; attempt++)
        {
            Article = ParseArticle(site, Xpath);
        }

        if (Article == null)
        {
            // Nothing could be parsed for this page; move on to the next one.
            continue;
        }

        ShowListView(Article);
        SavePDF(j);
    }

    MessageBox.Show("");
}
/// <summary>
/// Saves every entry of <c>Article</c> for result page <paramref name="j"/> and then empties
/// the list. PDF files are named "j_i.site_title.pdf", where "site" is the link with
/// "http://" and every '/' removed.
/// </summary>
/// <param name="j">Result-page number; used as the PDF file-name prefix.</param>
private void SavePDF(int j)
{
    if (Article == null)
    {
        return;
    }

    SaveArticles(i =>
    {
        string tempSite = Article[i].HtmlLink.Replace("http://", "").Replace("/", "");
        return j + "_" + i + "." + tempSite + "_" +
               Article[i].HtmlLinkText.Replace("&quot", "") + ".pdf";
    });

    Article.Clear();
}

/// <summary>
/// Click handler: saves every entry of <c>Article</c> as HTML and/or PDF, depending on the
/// check boxes. PDF files are named "i.title.pdf". The list is left untouched.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnSave_Click(object sender, EventArgs e)
{
    if (Article == null)
    {
        return;
    }

    SaveArticles(i => i + "." + Article[i].HtmlLinkText.Replace("&quot", "") + ".pdf");
}

/// <summary>
/// Shared save routine for SavePDF and btnSave_Click.
/// When ckBoxHtml is checked, every article is saved as "i.title.html".
/// When ckBoxPdf is checked, every article is rendered to PDF, moved into the PDF folder
/// below Application.StartupPath and stamped with its source URL.
/// </summary>
/// <param name="pdfNameFor">Builds the raw PDF file name for the article at a given index.</param>
private void SaveArticles(Func<int, string> pdfNameFor)
{
    if (ckBoxHtml.Checked)
    {
        for (int i = 0; i < Article.Count; i++)
        {
            SaveWebHTML(Article[i].HtmlLink,
                        ToSafeFileName(i + "." + Article[i].HtmlLinkText + ".html"));
        }
    }

    // Moving and stamping only make sense for PDFs rendered in this run; running them
    // with the PDF box unchecked would try to open files that were never created.
    if (!ckBoxPdf.Checked)
    {
        return;
    }

    // NOTE(review): the folder literal contains a space before the final backslash;
    // part of the original folder name may have been lost - confirm.
    string pdfFolder = Application.StartupPath + @"\WebPDF \";

    for (int i = 0; i < Article.Count; i++)
    {
        string link = Article[i].HtmlLink;

        // SaveWebPDF strips spaces from the name it is given, so strip them here as well
        // to know the name of the file that was actually written.
        string pdfName = ToSafeFileName(pdfNameFor(i).Replace(" ", ""));

        SaveWebPDF(link, pdfName);

        if (!File.Exists(pdfName))
        {
            // The renderer produced nothing for this link; skip it instead of aborting the batch.
            Debug.WriteLine("No PDF was produced for " + link);
            continue;
        }

        string target = pdfFolder + pdfName;
        ChangeSavePath(pdfName, target);   // move out of the working directory
        AddUrl2PDF(target, link);          // stamp the source URL onto each page
    }
}

/// <summary>
/// Replaces every character that is not allowed in a file name with '_'.
/// Titles and URLs scraped from pages routinely contain ':', '?', '"' and similar.
/// </summary>
private static string ToSafeFileName(string name)
{
    foreach (char invalid in Path.GetInvalidFileNameChars())
    {
        name = name.Replace(invalid, '_');
    }
    return name;
}
// HTML
#region void SaveWebHTML(string html, string filename)
/// <summary>
/// Downloads the page at <paramref name="html"/> and saves its markup, in the document's own
/// encoding, under the HTML folder below Application.StartupPath.
/// </summary>
/// <param name="html">Address of the page to download.</param>
/// <param name="filename">File name (without folder) for the saved page.</param>
private void SaveWebHTML(string html, string filename)
{
    HtmlWeb web = new HtmlWeb();
    web.AutoDetectEncoding = true;
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);

    // NOTE(review): the folder literal contains a space before the final backslash;
    // part of the original folder name may have been lost - confirm.
    string folder = Application.StartupPath + @"\WebHTML \";

    // Saving into a folder that does not exist yet would throw; create it on demand.
    Directory.CreateDirectory(folder);

    doc.Save(folder + filename, doc.Encoding);
}
#endregion
// PDF
#region void SaveWebPDF(string html, string filename)
/// <summary>
/// Renders the page at <paramref name="html"/> to a PDF by running
/// "phantomjs rasterize2.js &lt;url&gt; &lt;file&gt;" through ExecuteCmd.
/// Spaces are removed from <paramref name="filename"/> before it is used.
/// </summary>
/// <param name="html">Address of the page to render.</param>
/// <param name="filename">Output file name; passed to phantomjs after removing spaces.</param>
private void SaveWebPDF(string html, string filename)
{
    filename = filename.Replace(" ", "");

    // Both values originate from scraped pages. Unquoted, a '&' or '|' in them (very common
    // in query strings) is treated by cmd.exe as a command separator, which truncates the
    // URL and executes the remainder as a second command. Wrap each argument in double
    // quotes and drop embedded quotes so the value cannot break out of them.
    string quotedUrl = "\"" + html.Replace("\"", "") + "\"";
    string quotedFile = "\"" + filename.Replace("\"", "") + "\"";

    string command = "phantomjs" + " rasterize2.js" + " " + quotedUrl + " " + quotedFile;
    ExecuteCmd(command);
}
/// <summary>
/// Runs <paramref name="command"/> through "cmd.exe /C" without creating a console window
/// and blocks until the process exits.
/// </summary>
/// <param name="command">Command line handed to cmd.exe.</param>
private void ExecuteCmd(string command)
{
    // Process owns an OS handle; dispose it once the child has exited.
    using (System.Diagnostics.Process p = new System.Diagnostics.Process())
    {
        p.StartInfo.FileName = "cmd.exe";
        p.StartInfo.Arguments = @"/C " + command;   // /C: run the command, then terminate
        p.StartInfo.UseShellExecute = false;
        p.StartInfo.CreateNoWindow = true;

        p.Start();
        p.WaitForExit();
    }
}
#endregion
#region listView
/// <summary>
/// Opens the link stored in the second column of the activated row with the system's
/// default handler for web addresses.
/// </summary>
/// <param name="sender">The ListView that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void listView1_ItemActivate(object sender, EventArgs e)
{
    ListView listview = (ListView)sender;

    // Nothing selected, or the row has no link column: nothing to open.
    if (listview.SelectedItems.Count == 0 || listview.SelectedItems[0].SubItems.Count < 2)
    {
        return;
    }

    string s1 = listview.SelectedItems[0].SubItems[1].Text;

    // The text comes from a scraped page. Only hand absolute http/https addresses to the
    // shell, so that e.g. a file: URI can never be launched from here.
    Uri url;
    bool isWebLink =
        Uri.TryCreate(s1, UriKind.Absolute, out url) &&
        (url.Scheme == Uri.UriSchemeHttp || url.Scheme == Uri.UriSchemeHttps);
    if (!isWebLink)
    {
        MessageBox.Show("Cannot open this link: " + s1);
        return;
    }

    try
    {
        System.Diagnostics.Process.Start(url.AbsoluteUri);
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.ToString());
    }
}
#endregion
// =========================
//
#region List<string> ParseLinks(string html)
/// <summary>
/// Downloads the page at <paramref name="html"/> and returns the href value of every
/// &lt;a href&gt; element on it.
/// </summary>
/// <param name="html">Address of the page to download.</param>
/// <returns>The href values in document order; empty when the page has no links.</returns>
private List<string> ParseLinks(string html)
{
    HtmlWeb web = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);
    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a[@href]");

    List<string> links = new List<string>();

    // SelectNodes yields null (not an empty collection) when nothing matches.
    if (nodes == null)
    {
        return links;
    }

    foreach (HtmlNode node in nodes)
    {
        if (node == null)
        {
            continue;
        }
        links.Add(node.Attributes["href"].Value);
    }
    return links;
}
#endregion
//
#region private List<string> ParseLinksText(string html, ref List<string> LinkText)
/// <summary>
/// Downloads the page at <paramref name="html"/> and returns the href value of every
/// &lt;a href&gt; element, appending each element's inner text to <paramref name="LinkText"/>
/// in the same order.
/// </summary>
/// <param name="html">Address of the page to download.</param>
/// <param name="LinkText">Receives the link texts; a new list is created when null is passed.</param>
/// <returns>The href values in document order; empty when the page has no links.</returns>
private List<string> ParseLinksText(string html, ref List<string> LinkText)
{
    if (LinkText == null)
    {
        LinkText = new List<string>();
    }

    HtmlWeb web = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);
    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a[@href]");

    List<string> links = new List<string>();

    // SelectNodes yields null (not an empty collection) when nothing matches.
    if (nodes == null)
    {
        return links;
    }

    foreach (HtmlNode node in nodes)
    {
        if (node == null)
        {
            continue;
        }
        links.Add(node.Attributes["href"].Value);
        LinkText.Add(node.InnerText);
    }
    return links;
}
#endregion
// HtmlContent
#region private HtmlContent ParseLinksText(string html)
/// <summary>
/// Downloads the page at <paramref name="html"/> and collects, for every &lt;a href&gt;
/// element, its href value and inner text into the two parallel lists of an HtmlContent.
/// </summary>
/// <param name="html">Address of the page to download.</param>
/// <returns>An HtmlContent holding the links and their texts; nothing is added when the page has no links.</returns>
private HtmlContent ParseLinksText(string html)
{
    HtmlWeb web = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);
    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a[@href]");

    HtmlContent hcontents = new HtmlContent();

    // SelectNodes yields null (not an empty collection) when nothing matches.
    if (nodes == null)
    {
        return hcontents;
    }

    foreach (HtmlNode node in nodes)
    {
        hcontents.HtmlLinks.Add(node.Attributes["href"].Value);
        hcontents.HtmlLinkTexts.Add(node.InnerText);
    }
    return hcontents;
}
#endregion
// Same extraction as ParseLinksText, returned as one HtmlContentElement per link.
#region private List<HtmlContentElement> ParseLinksText2(string html)
/// <summary>
/// Downloads the page at <paramref name="html"/> (with encoding auto-detection) and returns
/// one HtmlContentElement per &lt;a href&gt; element, carrying its href value and inner text.
/// </summary>
/// <param name="html">Address of the page to download.</param>
/// <returns>The elements in document order; empty when the page has no links.</returns>
private List<HtmlContentElement> ParseLinksText2(string html)
{
    List<HtmlContentElement> hconElementOutput = new List<HtmlContentElement>();

    HtmlWeb web = new HtmlWeb();
    web.AutoDetectEncoding = true;
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);

    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a[@href]");

    // SelectNodes yields null (not an empty collection) when nothing matches.
    if (nodes == null)
    {
        return hconElementOutput;
    }

    foreach (HtmlNode node in nodes)
    {
        HtmlContentElement hconElement = new HtmlContentElement();
        hconElement.HtmlLink = node.Attributes["href"].Value;
        hconElement.HtmlLinkText = node.InnerText;
        hconElementOutput.Add(hconElement);
    }
    return hconElementOutput;
}
#endregion
// http https
#region private List<string> CheckValidHTTPLink(List<string> input)
/// <summary>
/// Keeps the entries that contain an "http://" or "https://" address and cuts away whatever
/// precedes that address (for example a "javascript:" wrapper). Entries without either
/// scheme, and null or empty entries, are dropped.
/// </summary>
/// <param name="input">Raw href values; may be null.</param>
/// <returns>The cleaned links, in input order.</returns>
private List<string> CheckValidHTTPLink(List<string> input)
{
    List<string> validLinks = new List<string>();
    if (input == null)
    {
        return validLinks;
    }

    foreach (string s in input)
    {
        if (string.IsNullOrEmpty(s))
        {
            continue;
        }

        int httpAt = s.IndexOf("http://", StringComparison.Ordinal);
        int httpsAt = s.IndexOf("https://", StringComparison.Ordinal);

        // Start at whichever scheme occurs FIRST. Testing "http://" before "https://" would
        // cut "https://a/?u=http://b" down to the embedded "http://b".
        int start;
        if (httpAt < 0)
        {
            start = httpsAt;
        }
        else if (httpsAt < 0)
        {
            start = httpAt;
        }
        else
        {
            start = Math.Min(httpAt, httpsAt);
        }

        if (start >= 0)
        {
            validLinks.Add(s.Substring(start));
        }
    }
    return validLinks;
}
#endregion

// pure http
#region private List<HtmlContentElement> PureHTTPLink(List<HtmlContentElement> inputLinkList)
/// <summary>
/// Keeps the elements whose link contains an "http://" or "https://" address and rewrites
/// each kept element's HtmlLink so it starts at that address. Elements without either
/// scheme (or with a null/empty link) are dropped.
/// </summary>
/// <remarks>The kept elements are the same instances as in the input; their HtmlLink is modified in place.</remarks>
/// <param name="inputLinkList">Elements to filter.</param>
/// <returns>The kept elements, in input order.</returns>
private List<HtmlContentElement> PureHTTPLink(List<HtmlContentElement> inputLinkList)
{
    List<HtmlContentElement> outputLinkList = new List<HtmlContentElement>();

    foreach (HtmlContentElement content in inputLinkList)
    {
        if (content == null || string.IsNullOrEmpty(content.HtmlLink))
        {
            continue;
        }

        string link = content.HtmlLink;
        int httpAt = link.IndexOf("http://", StringComparison.Ordinal);
        int httpsAt = link.IndexOf("https://", StringComparison.Ordinal);

        // Start at whichever scheme occurs FIRST. Testing "http://" before "https://" would
        // cut "https://a/?u=http://b" down to the embedded "http://b".
        int start;
        if (httpAt < 0)
        {
            start = httpsAt;
        }
        else if (httpsAt < 0)
        {
            start = httpAt;
        }
        else
        {
            start = Math.Min(httpAt, httpsAt);
        }

        if (start < 0)
        {
            continue;
        }

        content.HtmlLink = link.Substring(start);
        outputLinkList.Add(content);
    }
    return outputLinkList;
}
#endregion
// http
#region private List<HtmlContentElement> ParseArticle(string html, string Xpath)
/// <summary>
/// Downloads the page at <paramref name="html"/> (with encoding auto-detection) and returns
/// one HtmlContentElement per node selected by <paramref name="Xpath"/>, carrying the
/// node's href value and inner text.
/// </summary>
/// <param name="html">Address of the page to download.</param>
/// <param name="Xpath">XPath expression selecting the link nodes, e.g. "//a[@href]".</param>
/// <returns>The elements in document order; empty when the expression matches nothing.</returns>
private List<HtmlContentElement> ParseArticle(string html, string Xpath)
{
    List<HtmlContentElement> hconElementOutput = new List<HtmlContentElement>();

    HtmlWeb web = new HtmlWeb();
    web.AutoDetectEncoding = true;
    HtmlAgilityPack.HtmlDocument doc = web.Load(html);

    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(Xpath);

    // SelectNodes yields null (not an empty collection) when nothing matches.
    if (nodes == null)
    {
        return hconElementOutput;
    }

    foreach (HtmlNode node in nodes)
    {
        // The expression is caller-supplied, so a selected node is not guaranteed to have an href.
        HtmlAttribute href = node.Attributes["href"];
        if (href == null)
        {
            continue;
        }

        HtmlContentElement hconElement = new HtmlContentElement();
        hconElement.HtmlLink = href.Value;
        hconElement.HtmlLinkText = node.InnerText;
        hconElementOutput.Add(hconElement);
    }
    return hconElementOutput;
}
#endregion
// show ListView
#region void ShowListView(List<HtmlContentElement> contentListDiff)
/// <summary>
/// Appends one row per element to listView1 in details view: the link text in the first
/// column (width 90) and the link address in the second (width 600).
/// Existing rows are not removed; callers clear listView1.Items beforehand.
/// </summary>
/// <param name="contentListDiff">Elements to display.</param>
private void ShowListView(List<HtmlContentElement> contentListDiff)
{
    // Suspend repainting while rows are added; always resume, even if adding fails.
    listView1.BeginUpdate();
    try
    {
        listView1.View = View.Details;

        // Rebuild the two columns from scratch. Without the Clear() every call appended
        // two more columns to the ones left over from the previous call.
        listView1.Columns.Clear();
        listView1.Columns.Add("");
        listView1.Columns.Add("");
        listView1.Columns[0].Width = 90;
        listView1.Columns[1].Width = 600;

        foreach (HtmlContentElement element in contentListDiff)
        {
            ListViewItem row = new ListViewItem();
            row.Text = element.HtmlLinkText;       // column 1: link text
            row.SubItems.Add(element.HtmlLink);    // column 2: link address
            listView1.Items.Add(row);
        }
    }
    finally
    {
        listView1.EndUpdate();
    }
}
#endregion
//
#region void ChangeSavePath(string sourcePath, string TargetPath)
/// <summary>
/// Moves a file: copies <paramref name="sourcePath"/> to <paramref name="TargetPath"/>
/// (overwriting an existing target) and then deletes the source.
/// </summary>
/// <param name="sourcePath">Existing file to move.</param>
/// <param name="TargetPath">Destination file; its folder is created when missing.</param>
private void ChangeSavePath(string sourcePath, string TargetPath)
{
    string targetFolder = Path.GetDirectoryName(TargetPath);
    if (!string.IsNullOrEmpty(targetFolder))
    {
        Directory.CreateDirectory(targetFolder);
    }

    // File.Copy streams the content. The hand-written version relied on a single
    // Stream.Read call filling the whole buffer (not guaranteed) and cast the length to int.
    File.Copy(sourcePath, TargetPath, true);
    File.Delete(sourcePath);
}
#endregion
// pdf
#region public void AddUrl2PDF(string filepath, string url)
/// <summary>
/// Rewrites the PDF at <paramref name="filepath"/> so that every page carries
/// <paramref name="url"/> as small dark-gray text near its top-left corner.
/// The stamped copy is built in a temporary file and then copied over the original.
/// </summary>
/// <param name="filepath">PDF file to stamp; replaced in place.</param>
/// <param name="url">Text written onto each page.</param>
public void AddUrl2PDF(string filepath, string url)
{
    string oldFile = filepath;

    // A unique scratch file instead of a fixed "NewFile.pdf" in the current directory,
    // which depended on the working directory and could clobber an unrelated file.
    string newFile = Path.GetTempFileName();

    try
    {
        PdfReader reader = new PdfReader(oldFile);
        try
        {
            // Page 1 supplies the stamp position and the size used for the new document.
            iTextSharp.text.Rectangle dimension = reader.GetPageSize(1);
            iTextSharp.text.Rectangle size = reader.GetPageSizeWithRotation(1);

            // The font does not change between pages; create it once.
            BaseFont bf = BaseFont.CreateFont(BaseFont.HELVETICA, BaseFont.CP1252, BaseFont.NOT_EMBEDDED);

            using (FileStream fs = new FileStream(newFile, FileMode.Create, FileAccess.Write))
            {
                Document document = new Document(size);
                PdfWriter writer = PdfWriter.GetInstance(document, fs);
                document.Open();

                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    document.NewPage();
                    PdfContentByte cb = writer.DirectContent;

                    // Write the URL 5 units in from the left edge and 8 units below the top edge.
                    cb.SetColorFill(BaseColor.DARK_GRAY);
                    cb.SetFontAndSize(bf, 8);
                    cb.BeginText();
                    cb.ShowTextAligned(PdfContentByte.ALIGN_LEFT, url,
                                       dimension.GetLeft(5), dimension.GetTop(8), 0);
                    cb.EndText();

                    // Import the original page and draw it onto the new page.
                    PdfImportedPage page = writer.GetImportedPage(reader, i);
                    cb.AddTemplate(page, 0, 0);
                }

                document.Close();
            }
        }
        finally
        {
            // Release the source file even when stamping failed.
            reader.Close();
        }

        File.Copy(newFile, oldFile, true);
    }
    finally
    {
        if (File.Exists(newFile))
        {
            File.Delete(newFile);
        }
    }
}
#endregion

//===================================================
}
}

// (Document-viewer footer, not part of the program: "You may also like".)