Skip to content

Commit d4b4fcd

Browse files
committed
Create MHTMLParser.cs
1 parent 838d8ff commit d4b4fcd

File tree

1 file changed

+315
-0
lines changed

1 file changed

+315
-0
lines changed

MHTMLParser.cs

Lines changed: 315 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,315 @@
1+
using System;
2+
using System.IO;
3+
using System.Text;
4+
using System.Text.RegularExpressions;
5+
using System.Collections.Generic;
6+
7+
/// <summary>
/// MHTMLParser decodes an MHTML (multipart MIME) archive held in a string.
/// Call decompressString() to split the archive into its decoded parts, or
/// getHTMLText() to build one static HTML document with inline images.
/// </summary>
public class MHTMLParser
{
    private const string BOUNDARY = "boundary";
    private const string CHAR_SET = "charset";
    private const string CONTENT_TYPE = "Content-Type";
    private const string CONTENT_TRANSFER_ENCODING = "Content-Transfer-Encoding";
    private const string CONTENT_DISPOSITION = "Content-Disposition";
    private const string FILE_NAME = "filename";
    private const string DEFAULT_CHARSET = "utf-8";

    // First line of a header field ("Name: value"). Anything else found in a
    // header section is treated as a continuation of the previous field.
    private static readonly Regex HEADER_LINE = new Regex(@"^([A-Za-z0-9\-]+)[ \t]*:(.*)$");

    private string mhtmlString; // the string we want to decode
    private readonly StringBuilder log = new StringBuilder(); // running log of the decoding process
    public bool decodeImageData; // decode image parts instead of keeping their raw payload?

    /*
     * Results of Conversion
     * This is split into a string[3] for each part
     * string[0] is the content type (media type only, parameters removed)
     * string[1] is the content name (filename from Content-Disposition, "" when absent)
     * string[2] is the converted data
     */
    public List<string[]> dataset;

    /// <summary>
    /// Creates a parser with an empty dataset and image decoding turned off.
    /// </summary>
    public MHTMLParser()
    {
        this.dataset = new List<string[]>();
        appendLog("Initialized dataset.\n");
        this.decodeImageData = false;
    }

    /// <summary>
    /// Creates a parser for the given MHTML text.
    /// </summary>
    /// <param name="mhtml">The MHTML archive text.</param>
    public MHTMLParser(string mhtml)
        : this()
    {
        setMHTMLString(mhtml);
    }

    /// <summary>
    /// Creates a parser for the given MHTML text with an explicit image decoding option.
    /// </summary>
    /// <param name="mhtml">The MHTML archive text.</param>
    /// <param name="decodeImages">True to decode image parts; must be false for getHTMLText().</param>
    public MHTMLParser(string mhtml, bool decodeImages)
        : this(mhtml)
    {
        this.decodeImageData = decodeImages;
    }

    /// <summary>
    /// Sets the MHTML text to decode. A null argument is logged and ignored,
    /// so any previously set text stays in place.
    /// </summary>
    /// <param name="mhtml">The MHTML archive text.</param>
    public void setMHTMLString(string mhtml)
    {
        if (mhtml == null)
        {
            appendLog("The mhtml string is null\n");
            return;
        }

        this.mhtmlString = mhtml;
        appendLog("Set mhtml string.\n");
    }

    /// <summary>
    /// Splits the archive into its MIME parts and decodes each one.
    /// The dataset is rebuilt on every call, so calling this repeatedly does
    /// not accumulate duplicate parts.
    /// </summary>
    /// <returns>The dataset: one string[3] of { type, name, data } per part.</returns>
    /// <exception cref="ArgumentNullException">No MHTML string has been set.</exception>
    /// <exception cref="FormatException">No boundary parameter was found in the text.</exception>
    public List<string[]> decompressString()
    {
        appendLog("Starting decompression \n");

        // dataset is a public field, so tolerate a caller having nulled it.
        if (this.dataset == null)
        {
            this.dataset = new List<string[]>();
        }
        else
        {
            this.dataset.Clear();
        }

        try
        {
            // StringReader throws ArgumentNullException when no string was set.
            using (StringReader reader = new StringReader(this.mhtmlString))
            {
                string boundary = getBoundary(reader);
                if (boundary == null)
                {
                    throw new FormatException("Failed to find string 'boundary'");
                }
                appendLog("Found boundary.\n");

                string delimiter = "--" + boundary;   // starts every part
                string terminator = delimiter + "--"; // ends the archive

                string type = "";
                string encoding = "";
                string filename = "";
                string charset = DEFAULT_CHARSET;
                string headerName = "";
                bool inHeaders = false;
                StringBuilder buffer = null; // null until the first delimiter is seen

                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    string trimmed = line.Trim();

                    if (trimmed == delimiter || trimmed == terminator)
                    {
                        if (buffer != null)
                        {
                            addPart(type, filename, encoding, charset, buffer);
                            buffer = null;
                        }
                        if (trimmed == terminator)
                        {
                            break; // everything after the terminator is epilogue
                        }

                        // Start a new part; header values must not leak between parts.
                        type = "";
                        encoding = "";
                        filename = "";
                        charset = DEFAULT_CHARSET;
                        headerName = "";
                        inHeaders = true;
                        buffer = new StringBuilder();
                        continue;
                    }

                    if (buffer == null)
                    {
                        continue; // preamble before the first delimiter
                    }

                    if (!inHeaders)
                    {
                        // Body lines are kept verbatim, even if they look like headers.
                        buffer.Append(line).Append('\n');
                        continue;
                    }

                    if (trimmed.Length == 0)
                    {
                        inHeaders = false; // a blank line separates headers from body
                        continue;
                    }

                    // A line that does not open a new field continues the previous one.
                    string value = trimmed;
                    bool startsField = false;
                    if (!char.IsWhiteSpace(line[0]))
                    {
                        Match field = HEADER_LINE.Match(trimmed);
                        if (field.Success)
                        {
                            headerName = field.Groups[1].Value;
                            value = field.Groups[2].Value.Trim();
                            startsField = true;
                        }
                    }

                    if (headerName.Equals(CONTENT_TYPE, StringComparison.OrdinalIgnoreCase))
                    {
                        if (startsField)
                        {
                            type = getMainValue(value);
                            appendLog("Got content type.\n");
                        }
                        string declaredCharset = getParameter(value, CHAR_SET);
                        if (!string.IsNullOrEmpty(declaredCharset))
                        {
                            charset = declaredCharset;
                            appendLog("Got charset.\n");
                        }
                    }
                    else if (headerName.Equals(CONTENT_TRANSFER_ENCODING, StringComparison.OrdinalIgnoreCase))
                    {
                        if (startsField)
                        {
                            encoding = getMainValue(value);
                            appendLog("Got encoding (" + encoding + ").\n");
                        }
                    }
                    else if (headerName.Equals(CONTENT_DISPOSITION, StringComparison.OrdinalIgnoreCase))
                    {
                        string declaredName = getParameter(value, FILE_NAME);
                        if (declaredName != null)
                        {
                            filename = declaredName;
                        }
                    }
                    // Every other header (Content-ID, Content-Location, ...) is ignored.
                }

                // Archive ended without a terminator: keep what was collected.
                if (buffer != null && buffer.Length > 0)
                {
                    addPart(type, filename, encoding, charset, buffer);
                }
            }
        }
        finally
        {
            appendLog("Closed Reader.\n");
        }

        return this.dataset;
    }

    // Decodes one collected part and appends it to the dataset.
    private void addPart(string type, string filename, string encoding, string charset, StringBuilder buffer)
    {
        string[] data = new string[3];
        data[0] = type;
        data[1] = filename;
        data[2] = writeBufferContent(buffer, encoding, charset, type, this.decodeImageData);
        this.dataset.Add(data);
        appendLog("Wrote Buffer Content and reset buffer.\n");
    }

    // Decodes a part body according to its transfer encoding. Image parts are
    // returned raw unless decodeImages is set; unknown encodings are returned raw.
    private string writeBufferContent(StringBuilder buffer, string encoding, string charset, string type, bool decodeImages)
    {
        appendLog("Start writing buffer contents.\n");
        string raw = buffer.ToString();

        if (type.IndexOf("image", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            appendLog("Image Data Detected.\n");
            if (!decodeImages)
            {
                appendLog("Skipping image decode.\n");
                return raw;
            }
        }

        if (encoding.Equals("base64", StringComparison.OrdinalIgnoreCase))
        {
            appendLog("base64 encoding detected.\n");
            try
            {
                byte[] bytes = Convert.FromBase64String(raw);
                string decoded = resolveEncoding(charset).GetString(bytes);
                appendLog("Got base64 decoded string.\n");
                return decoded;
            }
            catch (FormatException e)
            {
                // Best effort: hand back the undecoded payload rather than fail the whole archive.
                appendLog(e.Message + "\n");
                appendLog(e.StackTrace + "\n");
                appendLog("Data not Decoded.\n");
                return raw;
            }
        }

        if (encoding.Equals("quoted-printable", StringComparison.OrdinalIgnoreCase))
        {
            appendLog("Quoted-Prinatble string detected.\n");
            return decodeQuotedPrintable(raw, resolveEncoding(charset));
        }

        appendLog("Unknown Encoding.\n");
        return raw;
    }

    // Maps a charset name to an Encoding, falling back to UTF-8 when the name
    // is missing, unknown or unsupported on this platform.
    private Encoding resolveEncoding(string charset)
    {
        try
        {
            return Encoding.GetEncoding(charset);
        }
        catch (ArgumentException)
        {
            appendLog("Unknown charset (" + charset + "); using utf-8.\n");
            return Encoding.UTF8;
        }
    }

    /// <summary>
    /// Decodes a base64 string and converts the bytes to an ASCII string.
    /// Bytes outside the ASCII range are not preserved by this conversion.
    /// </summary>
    /// <param name="encodedData">The base64 text.</param>
    /// <returns>The decoded bytes interpreted as ASCII.</returns>
    /// <exception cref="FormatException">The input is not valid base64.</exception>
    public static string decodeFromBase64(string encodedData)
    {
        byte[] encodedDataAsBytes = Convert.FromBase64String(encodedData);
        return Encoding.ASCII.GetString(encodedDataAsBytes);
    }

    /// <summary>
    /// Decodes a quoted-printable string, interpreting escaped bytes as UTF-8.
    /// </summary>
    /// <param name="mimeString">The quoted-printable text; null is returned unchanged.</param>
    /// <returns>The decoded text.</returns>
    public string getQuotedPrintableString(string mimeString)
    {
        if (mimeString == null)
        {
            return null;
        }
        return decodeQuotedPrintable(mimeString, Encoding.UTF8);
    }

    // Quoted-printable decoder: "=XX" is one byte, "=" at the end of a line is a
    // soft line break. Escaped bytes are collected and decoded together so that
    // multi-byte characters survive, even when split across a soft line break.
    private static string decodeQuotedPrintable(string text, Encoding encoding)
    {
        StringBuilder result = new StringBuilder(text.Length);
        List<byte> pending = new List<byte>();
        int i = 0;

        while (i < text.Length)
        {
            char c = text[i];
            if (c == '=')
            {
                if (i + 1 < text.Length && text[i + 1] == '\n')
                {
                    i += 2; // soft line break (LF)
                    continue;
                }
                if (i + 2 < text.Length && text[i + 1] == '\r' && text[i + 2] == '\n')
                {
                    i += 3; // soft line break (CRLF)
                    continue;
                }
                if (i + 2 < text.Length)
                {
                    int high = hexValue(text[i + 1]);
                    int low = hexValue(text[i + 2]);
                    if (high >= 0 && low >= 0)
                    {
                        pending.Add((byte)(high * 16 + low));
                        i += 3;
                        continue;
                    }
                }
                // Malformed escape: fall through and keep the '=' literally.
            }

            if (pending.Count > 0)
            {
                result.Append(encoding.GetString(pending.ToArray()));
                pending.Clear();
            }
            result.Append(c);
            i++;
        }

        if (pending.Count > 0)
        {
            result.Append(encoding.GetString(pending.ToArray()));
        }
        return result.ToString();
    }

    // Numeric value of a hexadecimal digit, or -1 when c is not one.
    private static int hexValue(char c)
    {
        if (c >= '0' && c <= '9')
        {
            return c - '0';
        }
        if (c >= 'A' && c <= 'F')
        {
            return c - 'A' + 10;
        }
        if (c >= 'a' && c <= 'f')
        {
            return c - 'a' + 10;
        }
        return -1;
    }

    // Reads lines until one carries a non-empty boundary parameter and returns
    // its value (quoted or unquoted); returns null when none is found.
    private string getBoundary(StringReader reader)
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            string value = getParameter(line.Trim(), BOUNDARY);
            if (!string.IsNullOrEmpty(value))
            {
                return value;
            }
        }
        return null;
    }

    // Returns the value of parameter `name` in a header value such as
    // text/html; charset="utf-8" (quotes optional), or null when it is absent.
    private static string getParameter(string headerValue, string name)
    {
        Match match = Regex.Match(
            headerValue,
            @"(?:^|[;\s])" + Regex.Escape(name) + @"\s*=\s*(?:""([^""]*)""|([^;\s]*))",
            RegexOptions.IgnoreCase);
        if (!match.Success)
        {
            return null;
        }
        return match.Groups[1].Success ? match.Groups[1].Value : match.Groups[2].Value;
    }

    // Returns the part of a header value before its first ';', trimmed.
    private static string getMainValue(string headerValue)
    {
        int semicolon = headerValue.IndexOf(';');
        string main = semicolon >= 0 ? headerValue.Substring(0, semicolon) : headerValue;
        return main.Trim();
    }

    /// <summary>
    /// Builds an HTML page from the archive: all text/html parts are
    /// concatenated, then each "cid:" + filename reference is replaced with a
    /// base64 data URI built from the image part of that name.
    /// Image parts without a filename are not embedded.
    /// </summary>
    /// <returns>The combined HTML text.</returns>
    /// <exception cref="InvalidOperationException">Image decoding is turned on.</exception>
    public string getHTMLText()
    {
        if (this.decodeImageData)
        {
            throw new InvalidOperationException("Turn off image decoding for valid html output.");
        }

        List<string[]> data = this.decompressString();

        // First, write all html parts to the body.
        StringBuilder html = new StringBuilder();
        foreach (string[] part in data)
        {
            if (part[0].Equals("text/html", StringComparison.OrdinalIgnoreCase))
            {
                html.Append(part[2]);
                appendLog("Writing HTML Text\n");
            }
        }

        // Then go back and add the images in.
        string body = html.ToString();
        foreach (string[] part in data)
        {
            bool isImage = part[0].IndexOf("image", StringComparison.OrdinalIgnoreCase) >= 0;
            // An empty name would turn every bare "cid:" into image data.
            if (isImage && !string.IsNullOrEmpty(part[1]))
            {
                // The raw payload still contains its line breaks; remove them for the URI.
                string payload = Regex.Replace(part[2], @"\s+", "");
                body = body.Replace("cid:" + part[1], "data:" + part[0] + ";base64," + payload);
                appendLog("Overwriting HTML with image: " + part[1] + "\n");
            }
        }
        return body;
    }

    /// <summary>
    /// Gets the log from the decoding process.
    /// </summary>
    /// <returns>All log messages written so far.</returns>
    public string getLog()
    {
        return this.log.ToString();
    }

    // Appends one message to the log.
    private void appendLog(string message)
    {
        this.log.Append(message);
    }
}

0 commit comments

Comments
 (0)