/// <summary>
/// Walks <paramref name="text"/> one character at a time and builds a copy in which
/// plain-text shorthand (quotes, dashes, arrows, <c>(c)</c>, ordinals, <c>[references]</c>,
/// backquoted code, <c>*emphasis*</c> and so on) is replaced by its typographic or HTML form.
/// </summary>
/// <param name="text">Initial text. Must not be <c>null</c>.</param>
/// <param name="options">Conversion options. See <see cref="T:DmitriNesteruk.TypograFixImpl.ConversionOptions"/>.</param>
/// <param name="timeoutMsec">The timeout value in milliseconds. It is checked at the end of each loop iteration.</param>
/// <param name="createReferences">
/// When <c>true</c> the shared reference list is replaced with a fresh one before processing.
/// The nested call made for the text of a <c>[reference]</c> passes <c>false</c> so that the
/// list being built by the outer call is kept. If in doubt, set to <c>true</c>.
/// </param>
/// <returns>The altered text, or an empty string when <paramref name="text"/> is empty.</returns>
/// <remarks>
/// The method resets the shared <c>context</c>, <c>backquoteOpen</c> and <c>emphasisOpen</c> members
/// on entry, so the nested call made while handling a reference saves and restores them.
/// </remarks>
/// <exception cref="Exception">May be thrown if the source HTML is malformed.</exception>
/// <exception cref="TimeoutException">Thrown if execution time exceeds <paramref name="timeoutMsec"/>.</exception>
private static string Transform([NotNull]string text, ConversionOptions options,
    [GreaterThan(0)]int timeoutMsec, bool createReferences)
{
    int len = text.Length;
    if (len == 0)
        return string.Empty;

    Stopwatch st = new Stopwatch();
    st.Start();

    HtmlBuilder hb = new HtmlBuilder((int)(len * 1.5 + 1));
    context = new Stack<string>();
    if (createReferences)
        references = new List<string>();
    backquoteOpen = emphasisOpen = false;
    Stack<char> quoteStack = new Stack<char>();

    // Hoisted so the per-character membership tests below do not allocate strings.
    string newLineChars = Environment.NewLine;
    string blankChars = newLineChars + " \t";

    for (int i = 0; i < len; ++i)
    {
        switch (text[i])
        {
            case '{':
                // "{{" opens a <pre> block.
                if (CannotReplace || !options.DoubleCurlyToPre || i + 1 == len) goto default;
                if (text[i + 1] == '{')
                {
                    hb.Append("<pre>");
                    ++i;
                    context.Push("pre");
                    break;
                }
                goto default;

            case '}':
                // "}}" closes a <pre> block, but only when "pre" is the innermost context entry.
                if ((context.Count == 0 || context.Peek() != "pre") || !options.DoubleCurlyToPre || i + 1 == len) goto default;
                if (text[i + 1] == '}')
                {
                    hb.Append("</pre>");
                    ++i;
                    context.Pop();
                    break;
                }
                goto default;

            case '~':
                // "~=" becomes the approximately-equal sign.
                if (CannotReplace || !options.ConvertApprox) goto default;
                if (i + 1 < len && text[i + 1] == '=')
                {
                    hb.Append("≈");
                    ++i;
                    break;
                }
                goto default;

            case '[':
            {
                // "[some text]" becomes a numbered link to #ReferenceN.
                if (CannotReplace || options.ReferenceStyle == ReferenceStyle.None)
                    goto default;

                // look for the closing bracket
                int idxClosing = -1;
                for (int a = i; a < len; ++a)
                {
                    if (text[a] == ']')
                    {
                        idxClosing = a;
                        break;
                    }
                }
                if (idxClosing == -1 || idxClosing == i + 1)
                    goto default;

                // grab the content string
                string reference = text.Substring(i + 1, idxClosing - i - 1);

                // The list is only created on entry when createReferences is true;
                // make sure there is one before adding to it.
                if (references == null)
                    references = new List<string>();
                references.Add(reference);

                // The nested call resets the shared parsing state on entry, so keep
                // this pass's state and put it back once the nested call is done.
                Stack<string> savedContext = context;
                bool savedBackquoteOpen = backquoteOpen;
                bool savedEmphasisOpen = emphasisOpen;
                string transformedReference;
                try
                {
                    transformedReference = Transform(reference, options, 5000, false);
                }
                finally
                {
                    context = savedContext;
                    backquoteOpen = savedBackquoteOpen;
                    emphasisOpen = savedEmphasisOpen;
                }

                // Strip tags so the result can be used as a title attribute.
                string flattened = Regex.Replace(transformedReference, @"<(.|\n)*?>", string.Empty);

                switch (options.ReferenceStyle)
                {
                    default:
                        hb.Append(string.Format("[<a href=\"#Reference{0}\" title=\"{1}\">{0}</a>]",
                            references.Count, flattened));
                        break;
                    case ReferenceStyle.Superscript:
                        hb.Append(string.Format("<sup><small><a href=\"#Reference{0}\" title=\"{1}\">{0}</a></small></sup>",
                            references.Count, flattened));
                        break;
                }
                i = idxClosing;
                break;
            }

            case '`':
                // Backquotes toggle a <code> span.
                if ((InCode && !backquoteOpen) || !options.BackquoteToCode)
                    goto default;
                if (backquoteOpen)
                {
                    hb.Append("</code>");
                    context.Pop();
                }
                else
                {
                    hb.Append("<code>");
                    context.Push("code");
                }
                backquoteOpen = !backquoteOpen;
                break;

            case '*':
                // "*" toggles <em>, "**" toggles <strong>.
                // NOTE(review): a single flag tracks both, so the closing tag is chosen from
                // the closing marker alone and may not match the tag that was opened.
                if (InCode) goto default;
                if (emphasisOpen)
                {
                    if (i + 1 != len && text[i + 1] == '*')
                    {
                        hb.Append("</strong>");
                        i++;
                    }
                    else
                        hb.Append("</em>");
                    emphasisOpen = !emphasisOpen;
                    continue;
                }
                // Only open when the asterisk does not directly follow a letter or digit.
                if (i == 0 || !char.IsLetterOrDigit(text[i - 1]))
                {
                    if (i + 1 != len && text[i + 1] == '*')
                    {
                        hb.Append("<strong>");
                        i++;
                    }
                    else
                        hb.Append("<em>");
                    emphasisOpen = true;
                    continue;
                }
                goto default;

            case '<':
            {
                // Inside code, a '<' that does not start a closing tag is escaped when requested.
                if (i + 1 != len && text[i + 1] != '/' && InCode && options.EscapeLessGreaterInCode)
                {
                    hb.Append("&lt;");
                    continue;
                }

                // Otherwise copy the tag through verbatim while tracking it on the context stack.
                hb.Append('<');
                StringBuilder tb = new StringBuilder();
                while (++i < len)
                {
                    hb.Append(text[i]);
                    if (text[i] == '\"')
                    {
                        // The opening quote has been appended; copy everything up to and
                        // including the closing quote, stopping at the end of the text if
                        // the attribute value is never closed.
                        while (i + 1 < len)
                        {
                            ++i;
                            hb.Append(text[i]);
                            if (text[i] == '\"')
                                break;
                        }
                    }
                    else if (char.IsWhiteSpace(text[i]))
                    {
                        // we're in this tag's context
                        // NOTE(review): tb is not reset here, so a tag with attributes is pushed
                        // again (with the attribute text) when '>' is reached; confirm intent.
                        context.Push(tb.ToString());
                    }
                    else if (text[i] == '/')
                    {
                        tb = new StringBuilder();
                        // The stack can be empty when the slash belongs to a tag such as <br/>.
                        if (context.Count > 0)
                            context.Pop();
                        // copy everything up to '>' and let the default branch emit the '>' itself
                        while (++i < len)
                        {
                            if (text[i] == '>')
                                goto default;
                            hb.Append(text[i]);
                        }
                    }
                    else if (text[i] == '>')
                    {
                        // tag closed: record it and stop scanning
                        context.Push(tb.ToString());
                        break;
                    }
                    else
                        tb.Append(text[i]);
                }
                break;
            }

            case '>':
                // Inside code, escape '>' when requested (same condition as for '<').
                if (i + 1 != len && text[i + 1] != '/' && InCode && options.EscapeLessGreaterInCode)
                {
                    hb.Append("&gt;");
                    continue;
                }
                goto default;

            case '-':
                if (CannotReplace) goto default;
                // a dash surrounded by spaces
                if (i > 0 && i + 1 < len && text[i - 1] == ' ' && text[i + 1] == ' ')
                    hb.Append("–");
                // "--" not followed by '&'
                else if (i + 1 < len && text[i + 1] == '-' &&
                         (i + 2 == len || text[i + 2] != '&'))
                {
                    hb.Append("—");
                    ++i; // ignore the second dash
                }
                // "--&gt;" is a right arrow
                else if (i + 5 < len &&
                         text[i + 1] == '-' &&
                         text[i + 2] == '&' &&
                         text[i + 3] == 'g' &&
                         text[i + 4] == 't' &&
                         text[i + 5] == ';')
                {
                    hb.Append("→");
                    i += 5;
                }
                else goto default;
                break;

            case '=':
                // "==&gt;" is a double right arrow
                if (CannotReplace) goto default;
                if (i + 5 < len &&
                    text[i + 1] == '=' &&
                    text[i + 2] == '&' &&
                    text[i + 3] == 'g' &&
                    text[i + 4] == 't' &&
                    text[i + 5] == ';')
                {
                    hb.Append("⇒");
                    i += 5;
                    break;
                }
                goto default;

            case '(':
                // (c), (r) and (tm), case-insensitive
                if (CannotReplace) goto default;
                if (i + 2 < len && text[i + 2] == ')')
                {
                    if (char.ToLower(text[i + 1]) == 'c')
                    {
                        hb.Append("©");
                        i += 2;
                    }
                    else if (char.ToLower(text[i + 1]) == 'r')
                    {
                        hb.Append("®");
                        i += 2;
                    }
                    else goto default;
                }
                else if (i + 3 < len && char.ToLower(text[i + 1]) == 't'
                         && char.ToLower(text[i + 2]) == 'm'
                         && char.ToLower(text[i + 3]) == ')')
                {
                    hb.Append("™");
                    i += 3;
                }
                else goto default;
                break;

            case '\'': // single quotes
                if (CannotReplace) goto default;
                // between two letters/digits it is an apostrophe; the quote stack is not touched
                if (i > 0 && i + 1 < len
                    && char.IsLetterOrDigit(text[i - 1])
                    && char.IsLetterOrDigit(text[i + 1]))
                {
                    hb.Append("’"); // stack ignored
                }
                else if (quoteStack.Count > 0 && quoteStack.Peek() == '\'')
                {
                    hb.Append("’");
                    quoteStack.Pop();
                }
                else
                {
                    // no single quote on top of the quote stack:
                    // it's anyone's guess whether the quote is opening or closing
                    if (i + 1 < len && text[i + 1] == ' ')
                        hb.Append("’");
                    else
                        hb.Append("‘");
                    quoteStack.Push('\'');
                }
                break;

            case '\"': // double quotes
                if (CannotReplace) goto default;
                // if there is a double quote on top of the stack, close it
                if (quoteStack.Count > 0 && quoteStack.Peek() == '\"')
                {
                    if (options.AngledDoubleQuotes)
                        hb.Append("»");
                    else
                        hb.Append("”");
                    quoteStack.Pop();
                }
                else
                {
                    if (options.AngledDoubleQuotes)
                        hb.Append("«");
                    else
                        hb.Append("“");
                    quoteStack.Push('\"');
                }
                break;

            case '.': // possible ellipsis
                if (i + 2 < len && text[i + 1] == '.' && text[i + 2] == '.')
                {
                    if (i > 0)
                    {
                        // The ellipsis becomes a vertical one only when nothing but blanks
                        // shares its line, looking backwards and forwards up to a line break.
                        // NOTE(review): this happens before the CannotReplace check below.
                        bool isValidFront = true;
                        bool isValidBack = true;
                        for (int b = i - 1; b >= 0; --b)
                        {
                            isValidBack &= blankChars.IndexOf(text[b]) >= 0;
                            if (newLineChars.IndexOf(text[b]) >= 0 || !isValidBack) break;
                        }
                        for (int f = i + 3; f < len; ++f)
                        {
                            isValidFront &= blankChars.IndexOf(text[f]) >= 0;
                            if (newLineChars.IndexOf(text[f]) >= 0 || !isValidFront) break;
                        }
                        if (isValidFront && isValidBack)
                        {
                            hb.Append("⋮");
                            i += 2;
                            continue;
                        }
                    }
                    if (CannotReplace) goto default;
                    hb.Append("…");
                    i += 2;
                }
                else goto default;
                break;

            case 'x':
                // A digit followed by 'x' and then a digit, whitespace, '<' or end of text
                // is a multiplication sign.
                if (CannotReplace) goto default;
                if (i > 0 && char.IsDigit(text[i - 1])
                    &&
                    ((
                         i + 1 < len &&
                         (char.IsDigit(text[i + 1]) || char.IsWhiteSpace(text[i + 1]) || text[i + 1] == '<')
                     ) || (i + 1 == len)))
                {
                    hb.Append("×");
                }
                else goto default;
                break;

            case '&':
                if (CannotReplace) goto default;
                // an ampersand followed by '<' or a space cannot start an entity: escape it
                if (i + 1 < len &&
                    (text[i + 1] == '<' || text[i + 1] == ' '))
                {
                    hb.Append("&amp;");
                }
                else if (i + 5 < len)
                {
                    // "&lt;--" and "&lt;==" are left arrows
                    if (text[i + 1] == 'l' && text[i + 2] == 't' && text[i + 3] == ';')
                    {
                        if (text[i + 4] == '-' && text[i + 5] == '-')
                        {
                            hb.Append("←");
                            i += 5;
                        }
                        else if (text[i + 4] == '=' && text[i + 5] == '=')
                        {
                            hb.Append("⇐");
                            i += 5;
                        }
                        else goto default;
                    }
                    else goto default;
                }
                else goto default;
                break;

            case '+':
                // "+-" becomes plus-minus
                if (CannotReplace || !options.ConvertPM || i + 1 == len || text[i + 1] != '-')
                    goto default;
                hb.Append("±");
                i++;
                break;

            case '^':
                // "^1", "^2", "^3" become the matching superscript entity
                if (CannotReplace || !options.ConvertSuper || i + 1 == len || "123".IndexOf(text[i + 1]) == -1)
                    goto default;
                hb.Append("&sup" + text[i + 1] + ";");
                i++;
                break;

            case '#':
            {
                // Diacritics shorthand: letter, '#', digit (e.g. "o#2") rewrites the letter
                // that was just written to the buffer.
                if (CannotReplace || options.DiactricsOptions == DiactricsOptions.None
                    || i == 0 || i + 1 == len)
                    goto default;
                if (options.DiactricsOptions == DiactricsOptions.Swedish)
                {
                    char letter = text[i - 1];
                    char selector = text[i + 1];
                    char accented = '\0';
                    if (selector == '2')
                    {
                        if (letter == 'o') accented = '\u00F6';      // o with diaeresis
                        else if (letter == 'O') accented = '\u00D6'; // O with diaeresis
                        else if (letter == 'a') accented = '\u00E4'; // a with diaeresis
                        else if (letter == 'A') accented = '\u00C4'; // A with diaeresis
                    }
                    else if (selector == '1')
                    {
                        if (letter == 'a') accented = '\u00E5';      // a with ring above
                        else if (letter == 'A') accented = '\u00C5'; // A with ring above
                    }
                    if (accented != '\0')
                    {
                        // alter last character in buffer
                        hb.Replace(letter, accented, hb.Length - 1, 1);
                        ++i;
                        continue;
                    }
                }
                // Not a recognised combination: keep the '#' instead of dropping it.
                goto default;
            }

            case '@':
                if (CannotReplace || !options.ObfuscateAt)
                    goto default;
                hb.Append("/aτ/");
                break;

            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            case '0':
            {
                // Ordinals: a run of digits/commas followed by st, nd, rd or th gets a
                // superscript suffix.
                if (CannotReplace) goto default;
                // keep accumulating while digit (or comma)
                int j;
                StringBuilder acc = new StringBuilder();
                for (j = i; j < len; ++j)
                {
                    if (char.IsDigit(text[j]) || text[j] == ',')
                        acc.Append(text[j]);
                    else break;
                }
                // need two more characters after the run for the suffix
                if (j + 1 >= len)
                    goto default;

                string accStr = acc.ToString();
                char lastChar = accStr[accStr.Length - 1];
                string ordinal = null;
                if (lastChar == '1' && text[j] == 's' && text[j + 1] == 't')
                    ordinal = "st";
                else if (lastChar == '2' && text[j] == 'n' && text[j + 1] == 'd')
                    ordinal = "nd";
                else if (lastChar == '3' && text[j] == 'r' && text[j + 1] == 'd')
                    ordinal = "rd";
                else if (text[j] == 't' && text[j + 1] == 'h')
                    ordinal = "th";

                if (ordinal == null)
                    goto default;

                hb.Append(accStr + "<sup>" + ordinal + "</sup>");
                // skip the rest of the run and the two suffix letters
                i += acc.Length + 1;
                break;
            }

            // Reverse substitutions - in case you paste from Word.
            // NOTE(review): each branch re-emits the character it matched; these may once have
            // emitted named entities - confirm against the original source file.
            case '\u2018': // left single quotation mark
                if (CannotReplace) goto default;
                hb.Append("‘");
                break;
            case '\u2019': // right single quotation mark
                if (CannotReplace) goto default;
                hb.Append("’");
                break;
            case '\u201C': // left double quotation mark
                if (CannotReplace) goto default;
                hb.Append("“");
                break;
            case '\u201D': // right double quotation mark
                if (CannotReplace) goto default;
                hb.Append("”");
                break;
            case '\u2026': // horizontal ellipsis
                if (CannotReplace) goto default;
                hb.Append("…");
                break;
            case '\u2013': // en dash
                if (CannotReplace) goto default;
                hb.Append("–");
                break;

            default:
                // Control characters other than line breaks are only copied inside the
                // no-fly zone (i.e., code, pre, script tags); everything else is copied as is.
                if (CannotReplace || !char.IsControl(text[i]) || newLineChars.IndexOf(text[i]) >= 0)
                    hb.Append(text[i]);
                break;
        }

        if (st.ElapsedMilliseconds > timeoutMsec)
        {
            st.Stop();
            throw new TimeoutException();
        }
    }

    st.Stop();
    return hb.ToString();
}
Refactorings
No refactorings yet!
Nik Radford
February 26, 2009 14:17, permalink
Is this just trying to html encode strings?
The code below is evil. It's a massive state machine that needs to be refactored to F#, WF, or anything, so long as it becomes more manageable.