li.的博客
li.的首页 > li.的博客 > 浏览文章

C#检查HTML是否闭合以及自动修复代码

分类:.net应用  人气:3988  评论:1  时间:2012-03-05 21:42
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.Diagnostics;

namespace 查HTML是否闭合
{
    class TagsList
    {
        private ArrayList data;

        public int Size
        {
            get
            {
                return data.Count;
            }
        }

        public TagsList()
        {
            data = new ArrayList();
        }

        public void add(String str)
        {
            data.Add(str);
        }

        public string get(int index)
        {
            if (index < data.Count)
                return (string)data[index];
            else
                return null;
        }

        public bool remove(string str)
        {
            if (data.IndexOf(str) == -1) return false;
            data.Remove(str);
            return true;
        }

        public void remove(int index)
        {
            data.RemoveAt(index);
        }
    }

    public class TagsChecker
    {
        public static bool check(string str)
        {
            TagsList[] unclosedTags = getUnclosedTags(str);

            if (unclosedTags[0].Size != 0)
            {
                return false;
            }
            for (int i = 0; i < unclosedTags[1].Size; i++)
            {
                if (unclosedTags[1].get(i) != null)
                    return false;
            }

            return true;
        }

        public static string fix(String str)
        {
            StringBuilder fixeds = new StringBuilder(); // 存放修复后的字符串
            TagsList[] unclosedTags = getUnclosedTags(str);

            // 生成新字符串
            for (int i = unclosedTags[0].Size - 1; i > -1; i--)
            {
                fixeds.Append("<" + unclosedTags[0].get(i) + ">");
            }

            fixeds.Append(str);

            for (int i = unclosedTags[1].Size - 1; i > -1; i--)
            {
                String s = null;
                if ((s = unclosedTags[1].get(i)) != null)
                {
                    fixeds.Append("</" + s + ">");
                }
            }

            return fixeds.ToString();
        }

        private static TagsList[] getUnclosedTags(String str)
        {
            StringBuilder temp = new StringBuilder(); // 存放标签
            TagsList[] unclosedTags = new TagsList[2];
            unclosedTags[0] = new TagsList(); // 前不闭合,如有</div>而前面没有<div>
            unclosedTags[1] = new TagsList(); // 后不闭合,如有<div>而后面没有</div>
            bool flag = false; // 记录双引号"或单引号'
            char currentJump = ' '; // 记录需要跳过''还是""

            char current = ' ', last = ' '; // 当前 & 上一个

            // 开始判断
            for (int i = 0; i < str.Length; )
            {
                current = str[i++]; // 读取一个字符
                if (current == '"' || current == '\'')
                {
                    flag = flag ? false : true; // 若为引号,flag翻转
                    currentJump = current;
                    if (flag)
                    {
                        while (i < str.Length && str[i++] != currentJump)
                            ; // 跳过引号之间的部分
                        flag = false;
                    }
                }
                else if (current == '<')
                { // 开始提取标签
                    current = str[i++];
                    if (current == '/')
                    { // 标签的闭合部分,如</div>
                        current = str[i++];

                        // 读取标签
                        while (i < str.Length && current != '>')
                        {
                            temp.Append(current);
                            current = str[i++];
                        }

                        // 从tags_bottom移除一个闭合的标签
                        if (!unclosedTags[1].remove(temp.ToString()))
                        { // 若移除失败,说明前面没有需要闭合的标签
                            unclosedTags[0].add(temp.ToString()); // 此标签需要前闭合
                        }
                        temp.Remove(0, temp.Length); // 清空temp
                    }
                    else
                    { // 标签的前部分,如<div>
                        last = current;
                        while (i < str.Length && current != ' '
                                && current != ' ' && current != '>')
                        {
                            temp.Append(current);
                            last = current;
                            current = str[i++];
                        }

                        // 已经读取到标签,跳过其他内容,如<div id=test>跳过id=test
                        while (i < str.Length && current != '>')
                        {
                            last = current;
                            current = str[i++];
                            if (current == '"' || current == '\'')
                            { // 判断双引号
                                flag = flag ? false : true;
                                currentJump = current;
                                if (flag)
                                { // 若引号不闭合,跳过到下一个引号之间的内容
                                    while (i < str.Length && str[i++] != currentJump)
                                        ;
                                    current = str[i++];
                                    flag = false;
                                }
                            }
                        }
                        if (last != '/' && current == '>') // 判断这种类型:<TagName />
                            unclosedTags[1].add(temp.ToString());
                        temp.Remove(0, temp.Length);
                    }
                }
            }
            return unclosedTags;
        }
    }

    class Program
    {
        static void Main(string[] args)
        {
            Console.WriteLine("--功能测试--");
            //string str1 = "tt</u>ss</a>aa<div name=\"<test>\" id='3' other='<test>'><b>sff";
            string str1 = "<p>tt<table><tr><td>kdkf</td></tr><tr><td>323</td></p>";
            string str2 = "tt<u>ss</u><div id=test name=\"<test>\"><a>fds</a></div>";
            Console.WriteLine("检查文本 " + str1);
            Console.WriteLine("结果:" + TagsChecker.check(str1));
            Console.WriteLine("检查文本 " + str2);
            Console.WriteLine("结果:" + TagsChecker.check(str2));
            Console.WriteLine("修复文本 " + str1);
            Console.WriteLine("结果:" + TagsChecker.fix(str1));

            for (int i = 0; i < 10; i++)
            {
                str1 += str1;
            }

            Console.WriteLine();
            Console.WriteLine("--效率测试--");
            Console.WriteLine("文本长度:" + str1.Length);

            long t1 = DateTime.Now.Ticks;
            bool closed = TagsChecker.check(str1);
            long t2 = DateTime.Now.Ticks;
            String fixedStr = TagsChecker.fix(str1);
            long t3 = DateTime.Now.Ticks;
            Console.WriteLine("检查用时:" + (t2 - t1) + " 毫秒 结果:" + closed);
            Console.WriteLine("修复用时:" + (t3 - t2) + " 毫秒");
        }
    }
}
标签(Tag):c# asp.net 闭合标签 html
评论(1)
li.
li.2015年6月23日
本代码仍有不完美之处
我来评论
(800字以内)