【教程】将WordPress内容导出转化为知识库TXT

本文最后更新于 517 天前，其中的信息可能已经有所发展或是发生改变。

方法

用WP All Export Pro¹插件将文章导出为XLSX格式的文件，再用C#做一个程序：

安装依赖：

Html2Markdown
ExcelDataReader
System.Text.Encoding.CodePages

ExcelToTxt类：

using ExcelDataReader;
using System.IO;
using System.Text.RegularExpressions;
using Html2Markdown;

namespace TextConvert
{
    /// <summary>
    /// Converts an Excel export of posts into plain-text files, one file per
    /// worksheet row, with the HTML body reduced to a compact Markdown-like text.
    /// </summary>
    internal class ExcelToTxt
    {
        /// <summary>
        /// Reads every row of the first worksheet of <paramref name="excelFilePath"/>
        /// and writes it to <c>./txt/&lt;excel file name&gt;/&lt;id&gt;-&lt;title&gt;.txt</c>.
        /// Columns 0-7 are read as: ID, title, summary, creation date, link,
        /// category, tags and HTML content.
        /// </summary>
        /// <param name="excelFilePath">Path of the .xls/.xlsx file to convert.</param>
        public void ConvertToTxt(string excelFilePath)
        {
            string baseFolderPath = "./txt"; // "txt" folder under the current working directory
            string excelFileNameWithoutExtension = Path.GetFileNameWithoutExtension(excelFilePath);
            string secondLevelFolderPath = Path.Combine(baseFolderPath, excelFileNameWithoutExtension);

            using (var stream = File.Open(excelFilePath, FileMode.Open, FileAccess.Read))
            {
                using (var reader = ExcelReaderFactory.CreateReader(stream))
                {
                    // The target folder is the same for every row, so create it once.
                    // CreateDirectory is a no-op when the folder already exists.
                    Directory.CreateDirectory(secondLevelFolderPath);

                    // NOTE(review): the first row is processed like any other row. If the
                    // export contains a header row it will produce a TXT file as well —
                    // confirm whether it should be skipped.
                    while (reader.Read())
                    {
                        string articleId = GetValueAsString(reader, 0);
                        string articleTitle = GetValueAsString(reader, 1);
                        string articleSummary = GetValueAsString(reader, 2);
                        string articleCreateDate = GetValueAsString(reader, 3);
                        string articleLink = GetValueAsString(reader, 4);
                        string category = GetValueAsString(reader, 5);
                        string tags = GetValueAsString(reader, 6);
                        string rawContent = GetValueAsString(reader, 7);

                        // Skip completely blank rows; otherwise each one would be
                        // written to (and keep overwriting) a file named "-.txt".
                        string[] cells =
                        {
                            articleId, articleTitle, articleSummary, articleCreateDate,
                            articleLink, category, tags, rawContent
                        };
                        if (cells.All(string.IsNullOrWhiteSpace))
                        {
                            continue;
                        }

                        string articleContent = HtmlToMarkdown(rawContent);

                        string txtContent = $"文章ID：{articleId}\n文章标题：{articleTitle}\n文章摘要：{articleSummary}\n文章创建日期：{articleCreateDate}\n文章链接：{articleLink}\n分类目录：{category}\n标签：{tags}\n文章内容：\n{articleContent}";

                        string txtFileName = $"{SanitizeFileName(articleId)}-{SanitizeFileName(articleTitle)}.txt";

                        // Write the TXT file into the per-workbook folder.
                        string fullPath = Path.Combine(secondLevelFolderPath, txtFileName);
                        File.WriteAllText(fullPath, txtContent);
                    }
                }
            }
        }

        /// <summary>
        /// Removes every character that is not allowed in a file name on the
        /// current platform.
        /// </summary>
        /// <param name="name">Raw text to be used as part of a file name.</param>
        /// <returns><paramref name="name"/> without invalid file-name characters.</returns>
        private string SanitizeFileName(string name)
        {
            var invalidChars = Path.GetInvalidFileNameChars();
            return new string(name.Where(ch => !invalidChars.Contains(ch)).ToArray());
        }

        /// <summary>
        /// Returns the cell at <paramref name="columnIndex"/> of the current row as text.
        /// </summary>
        /// <param name="reader">Reader positioned on the row to read.</param>
        /// <param name="columnIndex">Zero-based column index.</param>
        /// <returns>
        /// The cell text, or an empty string when the cell is empty or the sheet
        /// has fewer columns than <paramref name="columnIndex"/> + 1.
        /// </returns>
        private string GetValueAsString(IExcelDataReader reader, int columnIndex)
        {
            // Sheets narrower than the expected layout would otherwise fail on indexing.
            if (columnIndex < 0 || columnIndex >= reader.FieldCount)
            {
                return string.Empty;
            }

            object value = reader.GetValue(columnIndex);
            if (value is double number)
            {
                // Invariant culture keeps numeric cells (e.g. IDs used in file names)
                // identical regardless of the machine's regional settings.
                return number.ToString(System.Globalization.CultureInfo.InvariantCulture);
            }

            return value?.ToString() ?? string.Empty;
        }

        /// <summary>
        /// Reduces an HTML fragment to compact Markdown-like text: runs the
        /// Html2Markdown converter, rewrites any remaining basic tags, strips all
        /// other tags, decodes common entities and removes blank lines.
        /// </summary>
        /// <param name="html">HTML fragment; <c>null</c> or empty yields an empty string.</param>
        /// <returns>The trimmed text.</returns>
        public static string HtmlToMarkdown(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return string.Empty;
            }

            const RegexOptions TagOptions = RegexOptions.IgnoreCase;
            // Singleline lets element content span several lines.
            const RegexOptions ElementOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

            // Drop opening <p> tags. "\b" keeps <pre>, <param>, <picture>... intact,
            // which the previous "<p.*?>" pattern also removed.
            string markdown = Regex.Replace(html, @"<p\b[^>]*>", "", TagOptions);

            var converter = new Converter();
            markdown = converter.Convert(markdown);

            // Closing </p> tags become paragraph breaks.
            markdown = Regex.Replace(markdown, @"</p>", "\n\n", TagOptions);

            // <strong> -> Markdown bold
            markdown = Regex.Replace(markdown, @"<strong\b[^>]*>(.*?)</strong>", "**$1**", ElementOptions);

            // <em> -> Markdown italic ("\b" prevents matching <embed>)
            markdown = Regex.Replace(markdown, @"<em\b[^>]*>(.*?)</em>", "_$1_", ElementOptions);

            // List containers are dropped; each <li> becomes its own "- item" line
            // ("\b" prevents matching <link>).
            markdown = Regex.Replace(markdown, @"</?(?:ul|ol)\b[^>]*>", "", TagOptions);
            markdown = Regex.Replace(markdown, @"<li\b[^>]*>(.*?)</li>", "- $1\n", ElementOptions);

            // <h1>..<h6> -> "#".."######" followed by a space, on its own line.
            markdown = Regex.Replace(
                markdown,
                @"<h([1-6])\b[^>]*>(.*?)</h\1>",
                match =>
                {
                    int level = match.Groups[1].Value[0] - '0';
                    return new string('#', level) + " " + match.Groups[2].Value.Trim() + "\n";
                },
                ElementOptions);

            // Remove every remaining HTML tag.
            markdown = Regex.Replace(markdown, @"<[^>]*>", "", TagOptions);

            // Decode common HTML entities. "&amp;" must be handled last so that an
            // escaped entity such as "&amp;lt;" is decoded once ("&lt;") and not twice ("<").
            markdown = Regex.Replace(markdown, @"&nbsp;", " ", TagOptions);
            markdown = Regex.Replace(markdown, @"&lt;", "<", TagOptions);
            markdown = Regex.Replace(markdown, @"&gt;", ">", TagOptions);
            markdown = Regex.Replace(markdown, @"&quot;", "\"", TagOptions);
            markdown = Regex.Replace(markdown, @"&#39;", "'", TagOptions);
            markdown = Regex.Replace(markdown, @"&amp;", "&", TagOptions);

            // Remove blank (whitespace-only) lines.
            markdown = Regex.Replace(markdown, @"^\s*\n", "", RegexOptions.Multiline);

            return markdown.Trim();
        }
    }
}

调用方法：

using System.IO;
using System.Text;
using System.Windows;

/// <summary>
/// Converts every Excel file (.xls / .xlsx) found directly inside the
/// <c>./excel</c> folder. If the folder does not exist yet it is created and
/// nothing is converted.
/// </summary>
public void Main()
{
    // Register the code-page encoding provider.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    ExcelToTxt excelToTxt = new ExcelToTxt();
    string directoryPath = "./excel"; // "excel" folder under the current directory
    string[] excelExtensions = { ".xls", ".xlsx" }; // accepted Excel file extensions

    // Create the input folder on first run; there is nothing to convert yet.
    if (!Directory.Exists(directoryPath))
    {
        Directory.CreateDirectory(directoryPath);
        return;
    }

    // Compare whole extensions, case-insensitively. The previous substring test
    // against ".xls|.xlsx" also accepted files without an extension and partial
    // extensions such as ".x" or ".xl".
    string[] excelFiles = Directory.GetFiles(directoryPath, "*.*", SearchOption.TopDirectoryOnly)
        .Where(file => excelExtensions.Any(extension =>
            string.Equals(extension, Path.GetExtension(file), System.StringComparison.OrdinalIgnoreCase)))
        .ToArray();

    foreach (string excelFile in excelFiles)
    {
        excelToTxt.ConvertToTxt(excelFile);
    }
}

由于导出的文章内容是HTML格式，所以先将其转换为Markdown以减少文字量，再去除无用的标记符号和多余的换行。这样得到的TXT就可以放入知识库中进行转化了。可以试试小站右侧的AI小助手，我就是这样做的。

脚注

下载链接：
分享名称：WP All Export Pro
分享链接：https://kb.itpno.com/#s/-b0tgO9w
访问密码：iTPno. ↩︎

发送评论编辑评论

方法

脚注

发送评论 编辑评论

推荐文章

发送评论编辑评论