本文最后更新于 181 天前,其中的信息可能已经有所发展或是发生改变。
方法
用 WP All Export Pro 插件(下载方式见文末脚注 1)将文章导出为 XLSX 格式的文件,再用 C# 编写一个转换程序:
安装依赖:
Html2Markdown
ExcelDataReader
System.Text.Encoding.CodePages
ExcelToTxt类:
using System;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
using ExcelDataReader;
using Html2Markdown;
namespace TextConvert
{
/// <summary>
/// Converts each row of an Excel workbook into a plain-text article file written to
/// ./txt/[workbook name]/[id]-[title].txt.
/// </summary>
internal class ExcelToTxt
{
    // Root output folder. It is a relative path, so it resolves against the current working directory.
    private const string BaseFolderPath = "./txt";

    // Columns read from every row, in order: ID, title, summary, creation date, link, category, tags, content.
    private const int ColumnCount = 8;

    // Options shared by the tag-level regex passes; Singleline lets a tag's content span several lines.
    private const RegexOptions TagOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    /// <summary>
    /// Converts every row of the workbook, including the first one, into a text file.
    /// </summary>
    /// <param name="excelFilePath">Path of the .xls/.xlsx file to read.</param>
    /// <exception cref="ArgumentNullException"><paramref name="excelFilePath"/> is null.</exception>
    public void ConvertToTxt(string excelFilePath)
    {
        ConvertToTxt(excelFilePath, false);
    }

    /// <summary>
    /// Converts the rows of the workbook into text files, optionally skipping the first row.
    /// </summary>
    /// <param name="excelFilePath">Path of the .xls/.xlsx file to read.</param>
    /// <param name="skipHeaderRow">
    /// When true the first row is not converted; use this when the export contains a row of column titles.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="excelFilePath"/> is null.</exception>
    public void ConvertToTxt(string excelFilePath, bool skipHeaderRow)
    {
        if (excelFilePath == null)
        {
            throw new ArgumentNullException(nameof(excelFilePath));
        }

        // One sub-folder per workbook, named after the workbook file (without extension).
        string outputFolder = Path.Combine(BaseFolderPath, Path.GetFileNameWithoutExtension(excelFilePath));

        using (var stream = File.Open(excelFilePath, FileMode.Open, FileAccess.Read))
        using (var reader = ExcelReaderFactory.CreateReader(stream))
        {
            // Created once, and only after the workbook opened successfully. CreateDirectory is a no-op
            // when the folder already exists.
            Directory.CreateDirectory(outputFolder);

            if (skipHeaderRow)
            {
                reader.Read();
            }

            while (reader.Read())
            {
                string[] fields = new string[ColumnCount];
                for (int i = 0; i < ColumnCount; i++)
                {
                    fields[i] = GetValueAsString(reader, i);
                }

                // A completely blank row would otherwise be written out as "-.txt".
                if (fields.All(string.IsNullOrWhiteSpace))
                {
                    continue;
                }

                string articleId = fields[0];
                string articleTitle = fields[1];
                string articleSummary = fields[2];
                string articleCreateDate = fields[3];
                string articleLink = fields[4];
                string category = fields[5];
                string tags = fields[6];
                string articleContent = HtmlToMarkdown(fields[7]);

                string txtContent = $"文章ID:{articleId}\n文章标题:{articleTitle}\n文章摘要:{articleSummary}\n文章创建日期:{articleCreateDate}\n文章链接:{articleLink}\n分类目录:{category}\n标签:{tags}\n文章内容:\n{articleContent}";
                string txtFileName = $"{SanitizeFileName(articleId)}-{SanitizeFileName(articleTitle)}.txt";

                File.WriteAllText(Path.Combine(outputFolder, txtFileName), txtContent);
            }
        }
    }

    /// <summary>
    /// Removes every character that is not allowed in a file name on the current platform.
    /// </summary>
    private static string SanitizeFileName(string name)
    {
        char[] invalidChars = Path.GetInvalidFileNameChars();
        return new string(name.Where(ch => !invalidChars.Contains(ch)).ToArray());
    }

    /// <summary>
    /// Reads one cell of the current row as text.
    /// </summary>
    /// <returns>
    /// The cell text, or an empty string when the cell is empty or the row has no such column.
    /// </returns>
    private static string GetValueAsString(IExcelDataReader reader, int columnIndex)
    {
        // Rows may be shorter than ColumnCount; treat missing columns as empty.
        if (columnIndex < 0 || columnIndex >= reader.FieldCount)
        {
            return string.Empty;
        }

        object value = reader.GetValue(columnIndex);
        if (value == null)
        {
            return string.Empty;
        }

        // Format doubles with the invariant culture so the decimal separator does not depend on the machine locale.
        if (value is double number)
        {
            return number.ToString(CultureInfo.InvariantCulture);
        }

        return value.ToString() ?? string.Empty;
    }

    /// <summary>
    /// Converts an HTML fragment to compact Markdown-flavoured plain text.
    /// </summary>
    /// <remarks>
    /// The fragment is passed through the Html2Markdown converter; the regex passes that follow handle
    /// any of the listed tags still present in the converter's output, strip all remaining tags,
    /// decode HTML entities and finally remove blank lines.
    /// </remarks>
    /// <param name="html">HTML fragment; null or empty yields an empty string.</param>
    /// <returns>The converted text, trimmed, without blank lines.</returns>
    public static string HtmlToMarkdown(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        // Drop opening paragraph tags before conversion. \b keeps <pre>, <param>, ... intact.
        string markdown = Regex.Replace(html, @"<p\b[^>]*>", "", TagOptions);

        var converter = new Converter();
        markdown = converter.Convert(markdown);

        // Closing paragraph tags become paragraph breaks.
        markdown = Regex.Replace(markdown, @"</p\s*>", "\n\n", TagOptions);

        // Bold and italic. \b prevents <em> from matching <embed>.
        markdown = Regex.Replace(markdown, @"<strong\b[^>]*>(.*?)</strong\s*>", "**$1**", TagOptions);
        markdown = Regex.Replace(markdown, @"<em\b[^>]*>(.*?)</em\s*>", "_$1_", TagOptions);

        // Lists: drop the containers and put every item on its own line. \b prevents <li> from matching <link>.
        markdown = Regex.Replace(markdown, @"</?(?:ul|ol)\b[^>]*>", "", TagOptions);
        markdown = Regex.Replace(markdown, @"<li\b[^>]*>(.*?)</li\s*>", "- $1\n", TagOptions);

        // Headings: emit as many '#' as the heading level, followed by a space and the heading text.
        markdown = Regex.Replace(
            markdown,
            @"<h([1-6])\b[^>]*>(.*?)</h\1\s*>",
            m => new string('#', m.Groups[1].Value[0] - '0') + " " + m.Groups[2].Value.Trim() + "\n",
            TagOptions);

        // Remove every other HTML tag.
        markdown = Regex.Replace(markdown, @"<[^>]*>", "", RegexOptions.IgnoreCase);

        // Decode all HTML entities in a single pass (so "&amp;lt;" becomes "&lt;", not "<"), then turn
        // the non-breaking spaces produced by "&nbsp;" into ordinary spaces.
        markdown = WebUtility.HtmlDecode(markdown).Replace('\u00A0', ' ');

        // Remove blank lines.
        markdown = Regex.Replace(markdown, @"^\s*\n", "", RegexOptions.Multiline);
        return markdown.Trim();
    }
}
}
调用方法:
using System.IO;
using System.Text;
using System.Windows;
/// <summary>
/// Converts every Excel workbook found directly inside ./excel by calling
/// <c>ExcelToTxt.ConvertToTxt</c> on it. When the folder does not exist yet it is
/// created and nothing is converted.
/// </summary>
public void Main()
{
    // Register the code-page encoding provider before any workbook is read.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    ExcelToTxt excelToTxt = new ExcelToTxt();
    string directoryPath = "./excel"; // relative path: the "excel" folder under the current working directory
    string[] excelExtensions = { ".xls", ".xlsx" };

    // First run: create the input folder and stop, there is nothing to convert yet.
    if (!Directory.Exists(directoryPath))
    {
        Directory.CreateDirectory(directoryPath);
        return;
    }

    // Keep only files whose extension is exactly .xls or .xlsx (case-insensitive).
    // An exact match is required: a substring test against ".xls|.xlsx" would also accept
    // files without an extension and partial extensions such as ".xl".
    string[] excelFiles = Directory.GetFiles(directoryPath, "*.*", SearchOption.TopDirectoryOnly)
        .Where(f => excelExtensions.Contains(Path.GetExtension(f), System.StringComparer.OrdinalIgnoreCase))
        .ToArray();

    foreach (string excelFile in excelFiles)
    {
        excelToTxt.ConvertToTxt(excelFile);
    }
}
由于导出的文章内容是 HTML 格式,程序会先将 HTML 转换为 Markdown 以减少文字量,再去除残留的无用标记符号和多余的换行。这样得到的 txt 文件就可以直接放入知识库中进行转化了。可以试试小站右侧的 AI 小助手,我就是这样做的。
脚注
- 下载链接:
分享名称:WP All Export Pro
分享链接:https://kb.itpno.com/#s/-b0tgO9w
访问密码:iTPno.
↩︎