整理PDF电子书的方法谈

拿到一本非常好的电子书的时候，如果苦于该PDF里文字都是以图片形式保存的，没法复制粘贴的话，可以按照以下的步骤做成电子书。我整理《正宗马礼堂养气功》一书，就是这样的：

1. 把PDF转换成JPG格式

使用pdf2jpg 这个软件（美国人写的，功能短小精悍，非常棒。速度很快）

2. 使用OCR（光学字符识别）软件进行识别

汉王的几个快捷方式：
自动倾斜校正：按下“Ctrl+D”进行自动倾斜校正。
手动倾斜校正：按下“Ctrl+M”进行手动倾斜校正。
版面分析：按下“F5”键，对选中的文件进行版面分析。
版面识别：按下“F8”键，对选中的文件进行识别。
撤销：按下“Ctrl+Z”键取消上一步操作。
向后找可疑字：按下“Ctrl+Tab”键。
向前找可疑字：按下“Shift+Tab”键。

3. 手工校对。

真的很麻烦。首先需要把产生的碎片txt文件合并成一个大的。（我使用了ruby程序）
然后在打印机上打印出来，人工校对。

只能通过阅读是否流畅来校对，不可能一个字一个字地对。

4. 把得到的TXT文件重新排版，整理成word或者pdf文件。

我是通过整理成 docbook，然后通过CDBE，生成了pdf, html 不同格式的文件的。

就这样，《正宗马礼堂养气功》，360页，耗时近40天，完成。

ps. 附：
ruby 代码：

require 'test/unit'
require 'text_formatter'
require 'logger'
require 'find'

# Unit tests for TextFormatter, plus the helper routines used to merge the
# numbered fragment files (e.g. "60.TXT") into one text and to strip the
# page-end markers from the merged files again.
#
# Methods named not_test_* are one-off batch jobs; rename one to test_* if
# you want the test runner to execute it.
class TextFormatterTest < Test::Unit::TestCase
  # Rotating log file: keep 5 old files, rotate at roughly 1000 KB.
  @@log = Logger.new("D:\\log.txt", 5, 1000*1024)
  # @@log = Logger.new("book.txt")
  @@log.level = Logger::INFO

  # Directory holding the original fragment files.
  DIR = "book_of_malitang_origin"
  # Directory holding the merged files that still carry page-end markers.
  DIR_OF_PRINT = "book_of_malitang_for_print"
  # Directory holding the files that get converted to XML.
  DIR_OF_NEW = "new"

  # Accumulator shared by the merge / marker-stripping helpers below.
  @@text = ""

  # Matches one page-end marker such as "<!-原书第    1页结束->".
  # Non-greedy, so two markers on one line are removed separately and the
  # text between them is kept.
  PAGE_MARK_REGEXP = Regexp.new("<!-.*?->")

  # Matches a string that consists of decimal digits only.
  PAGE_NUMBER_REGEXP = /\A\d+\z/

  FILE_1 = "1.前言与概论.txt"
  FILE_2 = "2.六字诀.txt"
  FILE_3 = "3.洗髓金经.txt"
  FILE_4 = "4.站坐卧功.txt"
  FILE_5 = "5.问答.txt"
  FILE_5_TEMP = "5.问答.txt.temp"

  FILE_6 = "6.痊愈病例.txt"
  FILE_7 = "附来自网络的痊愈病例.txt"
  FILE_8 = "附马礼堂事迹3.txt"

  # Runs the number-masking pass over "temp.txt" in the working directory.
  def test_substitute_qq_number_with_wenhao_in_a_file
    file = "temp.txt"
    TextFormatter.substitute_qq_number_with_wenhao_in_a_file(file)
  end

  # The last two digits of a 5..12 digit number are replaced by "??".
  def test_substitute_qq_number_with_wenhao
    assert_equal "5100452??", TextFormatter.substitute_qq_number_with_wenhao("510045240")
    assert_equal "1234567890??", TextFormatter.substitute_qq_number_with_wenhao("123456789012")
    assert_equal "123??", TextFormatter.substitute_qq_number_with_wenhao("12345")
  end

  def test_regexp
    question = "    1．问：什么是气功？"
    assert TextFormatter.a_question?(question)
    answer = "    答：元气又称“原气”、“真气”。它是由精化生、随着生命而来的，"
    assert !TextFormatter.a_question?(answer)
  end

  def test_add_sect1_marks_to_a_line
    line = "    1．问：什么是气功？"
    formatter = TextFormatter.new
    formatter.text = ""
    formatter.add_sect1_marks_to_a_line(line)
    assert_equal "</sect1><sect1><title>问：什么是气功？</title>", formatter.text
  end

  def test_add_para_marks_to_a_line
    formatter = TextFormatter.new

    # A long line is wrapped in <para>...</para>.
    formatter.text = ""
    line = "    汤饵针灸不能收效的胃下垂患者，通过练养气功腹式呼。"
    formatter.add_para_marks_to_a_line(line)
    assert_equal "<para>" + line + "</para>", formatter.text

    # A short line (a heading) is appended unchanged.
    line = "第一章六字诀"
    formatter.text = ""
    formatter.add_para_marks_to_a_line(line)
    assert_equal line, formatter.text
    formatter.text = ""
  end

  # change the FILE_X to process some file.
  def not_test_add_marks_to_a_file
    TextFormatter.add_para_marks_to_a_file(DIR_OF_NEW + File::SEPARATOR + FILE_7)
  end

  def not_test_add_sect1_marks_to_a_file
    TextFormatter.add_sect1_title_and_para_marks_to_a_file(
      DIR_OF_NEW + File::SEPARATOR + FILE_5)
  end

  # remove all the marks such as :" <!-原书第    1页结束->
  def not_test_remove_all_the_marks_of_original_book
    Find.find(DIR_OF_PRINT) do |f|
      remove_all_the_marks_of_one_file("output", f)
    end
  end

  def not_test_remove_all_the_marks_of_one_file
    #     file= "2.六字诀.txt"
    #    file= "1.前言与概论.txt"
    #file="3.洗髓金经.txt"
    #file="4.站坐卧功.txt"
    #    file="5.问答.txt"
    file = "6.痊愈病例.txt"
    remove_all_the_marks_of_one_file("output", DIR_OF_PRINT + File::SEPARATOR + file)
  end

  # Strips the page-end markers from file f and writes the result to
  # output_dir under the same base name. Paths that do not contain "txt"
  # are ignored; directories are skipped as well, because Find.find also
  # yields directories and opening one for reading would fail.
  def remove_all_the_marks_of_one_file(output_dir, f)
    @@log.debug "processing file: #{f}..."
    if f.include?("txt") && !File.directory?(f)
      substitute_the_marks(f)
      TextFormatter.write_to_file(output_dir + File::SEPARATOR + File.basename(f), @@text)
      @@text = ""
    end
  end

  def test_write_to_file
    TextFormatter.write_to_file("1.test.txt", "hahahaha")
  end

  def not_test_substitute_the_marks
    substitute_the_marks(DIR_OF_PRINT + File::SEPARATOR + "1.前言与概论.txt")
    @@log.info @@text
  end

  # rename this method name to "test_..." if you want to make it work.
  def not_test_read_head_part
    files_part1 = ["9-14(15~20略).TXT", "33-39(21~32略).TXT", "40-43.txt", "44-47.txt", "48-55.TXT",
                   "56-57概述结束.TXT", "58-59.TXT"]
    # From 60 up to and including 471.
    files_part2 = (60..471).map { |i| i.to_s + ".TXT" }
    read_from_files(files_part1 + files_part2)
  end

  # Merges every file in files_array (names relative to DIR) into @@text.
  # A failing file does not stop the run: a missing file is reported as
  # such, any other error is logged with its class and message.
  def read_from_files(files_array)
    files_array.each do |name|
      begin
        read_from_a_txt_file(name)
      rescue Errno::ENOENT
        @@log.warn "#{name} not exists..."
      rescue StandardError => e
        @@log.warn "#{name} failed: #{e.class}: #{e.message}"
      end
    end
    @@log.info "result:" + @@text
  end

  # Feeds every line of DIR/file_name to process_line.
  # file_name ,e.g: 100.TXT
  def read_from_a_txt_file(file_name)
    path = DIR + "/" + file_name
    File.open(path) do |txt_file|
      while line = txt_file.gets
        @@log.debug "original: -->|" + line + "|<--end"
        process_line(line)
      end
    end
    puts "file #{file_name} processed ."
  end

  # Appends the content of file_name to @@text with every page-end marker
  # ("<!-...->") removed.
  def substitute_the_marks(file_name)
    File.open(file_name) do |txt_file|
      while line = txt_file.gets
        @@log.debug line
        cleaned = line.gsub(PAGE_MARK_REGEXP) do |mark|
          @@log.info mark
          ""
        end
        @@text << cleaned
      end
    end
  end

  # FIXED: lines starting with "第X章第X节" (chapter/section) were not handled.
  # Appends one raw line to @@text:
  # * blank lines are dropped,
  # * a line that continues the previous one is appended without a break,
  # * a line holding only a number becomes a page-end marker,
  # * anything else starts a new line (a new paragraph).
  def process_line(line)
    return if blank_line?(line)

    # Drop the trailing line separator first.
    content = line.chomp
    if content_that_follows_the_previous_line?(content)
      @@text << content
    elsif page_number?(content)
      @@text << "<!-原书第" + content + "页结束->"
    else
      @@text << "\n" + content
    end
  end

  def test_page_number?
    assert page_number?("   333")
    assert page_number?("333   ")
    assert page_number?("08")
    assert !page_number?(" aaa")
    assert !page_number?("")
    assert !page_number?("0x1F")
  end

  # True when the line, ignoring surrounding whitespace, consists of decimal
  # digits only. A regexp is used instead of Integer() because Integer()
  # rejects "08"/"09" (invalid octal) and accepts "0x1F", "1_000" and "-5".
  def page_number?(line)
    !(line.strip =~ PAGE_NUMBER_REGEXP).nil?
  end

  # this is needed
  def test_content_that_follows_the_previous_line?
    assert !content_that_follows_the_previous_line?(" 汉字某段开头")
    assert !content_that_follows_the_previous_line?(" 63")
    assert !content_that_follows_the_previous_line?("   ")
    assert content_that_follows_the_previous_line?("继续上一句话………………")
  end

  # True when content is neither blank nor a page number and does not start
  # with a space (a leading space marks the start of a new paragraph).
  def content_that_follows_the_previous_line?(content)
    return false if page_number?(content)
    return false if content.index(" ") == 0
    !blank_line?(content)
  end

  def test_blank_line
    assert !blank_line?("lala")
    assert blank_line?("")
    assert !blank_line?("    lala")
    assert blank_line?("   ")
    assert blank_line?("\n")
  end

  # True when content holds nothing but whitespace.
  def blank_line?(content)
    content.strip.empty?
  end
end

代码2：

require 'logger'
require 'iconv'

# Turns plain-text book chapters into DocBook-style fragments
# (<para>, <sect1>/<title>) and masks long digit runs.
#
# All output is accumulated in the class variable @@text, which is shared
# by every instance and every class method and is never reset here;
# callers reset it through #text= when they need a clean buffer.
class TextFormatter

  # Rotating log file: keep 5 old files, rotate at roughly 1000 KB.
  @@log = Logger.new("D:\\log.txt", 5, 1000*1024)
  @@log.level = Logger::INFO

  # Extension given to the generated files.
  XML_SUFFIX = ".xml"

  # Shared output buffer.
  @@text = ""

  # Lines whose length does not exceed this value are treated as headings
  # and are not wrapped in <para>.
  TOPIC_STRING_LENGTH = 24

  # A run of 5 to 12 digits.
  QQ_NUMBER_REGEXP = Regexp.new('\d{5,12}')

  # Marks a question line, e.g. '1．问：什么是气功？'
  QUESTION_REGEXP = Regexp.new('问：')

  # Returns the shared output buffer.
  def text
    @@text
  end

  # Replaces the shared output buffer.
  def text=(value)
    @@text = value
  end

  # Appends every line of +file+ to the shared buffer with its digit runs
  # masked, then writes the whole buffer to "result.txt".
  def self.substitute_qq_number_with_wenhao_in_a_file(file)
    File.foreach(file) do |line|
      # Lines without a digit run come back unchanged.
      @@text << substitute_qq_number_with_wenhao(line)
    end
    write_to_file("result.txt", @@text)
  end

  # Returns a copy of +number_string+ in which the last two digits of
  # every run of 5..12 digits are replaced by "??".
  # Every run in the string is masked, not only the first one.
  def self.substitute_qq_number_with_wenhao(number_string)
    number_string.gsub(QQ_NUMBER_REGEXP) do |match|
      @@log.debug match
      match[0, match.length - 2] + "??"
    end
  end

  # Converts +file+ line by line with #add_para_marks_to_a_line and writes
  # the shared buffer to the same path with ".txt" replaced by ".xml".
  def self.add_para_marks_to_a_file(file)
    @@log.debug File.basename(file, ".*")
    TextFormatter.new.read_from_a_file(file)
    write_to_file(file.gsub(".txt", XML_SUFFIX), @@text)
  end

  # Converts a question/answer +file+ with #add_sect1_marks_to_a_line and
  # writes the shared buffer, converted from GBK to UTF-8, to the same path
  # with ".txt" replaced by ".xml".
  # NOTE(review): Iconv was removed from the standard library in Ruby 2.0;
  # on newer Rubies this needs the iconv gem or String#encode.
  def self.add_sect1_title_and_para_marks_to_a_file(file)
    TextFormatter.new.read_from_a_QA_file(file)
    write_to_file(file.gsub(".txt", XML_SUFFIX), Iconv.conv("UTF-8", "GBK", @@text))
  end

  # Feeds every line of +file+ to #add_sect1_marks_to_a_line.
  # TODO: not convert GBK to UTF encoding...
  def read_from_a_QA_file(file)
    File.foreach(file) { |line| add_sect1_marks_to_a_line(line) }
  end

  # Feeds every line of +file+ to #add_para_marks_to_a_line.
  def read_from_a_file(file)
    File.foreach(file) { |line| add_para_marks_to_a_line(line) }
  end

  # Tells whether +line+ contains '问：'.
  # e.g. : '1．问：什么是气功？'
  # Returns the position of the marker (truthy, also when it is 0) or nil.
  def self.a_question?(line)
    line.index(QUESTION_REGEXP)
  end

  # Appends +line+ to the shared buffer. A question line closes the
  # previous <sect1> and opens a new one whose title is '问：' followed by
  # everything after the first marker; any other line is handled by
  # #add_para_marks_to_a_line.
  def add_sect1_marks_to_a_line(line)
    question = QUESTION_REGEXP.match(line)
    if question
      @@log.info "a question: #{line}"
      # Only the first marker starts a section, so a second '问：' inside
      # the question text does not produce a duplicate <sect1>.
      @@text << "</sect1><sect1><title>" + "问：" + question.post_match + "</title>"
    else
      add_para_marks_to_a_line(line)
    end
  end

  # add <para>text</para> to a "text" string
  # Lines no longer than TOPIC_STRING_LENGTH are appended unchanged and
  # reported in the log.
  def add_para_marks_to_a_line(line)
    if line.length > TOPIC_STRING_LENGTH
      @@text << "<para>" + line + "</para>"
    else
      @@log.warn "not a para? #{line}"
      @@text << line
    end
  end

  # Writes +text+ (plus a trailing newline, if missing) to +file_name+,
  # replacing any existing content. The block form closes the file even
  # when writing raises.
  def self.write_to_file(file_name, text)
    File.open(file_name, "w") { |out| out.puts text }
  end

end

require 'fileutils'

include FileUtils

# Directory the generated XML files are copied from.
SRC_DIR = "D:/workspace/doc/"
# Directory the files are copied into.
DIST_DIR = "D:/obp/books/malitang/zh-cn/src/"
# Directory holding the *.jpg figures.
FIGURE_DIR = "#{SRC_DIR}new/figure/"

# Copies SRC_DIR + src_file into DIST_DIR. The copy is named after the base
# name of dist_file, which defaults to src_file (i.e. the name is kept).
def copy_xml(src_file, dist_file=src_file)
  source_path = SRC_DIR + src_file
  target_path = DIST_DIR + File.basename(dist_file)
  cp(source_path, target_path)
end

# Copies every *.jpg found directly in FIGURE_DIR into DIST_DIR + "figure/".
def copy_images
  jpg_files = Dir.glob(FIGURE_DIR + "*.jpg")
  cp_r(jpg_files, DIST_DIR + "figure/")
end

# Stage the figures and the XML sources, then run the build commands.
copy_images

# Each entry is [source below SRC_DIR, name of the copy in DIST_DIR];
# an entry with a single element keeps its own file name.
[
  ["malitang.xml"],
  ["new/1.前言与概论.xml", "1.xml"],
  ["new/2.六字诀.xml", "2.xml"],
  ["new/3.洗髓金经.xml", "3.xml"],
  ["new/4.站坐卧功.xml", "4.xml"],
  ["new/5.问答.xml", "5.xml"],
  ["new/6.痊愈病例.xml", "6.xml"],
  ["new/附 VCD六字诀中马老原音.xml", "appendix1.xml"],
  ["new/附 VCD洗髓金经马老原音.xml", "appendix2.xml"],
  ["new/附六字诀出场人物.xml", "appendix_people.xml"],
  ["new/附马礼堂的相关事迹.xml", "appendix_story.xml"],
  ["new/附来自网络的痊愈病例.xml", "appendix_cured_cases.xml"],
  ["new/附交流与讨论.xml", "appendix_discussion.xml"]
].each { |copy_args| copy_xml(*copy_args) }

# Print the time before and after the build commands so the duration can
# be read off the output.
start_time = Time.new
#system('bd_html malitang zh-cn')
#system('bd_chunk malitang zh-cn')
system('bd_fo malitang zh-cn')
system('bdj_pdf malitang zh-cn')
puts start_time
puts Time.new