diff options
Diffstat (limited to 'plugin/media-ebook-plugin.cpp')
-rw-r--r-- | plugin/media-ebook-plugin.cpp | 334 |
1 files changed, 334 insertions, 0 deletions
diff --git a/plugin/media-ebook-plugin.cpp b/plugin/media-ebook-plugin.cpp new file mode 100644 index 0000000..49ea76b --- /dev/null +++ b/plugin/media-ebook-plugin.cpp @@ -0,0 +1,334 @@ +/* + * libmedia-service + * + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +#include <podofo/podofo.h> +#include <stack> +#include <memory> +#include <dlog.h> +#include <glib.h> +#include <stdbool.h> + +#include <zip.h> +#include <libxml/xmlmemory.h> +#include <libxml/parser.h> +#include <libxml/HTMLparser.h> + +#include <regex> + +#ifdef LOG_TAG +#undef LOG_TAG +#endif + +#define LOG_TAG "MEDIA_SERVICE" + +class TextFinderInterface +{ +public: + virtual ~TextFinderInterface() = default; + virtual bool find(const char *keyword) = 0; +}; + +/*---------------- PDF -----------------------*/ + +class PdfTextFinder : public TextFinderInterface +{ +public: + explicit PdfTextFinder(const char *path); + bool find(const char *keyword) override; + +private: + std::string parseTextFromPage(unsigned int index); + bool match(std::string& text, const char *keyword); + bool loaded {}; + + PoDoFo::PdfMemDocument pdf {}; +}; + +PdfTextFinder::PdfTextFinder(const char *path) +{ + if (!path) { + LOGE("invalid path"); + return; + } + + LOGD("%s", path); + + try { + pdf.Load(path); + loaded = true; + } catch (const PoDoFo::PdfError& e) { + LOGE("Initialization failed : %s", e.what()); + } +} + +bool PdfTextFinder::find(const char *keyword) +{ + if (!loaded) + return false; + + if (!keyword) { + LOGE("Invalid keyword"); + return false; + } + + for (int n = 0; n < pdf.GetPageCount(); ++n) { + auto text = parseTextFromPage(n); + if (match(text, keyword)) + return true; + } + + return false; +} + +std::string PdfTextFinder::parseTextFromPage(unsigned int index) +{ + std::string fullText; + + PoDoFo::EPdfContentsType type; + PoDoFo::PdfVariant var; + PoDoFo::PdfFont *cur_font = NULL; + bool text_block = false; + const char *tok; + std::stack<PoDoFo::PdfVariant> stack; + PoDoFo::PdfString unicode; + PoDoFo::PdfArray array; + + PoDoFo::PdfPage* page = pdf.GetPage(index); + if (!page) + return fullText; + + PoDoFo::PdfContentsTokenizer tokenizer(page); + + while (tokenizer.ReadNext(type, tok, var)) { + if (type != PoDoFo::ePdfContentsType_Keyword) { + if (text_block) + stack.push(var); + + continue; + } + + if (!text_block && strcmp(tok, "BT") == 0) { + text_block = true; + continue; + } else if (text_block && strcmp(tok, "ET") == 0) { + text_block = false; + } + + if (!text_block) + continue; + + if (strcmp(tok, "Tf") == 0) { + if (stack.size() < 2) { + cur_font = NULL; + continue; + } + + stack.pop(); + cur_font = pdf.GetFont(page->GetFromResources(PoDoFo::PdfName("Font"), stack.top().GetName())); + } else if (strcmp(tok, "Tj") == 0 || strcmp(tok, "'") == 0 || strcmp(tok, "\"") == 0) { + if (stack.empty()) + continue; + + if (!cur_font || !cur_font->GetEncoding()) + continue; + + unicode = cur_font->GetEncoding()->ConvertToUnicode(stack.top().GetString(), cur_font); + fullText += unicode.GetStringUtf8(); + + stack.pop(); + } else if (strcmp(tok, "TJ") == 0) { + if (stack.empty()) + continue; + + array = stack.top().GetArray(); + stack.pop(); + + for (int i = 0; i < static_cast<int>(array.GetSize()); i++) { + if (array[i].IsString() || array[i].IsHexString()) { + if (!cur_font || !cur_font->GetEncoding()) + continue; + + unicode = cur_font->GetEncoding()->ConvertToUnicode(array[i].GetString(), cur_font); + fullText += unicode.GetStringUtf8(); + } + } + } + } + + return fullText; +} + +/* ToDo : match can be passed to EbookText */ +bool PdfTextFinder::match(std::string& text, const char *keyword) +{ + if (!keyword) + return false; + + if (text.empty()) + return false; + + std::regex re(keyword, std::regex::icase); + + if (std::regex_search(text, re)) { + LOGD("Found [%s]", keyword); + return true; + } + + return false; +} + +/*---------------- EPUB -----------------------*/ + +class EpubTextFinder : public TextFinderInterface +{ +public: + explicit EpubTextFinder(const char *path); + bool find(const char *keyword) override; + + ~EpubTextFinder() override; + +private: + bool match(const char *text, const char *keyword); + bool htmlNodeFindRecursive(xmlNodePtr node, const char *keyword); + bool htmlFind(const char *html_buf, int buf_size, const char *keyword); + + zip_t *z{}; +}; + +EpubTextFinder::EpubTextFinder(const char *path) +{ + if (!path) { + LOGE("invalid path"); + return; + } + + LOGD("%s", path); + + int err = 0; + z = zip_open(path, ZIP_RDONLY, &err); + if (err != 0) + LOGE("zip_open failed"); +} + +EpubTextFinder::~EpubTextFinder() +{ + if (!z) + return; + + zip_close(z); + z = nullptr; +} + +bool EpubTextFinder::find(const char *keyword) +{ + zip_stat_t sb = {0, }; + + if (!keyword) { + LOGE("Invalid keyword"); + return false; + } + + int entry_len = zip_get_num_entries(z, ZIP_FL_UNCHANGED); + for (int i = 0; i < entry_len; i++) { + if (!g_str_has_suffix(zip_get_name(z, i, ZIP_FL_ENC_GUESS), "html")) + continue; + + if (zip_stat_index(z, i, 0, &sb) != 0) + continue; + + zip_file_t *file = zip_fopen_index(z, i, 0); + if (!file) + continue; + + std::vector<char> file_buf(sb.size); + + zip_int64_t readn = zip_fread(file, file_buf.data(), sb.size); + zip_fclose(file); + + if ((readn == static_cast<zip_int64_t>(sb.size)) && + htmlFind(file_buf.data(), sb.size, keyword)) + return true; + } + + return false; +} + + +bool EpubTextFinder::match(const char *text, const char *keyword) +{ + if (!keyword) + return false; + + if (!text) + return false; + + std::regex re(keyword, std::regex::icase); + + if (std::regex_search(text, re)) { + LOGD("Found [%s]", keyword); + return true; + } + + return false; +} + +bool EpubTextFinder::htmlNodeFindRecursive(xmlNodePtr node, const char *keyword) +{ + for (xmlNodePtr cur = node; cur; cur = cur->next) { + if (cur->type == XML_TEXT_NODE && match((const char *)cur->content, keyword)) + return true; + + if (htmlNodeFindRecursive(cur->children, keyword)) + return true; + } + + return false; +} + +bool EpubTextFinder::htmlFind(const char *html_buf, int buf_size, const char *keyword) +{ + htmlDocPtr doc = htmlReadMemory(html_buf, buf_size, "/", NULL, + HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); + + if (!doc) { + LOGE("htmlReadMemory failed"); + return false; + } + + bool result = htmlNodeFindRecursive(xmlDocGetRootElement(doc), keyword); + + xmlFreeDoc(doc); + + return result; +} + +extern "C" bool media_svc_pdf_is_keyword_included(const char *path, const char *keyword) +{ + /* ToDo: factory pattern */ + std::unique_ptr<TextFinderInterface> ebookText = std::make_unique<PdfTextFinder>(path); + + return ebookText->find(keyword); +} + +extern "C" bool media_svc_epub_is_keyword_included(const char *path, const char *keyword) +{ + /* ToDo: factory pattern */ + std::unique_ptr<TextFinderInterface> ebookText = std::make_unique<EpubTextFinder>(path); + + return ebookText->find(keyword); +} |