summary | refs | log | tree | commit | diff
path: root/plugin/media-ebook-plugin.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'plugin/media-ebook-plugin.cpp')
-rw-r--r-- plugin/media-ebook-plugin.cpp | 334
1 files changed, 334 insertions, 0 deletions
diff --git a/plugin/media-ebook-plugin.cpp b/plugin/media-ebook-plugin.cpp
new file mode 100644
index 0000000..49ea76b
--- /dev/null
+++ b/plugin/media-ebook-plugin.cpp
@@ -0,0 +1,334 @@
+/*
+ * libmedia-service
+ *
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+#include <exception>
+#include <memory>
+#include <regex>
+#include <stack>
+
+#include <podofo/podofo.h>
+#include <dlog.h>
+#include <glib.h>
+#include <stdbool.h>
+
+#include <zip.h>
+#include <libxml/xmlmemory.h>
+#include <libxml/parser.h>
+#include <libxml/HTMLparser.h>
+
+/*
+ * Drop any LOG_TAG that an earlier header may already have defined and
+ * replace it with this file's own tag.
+ * NOTE(review): presumably consumed by the dlog LOGD()/LOGE() macros used
+ * below - confirm against dlog.h.
+ */
+#ifdef LOG_TAG
+#undef LOG_TAG
+#endif
+
+#define LOG_TAG "MEDIA_SERVICE"
+
+/*
+ * Abstract keyword-search interface implemented by the e-book text finders
+ * in this file.
+ *
+ * Finders are owned and destroyed through a pointer to this base class,
+ * which is why the destructor is virtual.
+ */
+class TextFinderInterface
+{
+public:
+	/* Returns true when the implementation finds @keyword in its document. */
+	virtual bool find(const char *keyword) = 0;
+
+	virtual ~TextFinderInterface() = default;
+};
+
+/*---------------- PDF -----------------------*/
+
+/*
+ * Keyword finder for PDF documents, backed by PoDoFo.
+ *
+ * The constructor loads the document once; find() then extracts the text of
+ * each page in turn and tests it against the keyword.
+ */
+class PdfTextFinder : public TextFinderInterface
+{
+public:
+	explicit PdfTextFinder(const char *path);
+
+	bool find(const char *keyword) override;
+
+private:
+	bool match(std::string& text, const char *keyword);
+	std::string parseTextFromPage(unsigned int index);
+
+	/* Becomes true only once pdf.Load() has succeeded; find() bails out otherwise. */
+	bool loaded = false;
+
+	PoDoFo::PdfMemDocument pdf {};
+};
+
+/*
+ * Loads the PDF located at @path.
+ *
+ * A null path or a PoDoFo::PdfError raised by the loader is logged and
+ * leaves the finder in the "not loaded" state, in which find() always
+ * returns false.
+ */
+PdfTextFinder::PdfTextFinder(const char *path)
+{
+	if (path == nullptr) {
+		LOGE("invalid path");
+		return;
+	}
+
+	LOGD("%s", path);
+
+	try {
+		pdf.Load(path);
+	} catch (const PoDoFo::PdfError& e) {
+		LOGE("Initialization failed : %s", e.what());
+		return;
+	}
+
+	/* Only reached when Load() did not throw. */
+	loaded = true;
+}
+
+/*
+ * Searches every page of the loaded document for @keyword.
+ *
+ * Returns true as soon as one page matches, false when no page matches,
+ * when the document was not loaded, or when @keyword is null.
+ *
+ * A PoDoFo::PdfError raised while extracting the text of one page is logged
+ * and that page is skipped, so a single damaged page neither aborts the
+ * search of the remaining pages nor lets the exception escape to the caller.
+ */
+bool PdfTextFinder::find(const char *keyword)
+{
+	if (!loaded)
+		return false;
+
+	if (!keyword) {
+		LOGE("Invalid keyword");
+		return false;
+	}
+
+	for (int n = 0; n < pdf.GetPageCount(); ++n) {
+		try {
+			auto text = parseTextFromPage(n);
+			if (match(text, keyword))
+				return true;
+		} catch (const PoDoFo::PdfError& e) {
+			LOGE("Failed to parse page [%d] : %s", n, e.what());
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Extracts the text shown on page @index and returns it as one UTF-8 string.
+ *
+ * The page content stream is tokenized; inside BT ... ET text blocks the
+ * operands of each operator are collected on a stack and the operators
+ * Tf (select font), Tj / ' / " (show string) and TJ (show array of strings)
+ * are interpreted. Strings are decoded through the encoding of the font
+ * selected by the most recent Tf; strings shown while no usable font is
+ * selected are skipped. The decoded pieces are concatenated without any
+ * separator.
+ *
+ * Returns an empty string when the page cannot be obtained.
+ */
+std::string PdfTextFinder::parseTextFromPage(unsigned int index)
+{
+	std::string fullText;
+
+	PoDoFo::PdfPage *page = pdf.GetPage(index);
+	if (!page)
+		return fullText;
+
+	PoDoFo::EPdfContentsType type;
+	PoDoFo::PdfVariant var;
+	const char *tok = nullptr;
+	PoDoFo::PdfFont *cur_font = nullptr;
+	bool text_block = false;
+	std::stack<PoDoFo::PdfVariant> stack;
+
+	/* Decodes one string operand with the current font and appends it to the result. */
+	auto appendText = [&fullText, &cur_font](const PoDoFo::PdfVariant& operand) {
+		/* Operands of an unexpected type are ignored instead of calling GetString() on them. */
+		if (!operand.IsString() && !operand.IsHexString())
+			return;
+
+		if (!cur_font || !cur_font->GetEncoding())
+			return;
+
+		PoDoFo::PdfString unicode = cur_font->GetEncoding()->ConvertToUnicode(operand.GetString(), cur_font);
+		fullText += unicode.GetStringUtf8();
+	};
+
+	PoDoFo::PdfContentsTokenizer tokenizer(page);
+
+	while (tokenizer.ReadNext(type, tok, var)) {
+		if (type != PoDoFo::ePdfContentsType_Keyword) {
+			/* Operand: only relevant while inside a text block. */
+			if (text_block)
+				stack.push(var);
+
+			continue;
+		}
+
+		if (!text_block) {
+			if (strcmp(tok, "BT") == 0)
+				text_block = true;
+
+			continue;
+		}
+
+		if (strcmp(tok, "ET") == 0) {
+			text_block = false;
+		} else if (strcmp(tok, "Tf") == 0) {
+			/* Tf takes <font name> <size>; anything else leaves no font selected. */
+			cur_font = nullptr;
+
+			if (stack.size() >= 2) {
+				stack.pop();
+
+				if (stack.top().IsName()) {
+					PoDoFo::PdfObject *font_obj = page->GetFromResources(PoDoFo::PdfName("Font"), stack.top().GetName());
+
+					/* The resource lookup may fail; never hand a null object to GetFont(). */
+					if (font_obj)
+						cur_font = pdf.GetFont(font_obj);
+				}
+			}
+		} else if (strcmp(tok, "Tj") == 0 || strcmp(tok, "'") == 0 || strcmp(tok, "\"") == 0) {
+			/* The string to show is always the last operand. */
+			if (!stack.empty())
+				appendText(stack.top());
+		} else if (strcmp(tok, "TJ") == 0) {
+			if (!stack.empty() && stack.top().IsArray()) {
+				const auto& array = stack.top().GetArray();
+
+				/* Non-string elements are skipped by appendText(). */
+				for (size_t i = 0; i < array.GetSize(); i++)
+					appendText(array[i]);
+			}
+		}
+
+		/*
+		 * Operands belong to the operator that has just been handled; drop
+		 * them so the stack does not grow over the whole page and the
+		 * arity check for Tf only sees its own operands.
+		 */
+		stack = std::stack<PoDoFo::PdfVariant>();
+	}
+
+	return fullText;
+}
+
+/*
+ * ToDo : match can be passed to EbookText
+ *
+ * Tests whether @text contains @keyword.
+ *
+ * @keyword is compiled as a case-insensitive std::regex, so regular
+ * expression metacharacters in it are interpreted as such and an ill-formed
+ * pattern makes the std::regex constructor throw std::regex_error.
+ * Returns false for a null keyword or an empty text.
+ */
+bool PdfTextFinder::match(std::string& text, const char *keyword)
+{
+	if (!keyword || text.empty())
+		return false;
+
+	const std::regex pattern(keyword, std::regex::icase);
+	const bool found = std::regex_search(text, pattern);
+
+	if (found)
+		LOGD("Found [%s]", keyword);
+
+	return found;
+}
+
+/*---------------- EPUB -----------------------*/
+
+/*
+ * Keyword finder for EPUB documents.
+ *
+ * The EPUB is opened as a zip archive in the constructor and closed in the
+ * destructor; find() parses every archive entry whose name ends in "html"
+ * and searches its text nodes.
+ *
+ * The class owns the raw archive handle, so copying is disabled: a copy
+ * would close the same handle twice.
+ */
+class EpubTextFinder : public TextFinderInterface
+{
+public:
+	explicit EpubTextFinder(const char *path);
+	~EpubTextFinder() override;
+
+	EpubTextFinder(const EpubTextFinder&) = delete;
+	EpubTextFinder& operator=(const EpubTextFinder&) = delete;
+
+	bool find(const char *keyword) override;
+
+private:
+	bool match(const char *text, const char *keyword);
+	bool htmlNodeFindRecursive(xmlNodePtr node, const char *keyword);
+	bool htmlFind(const char *html_buf, int buf_size, const char *keyword);
+
+	/* Archive handle assigned from zip_open(); stays null when the path was null. */
+	zip_t *z{};
+};
+
+/*
+ * Opens the EPUB located at @path as a read-only zip archive.
+ *
+ * A null path is logged and leaves the archive handle null. An error code
+ * reported by zip_open() is logged as well.
+ */
+EpubTextFinder::EpubTextFinder(const char *path)
+{
+	if (path == nullptr) {
+		LOGE("invalid path");
+		return;
+	}
+
+	LOGD("%s", path);
+
+	int zip_err = 0;
+
+	z = zip_open(path, ZIP_RDONLY, &zip_err);
+	if (zip_err == 0)
+		return;
+
+	LOGE("zip_open failed");
+}
+
+/* Closes the archive handle, if one is held, and resets it to null. */
+EpubTextFinder::~EpubTextFinder()
+{
+	if (z != nullptr) {
+		zip_close(z);
+		z = nullptr;
+	}
+}
+
+/*
+ * Searches the archive for @keyword.
+ *
+ * Every entry whose name ends in "html" is read completely into memory and
+ * handed to htmlFind(). Entries that cannot be stat'ed, opened or read in
+ * full are skipped, as are empty entries and entries whose size does not
+ * fit the int length accepted by htmlFind().
+ *
+ * Returns true as soon as one entry contains the keyword; false when none
+ * does, when @keyword is null, or when no archive is open.
+ */
+bool EpubTextFinder::find(const char *keyword)
+{
+	if (!keyword) {
+		LOGE("Invalid keyword");
+		return false;
+	}
+
+	/* No archive handle: nothing to search. */
+	if (!z)
+		return false;
+
+	const zip_int64_t entry_len = zip_get_num_entries(z, ZIP_FL_UNCHANGED);
+
+	for (zip_int64_t i = 0; i < entry_len; i++) {
+		const char *name = zip_get_name(z, i, ZIP_FL_ENC_GUESS);
+		if (!name || !g_str_has_suffix(name, "html"))
+			continue;
+
+		zip_stat_t sb;
+		zip_stat_init(&sb);
+
+		if (zip_stat_index(z, i, 0, &sb) != 0)
+			continue;
+
+		/* sb.size is only meaningful when flagged valid, and must fit the int taken by htmlFind(). */
+		if (!(sb.valid & ZIP_STAT_SIZE) || sb.size == 0 || sb.size > static_cast<zip_uint64_t>(G_MAXINT))
+			continue;
+
+		/* Allocate before opening the entry so that a failed allocation cannot leak the file handle. */
+		std::vector<char> file_buf(sb.size);
+
+		zip_file_t *file = zip_fopen_index(z, i, 0);
+		if (!file)
+			continue;
+
+		const zip_int64_t readn = zip_fread(file, file_buf.data(), sb.size);
+		zip_fclose(file);
+
+		if ((readn == static_cast<zip_int64_t>(sb.size)) &&
+			htmlFind(file_buf.data(), static_cast<int>(sb.size), keyword))
+			return true;
+	}
+
+	return false;
+}
+
+
+/*
+ * Tests whether the C string @text contains @keyword.
+ *
+ * @keyword is compiled as a case-insensitive std::regex, so regular
+ * expression metacharacters in it are interpreted as such and an ill-formed
+ * pattern makes the std::regex constructor throw std::regex_error.
+ * Returns false when either argument is null.
+ */
+bool EpubTextFinder::match(const char *text, const char *keyword)
+{
+	if (!keyword || !text)
+		return false;
+
+	const std::regex pattern(keyword, std::regex::icase);
+	const bool found = std::regex_search(text, pattern);
+
+	if (found)
+		LOGD("Found [%s]", keyword);
+
+	return found;
+}
+
+/*
+ * Depth-first search for @keyword in @node, its following siblings and all
+ * of their descendants.
+ *
+ * For each sibling the node's own content is tested first (text nodes only),
+ * then its children are searched recursively. Returns true on the first
+ * match, false when the whole chain was visited without one (including a
+ * null @node).
+ */
+bool EpubTextFinder::htmlNodeFindRecursive(xmlNodePtr node, const char *keyword)
+{
+	xmlNodePtr sibling = node;
+
+	while (sibling) {
+		const bool is_text = (sibling->type == XML_TEXT_NODE);
+
+		if (is_text && match(reinterpret_cast<const char *>(sibling->content), keyword))
+			return true;
+
+		if (htmlNodeFindRecursive(sibling->children, keyword))
+			return true;
+
+		sibling = sibling->next;
+	}
+
+	return false;
+}
+
+/*
+ * Parses @buf_size bytes at @html_buf as HTML and searches the resulting
+ * tree for @keyword.
+ *
+ * The parsed document is owned by a unique_ptr so that it is released on
+ * every exit path, including an exception thrown during the search (for
+ * example std::regex_error from match()).
+ *
+ * Returns true when a text node matches; false when nothing matches or the
+ * buffer cannot be parsed.
+ */
+bool EpubTextFinder::htmlFind(const char *html_buf, int buf_size, const char *keyword)
+{
+	std::unique_ptr<xmlDoc, decltype(&xmlFreeDoc)> doc(
+		htmlReadMemory(html_buf, buf_size, "/", NULL,
+			HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET),
+		xmlFreeDoc);
+
+	if (!doc) {
+		LOGE("htmlReadMemory failed");
+		return false;
+	}
+
+	return htmlNodeFindRecursive(xmlDocGetRootElement(doc.get()), keyword);
+}
+
+/*
+ * C entry point: reports whether the PDF at @path contains @keyword.
+ *
+ * Returns true when the keyword is found, false otherwise. No C++ exception
+ * is allowed to cross this C boundary: anything thrown while loading or
+ * searching (allocation failure, an invalid regular expression built from
+ * @keyword, ...) is logged and reported as "not found".
+ */
+extern "C" bool media_svc_pdf_is_keyword_included(const char *path, const char *keyword)
+{
+	try {
+		/* ToDo: factory pattern */
+		std::unique_ptr<TextFinderInterface> ebookText = std::make_unique<PdfTextFinder>(path);
+
+		return ebookText->find(keyword);
+	} catch (const std::exception& e) {
+		LOGE("Keyword search failed : %s", e.what());
+	} catch (...) {
+		LOGE("Keyword search failed : unknown exception");
+	}
+
+	return false;
+}
+
+/*
+ * C entry point: reports whether the EPUB at @path contains @keyword.
+ *
+ * Returns true when the keyword is found, false otherwise. No C++ exception
+ * is allowed to cross this C boundary: anything thrown while opening or
+ * searching (allocation failure, an invalid regular expression built from
+ * @keyword, ...) is logged and reported as "not found".
+ */
+extern "C" bool media_svc_epub_is_keyword_included(const char *path, const char *keyword)
+{
+	try {
+		/* ToDo: factory pattern */
+		std::unique_ptr<TextFinderInterface> ebookText = std::make_unique<EpubTextFinder>(path);
+
+		return ebookText->find(keyword);
+	} catch (const std::exception& e) {
+		LOGE("Keyword search failed : %s", e.what());
+	} catch (...) {
+		LOGE("Keyword search failed : unknown exception");
+	}
+
+	return false;
+}