if (typeof ExtractContentJS == 'undefined') { var ExtractContentJS = {}; } if (typeof ExtractContentJS.Lib == 'undefined') { ExtractContentJS.Lib = {}; } ExtractContentJS.Lib.Util = (function() { var Util = {}; Util.BenchmarkTimer = function() { var now = function() { var d = new Date(); var t = 0; t = d.getHours(); t = t * 60 + d.getMinutes(); t = t * 60 + d.getSeconds(); t = t * 1000 + d.getMilliseconds(); return t; }; var Timer = function() { var self = { elapsed: 0 }; self.reset = function() { self.elapsed = 0; return self }; self.start = function() { self.msec = now(); return self }; self.stop = function() { self.elapsed += now() - self.msec; return self; }; return self.start(); }; var self = { timers: {} }; self.get = function(name) { if (!self.timers[name]) { self.timers[name] = new Timer(); } return self.timers[name]; }; self.reset = function(name) { return self.get(name).reset(); }; self.start = function(name) { return self.get(name).start(); }; self.stop = function(name) { return self.get(name).stop(); }; return self; }; Util.Token = function(word) { var regex = { // hiragana: /[あ-んが-ぼぁ-ょゎっー]/, hiragana: /[\u3042-\u3093\u304C-\u307C\u3041-\u3087\u308E\u3063\u30FC]/, // katakana: /[ア-ンガ-ボァ-ョヮッー]/, katakana: /[\u30A2-\u30F3\u30AC-\u30DC\u30A1-\u30E7\u30EE\u30C3\u30FC]/, kanji: { test: function(w) { // return '一' <= w && w <= '龠' || w === '々'; return '\u4E00' <= w && w <= '\u9FA0' || w === '\u3005'; } }, alphabet: /[a-zA-Z]/, digit: /[0-9]/ }; var tests = function(w) { var match = {}; for (var r in regex) { if (regex[r].test(w)) { match[r] = regex[r]; } } return match; }; var self = { first: tests(word.charAt(0)), last: tests(word.charAt(word.length - 1)) }; self.isTokenized = function(prev, next) { var p = prev.length ? prev.charAt(prev.length - 1) : ''; var n = next.length ? next.charAt(0) : ''; var check = function(w, test) { if (w.length) { for (var t in test) { if (test[t].test(w)) return false; } } return true; }; return check(p, self.first) && check(n, self.last); }; return self; }; Util.inherit = function(child, parent) { var obj = child || {}; for (var prop in parent) { if (typeof obj[prop] == 'undefined') { obj[prop] = parent[prop]; } } return obj; }; Util.countMatch = function(text, regex) { return text.split(regex).length - 1; // var n=0; // for (var i=0;;) { // i = text.search(regex); // if (i < 0) break; // n++; // text = text.substr(i+1); // } // return n; }; Util.countMatchTokenized = function(text, word) { var count = 0; var prev = null; var tok = new Util.Token(word); var texts = text.split(word); var len = texts.length; for (var i = 0; i < len; i++) { if (prev && tok.isTokenized(prev, texts[i])) count++; prev = texts[i] } return count; }; Util.indexOfTokenized = function(text, word) { var index = text.indexOf(word); if (index >= 0) { var tok = new Util.Token(word); var p = index > 1 ? text.substr(index - 1, 1) : ''; var n = text.substr(index + word.length, 1); if (tok.isTokenized(p, n)) { return index; } } return -1; }; Util.dump = function(obj) { if (typeof obj == 'undefined') return 'undefined'; if (typeof obj == 'string') return '"' + obj + '"'; if (typeof obj != 'object') return '' + obj; if (obj === null) return 'null'; if (obj instanceof Array) { return '[' + obj.map(function(v) { return 'obj' /*Util.dump(v)*/ ; }).join(',') + ']'; } else { var arr = []; for (var prop in obj) { arr.push(prop + ':' + 'obj' /*Util.dump(obj[prop])*/ ); } return '{' + arr.join(',') + '}'; } }; return Util; })(); ExtractContentJS.Lib.A = (function() { var A = {}; A.indexOf = Array.indexOf || function(self, elt /*, from*/ ) { var argi = 2; var len = self.length; var from = Number(arguments[argi++]) || 0; from = (from < 0) ? Math.ceil(from) : Math.floor(from); if (from < 0) from += len; for (; from < len; from++) { if (from in self && self[from] === elt) return from; } return -1; }; A.filter = Array.filter || function(self, fun /*, thisp*/ ) { var argi = 2; var len = self.length; if (typeof fun != "function") { throw new TypeError('A.filter: not a function'); } var rv = new Array(); var thisp = arguments[argi++]; for (var i = 0; i < len; i++) { if (i in self) { var val = self[i]; // in case fun mutates this if (fun.call(thisp, val, i, self)) rv.push(val); } } return rv; }; A.forEach = Array.forEach || function(self, fun /*, thisp*/ ) { var argi = 2; var len = self.length; if (typeof fun != 'function') { throw new TypeError('A.forEach: not a function'); } var thisp = arguments[argi++]; for (var i = 0; i < len; i++) { if (i in self) fun.call(thisp, self[i], i, self); } }; A.every = Array.every || function(self, fun /*, thisp*/ ) { var argi = 2; var len = self.length; if (typeof fun != 'function') { throw new TypeError('A.every: not a function'); } var thisp = arguments[argi++]; for (var i = 0; i < len; i++) { if (i in self && !fun.call(thisp, self[i], i, self)) { return false; } } return true; }; A.map = Array.map || function(self, fun /*, thisp*/ ) { var argi = 2; var len = self.length; if (typeof fun != 'function') { throw new TypeError('A.map: not a function'); } var rv = new Array(len); var thisp = arguments[argi++]; for (var i = 0; i < len; i++) { if (i in self) { rv[i] = fun.call(thisp, self[i], i, self); } } return rv; }; A.some = Array.some || function(self, fun /*, thisp*/ ) { var argi = 2; var len = self.length; if (typeof fun != "function") { throw new TypeError('A.some: not a function'); } var thisp = arguments[argi++]; for (var i = 0; i < len; i++) { if (i in self && fun.call(thisp, self[i], i, self)) { return true; } } return false; }; A.reduce = Array.reduce || function(self, fun /*, initial*/ ) { var argi = 2; var len = self.length; if (typeof fun != 'function') { throw TypeError('A.reduce: not a function '); } var i = 0; var prev; if (arguments.length > argi) { var rv = arguments[argi++]; } else { do { if (i in self) { rv = self[i++]; break; } if (++i >= len) { throw new TypeError('A.reduce: empty array'); } } while (true); } for (; i < len; i++) { if (i in self) rv = fun.call(null, rv, self[i], i, self); } return rv; }; A.zip = function(self) { if (self[0] instanceof Array) { var l = self[0].length; var len = self.length; var z = new Array(l); for (var i = 0; i < l; i++) { z[i] = []; for (var j = 0; j < len; j++) { z[i].push(self[j][i]); } } return z; } return []; }; A.first = function(self) { return self ? self[0] : null; }; A.last = function(self) { return self ? self[self.length - 1] : null; }; A.push = function(self, other) { return Array.prototype.push.apply(self, other); }; return A; })(); ExtractContentJS.Lib.DOM = (function() { var A = ExtractContentJS.Lib.A; var DOM = {}; DOM.getElementStyle = function(elem, prop) { var style = elem.style ? elem.style[prop] : null; if (!style) { var dv = elem.ownerDocument.defaultView; if (dv && dv.getComputedStyle) { try { var styles = dv.getComputedStyle(elem, null); } catch (e) { return null; } prop = prop.replace(/([A-Z])/g, '-$1').toLowerCase(); style = styles ? styles.getPropertyValue(prop) : null; } else if (elem.currentStyle) { style = elem.currentStyle[prop]; } } return style; }; DOM.text = function(node) { if (typeof node.textContent != 'undefined') { return node.textContent; } else if (node.nodeName == '#text') { return node.nodeValue; } else if (typeof node.innerText != 'undefined') { return node.innerText; // IE } return null; }; DOM.ancestors = function(e) { var body = e.ownerDocument.body; var r = []; var it = e; while (it != body) { r.push(it); it = it.parentNode; } r.push(body); return r; // [e .. document.body] }; DOM.commonAncestor = function(e1, e2) { var a1 = DOM.ancestors(e1).reverse(); var a2 = DOM.ancestors(e2).reverse(); var r = null; for (var i = 0; a1[i] && a2[i] && a1[i] == a2[i]; i++) { r = a1[i]; } return r; }; DOM.countMatchTagAttr = function(node, tag, attr, regexs) { var test = function(v) { return v.test(node[attr]); }; if ((node.tagName || '').toLowerCase() == tag && A.some(regexs, test)) { return 1; } var n = 0; var children = node.childNodes; for (var i = 0, len = children.length; i < len; i++) { n += DOM.countMatchTagAttr(children[i], tag, attr, regexs); } return n; }; DOM.matchTag = function(node, pat) { return A.some(pat, function(v) { if (typeof v == 'string') { try { return v == (node.tagName || '').toLowerCase(); } catch (e) { return '' } } else if (v instanceof Array) { return v[0] == (node.tagName || '').toLowerCase() && DOM.matchAttr(node, v[1]); } else { return false; } }); }; DOM.matchAttr = function(node, pat) { var test = function(pat, val) { if (typeof pat == 'string') { return pat == val; } else if (pat instanceof RegExp) { return pat.test(val); } else if (pat instanceof Array) { return A.some(pat, function(v) { return test(v, val); }); } else if (pat instanceof Object) { for (var prop in pat) { var n = node[prop]; if (n && DOM.matchAttr(n, pat[prop])) { return true; } } } return false; }; for (var prop in pat) { var attr = node[prop]; var ar = pat[prop]; if (attr) { return test(ar, attr); } } return false; }; DOM.matchStyle = function(node, pat) { var test = function(pat, val) { if (typeof pat == 'string') { return pat == val; } else if (pat instanceof RegExp) { return pat.test(val); } else if (pat instanceof Array) { return A.some(pat, function(v) { return test(v, val); }); } return false; }; for (var prop in pat) { if (test(pat[prop], DOM.getElementStyle(node, prop))) { return true; } } return false; }; return DOM; })(); if (typeof ExtractContentJS == 'undefined') { var ExtractContentJS = {}; } (function(ns) { var Util = ns.Lib.Util; var A = ns.Lib.A; var DOM = ns.Lib.DOM; var Leaf = Util.inherit(function(node /*, depth, inside, limit*/ ) { var depth = arguments[1] || 0; var inside = arguments[2] || {}; var limit = arguments[3] || 1048576; var leaf = { node: node, depth: depth, inside: inside }; leaf.statistics = function() { var t = (DOM.text(node) || '').replace(/\s+/g, ' '); var l = t.length; return { text: t.substr(0, limit), noLinkText: (inside.link || inside.form) ? '' : t, listTextLength: inside.list ? l : 0, noListTextLength: inside.list ? 0 : l, linkCount: inside.link ? 1 : 0, listCount: inside.li ? 1 : 0, linkListCount: (inside.li && inside.link) ? 1 : 0 }; }; return leaf; }, { commonAncestor: function( /* leaves */ ) { var ar = A.map(arguments, function(v) { return v.node; }); if (ar.length < 2) { return ar[0]; } return A.reduce(ar, function(prev, curr) { return DOM.commonAncestor(prev, curr); }); }, mergeStatistics: function(a, b) { var r = {}; for (var prop in a) { r[prop] = a[prop] + b[prop]; } return r; } }); var Block = function(leaves) { leaves = A.filter(leaves, function(v) { var s = DOM.text(v.node) || ''; s = s.replace(/\s+/g, ''); return s.length != 0; }); var block = { score: 0, leaves: leaves }; block.commonAncestor = function() { return Leaf.commonAncestor.apply(null, block.leaves); }; return block; }; var Content = function(c) { var self = { _content: c }; self.asLeaves = function() { return self._content; }; self.asNode = function() { if (self._node) return self._node; self._node = Leaf.commonAncestor.apply(null, self._content); return self._node; }; self.asTextFragment = function() { if (self._textFragment) return self._textFragment; if (self._content.length < 1) return ''; self._textFragment = A.reduce(self._content, function(prev, curr) { var s = DOM.text(curr.node); s = s.replace(/^\s+/g, '').replace(/\s+$/g, ''); s = s.replace(/\s+/g, ' '); return prev + s; }, ''); return self._textFragment; }; self.asText = function() { if (self._text) return self._text; // covering node var node = self.asNode(); self._text = node ? DOM.text(node) : ''; return self._text; }; self.toString = function() { return self.asTextFragment(); }; return self; }; ns.LayeredExtractor = function( /* handler, filter */ ) { var self = { handler: arguments[0] || [], filter: arguments[1] || {} }; self.factory = { getHandler: function(name) { if (typeof ns.LayeredExtractor.Handler != 'undefined') { return new ns.LayeredExtractor.Handler[name]; } return null; } }; self.addHandler = function(handler) { if (typeof handler != 'undefined') { self.handler.push(handler); } return self; }; self.filterFor = function(url) { // TODO }; self.extract = function(d) { var url = d.location.href; var res = { title: d.title, url: d.location.href }; var len = self.handler.length; for (var i = 0; i < len; i++) { var content = self.handler[i].extract(d, url, res); if (!content) continue; var f = self.filterFor(url); if (f) { content = f.filter(content); } content = new Content(content); if (!content.toString().length) continue; res.content = content; res.isSuccess = true; res.engine = res.engine || self.handler[i]; break; } return res; }; return self; }; ns.LayeredExtractor.Handler = {}; ns.LayeredExtractor.Handler.Heuristics = function( /*option, pattern*/ ) { var self = { name: 'Heuristics', content: [], opt: Util.inherit(arguments[0], { threshold: 180, minLength: 150, factor: { decay: 0.75, noBody: 0.72, continuous: 1.16 //1.62 }, punctuationWeight: 10, minNoLink: 8, noListRatio: 0.2, limit: { leaves: 800, recursion: 20, text: 1048576 }, debug: false }), pat: Util.inherit(arguments[1], { sep: [ 'div', 'center', 'td', 'h1', 'h2'], waste: [ /Copyright|All\s*Rights?\s*Reserved?/i], affiliate: [ /amazon[a-z0-9\.\/\-\?&]+-22/i], list: ['ul', 'dl', 'ol'], li: ['li', 'dd'], a: ['a'], form: ['form'], noContent: ['frameset'], ignore: [ 'iframe', //'img', 'script', 'style', 'select', 'noscript', ['div', { id: [/more/, /menu/, /side/, /navi/, /foot/], className: [/more/, /menu/, /side/, /navi/, /foot/] }]], ignoreStyle: { display: 'none', visibility: 'hidden' }, // punctuations: /[。、.,!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?/ punctuations: /[\u3002\u3001\uFF0E\uFF0C\uFF01\uFF1F]|\.[^A-Za-z0-9]|,[^0-9]|!|\?/ }) }; var MyBlock = Util.inherit(function(leaves) { var block = new Block(leaves); block.eliminateLinks = function() { var st = A.map(block.leaves, function(v) { return v.statistics(); }); if (!st.length) return ''; if (st.length == 1) { st = st[0]; } else { st = A.reduce(st, function(prev, curr) { return Leaf.mergeStatistics(prev, curr); }); } var nolinklen = st.noLinkText.length; var links = st.linkCount; var listlen = st.listTextLength; if (nolinklen < self.opt.minNoLink * links) { return ''; } // isLinklist var rate = st.linkListCount / (st.listCount || 1); rate *= rate; var limit = self.opt.noListRatio * rate * listlen; if (nolinklen < limit) { return ''; } return st.noLinkText; }; block.noBodyRate = function() { var val = 0; if (block.leaves.length > 0) { val += A.reduce(block.leaves, function(prev, curr) { return prev + DOM.countMatchTagAttr(curr.node, 'a', 'href', self.pat.affiliate); }, 0); } val /= 2.0; val += A.reduce(self.pat.waste, function(prev, curr) { return prev + Util.countMatch(block._nolink, curr); }, 0); return val; }; block.calcScore = function(factor, continuous) { // ignore link list block block._nolink = block.eliminateLinks(); if (block._nolink.length < self.opt.minLength) return 0; var c = Util.countMatch(block._nolink, self.pat.punctuations); c *= self.opt.punctuationWeight; c += block._nolink.length; c *= factor; // anti-scoring factors var noBodyRate = block.noBodyRate(); // scores c *= Math.pow(self.opt.factor.noBody, noBodyRate); block._c = block.score = c; block._c1 = c * continuous; return c; }; block.isAccepted = function() { return block._c > self.opt.threshold; }; block.isContinuous = function() { return block._c1 > self.opt.threshold; }; block.merge = function(other) { block.score += other._c1; block.depth = Math.min(block.depth, other.depth); A.push(block.leaves, other.leaves); return block; }; return block; }, { split: function(node) { var r = []; var buf = []; var leaves = 0; //mark note var limit = self.opt.limit.text; var flush = function(flag) { if (flag && buf.length) { r.push(new MyBlock(buf)); buf = []; } }; var rec = function(node, depth, inside) { // depth-first recursion if (leaves >= self.opt.limit.leaves) return r; if (depth >= self.opt.limit.recursion) return r; if (node.nodeName == '#comment') return r; if (DOM.matchTag(node, self.pat.ignore)) return r; if (DOM.matchStyle(node, self.pat.ignoreStyle)) return r; var children = node.childNodes; var sep = self.pat.sep; var len = children.length; var flags = { form: inside.form || DOM.matchTag(node, self.pat.form), link: inside.link || DOM.matchTag(node, self.pat.a), list: inside.list || DOM.matchTag(node, self.pat.list), li: inside.li || DOM.matchTag(node, self.pat.li) }; for (var i = 0; i < len; i++) { var c = children[i]; var f = DOM.matchTag(c, sep); flush(f); rec(c, depth + 1, flags); flush(f); } if (!len) { leaves++; buf.push(new Leaf(node, depth, flags, limit)); } return r; }; rec(node, 0, {}); flush(true); return r; } }); self.extract = function(d /*, url, res*/ ) { var isNoContent = function(v) { return d.getElementsByTagName(v).length != 0; }; if (A.some(self.pat.noContent, isNoContent)) return self; var factor = 1.0; var continuous = 1.0; var score = 0; var res = []; var blocks = MyBlock.split(d.body); var last; var len = blocks.length; for (var i = 0; i < len; i++) { var block = blocks[i]; if (last) { continuous /= self.opt.factor.continuous; } // score if (!block.calcScore(factor, continuous)) continue; factor *= self.opt.factor.decay; // clustor scoring if (block.isAccepted()) { if (block.isContinuous() && last) { last.merge(block); } else { last = block; res.push(block); } continuous = self.opt.factor.continuous; } else { // rejected if (!last) { // do not decay if no block is pushed factor = 1.0 } } } self.blocks = res.sort(function(a, b) { return b.score - a.score; }); var best = A.first(self.blocks); if (best) { self.content = best.leaves; } return self.content; }; return self; }; ns.LayeredExtractor.Handler.GoogleAdSection = function( /*opt*/ ) { var self = { name: 'GoogleAdSection', content: [], state: [], opt: Util.inherit(arguments[0], { limit: { leaves: 800, recursion: 20 }, debug: false }) }; var pat = { ignore: /google_ad_section_start\(weight=ignore\)/i, section: /google_ad_section_start/i, end: /google_ad_section_end/i }; var stIgnore = 1; var stSection = 2; self.inSection = function() { return A.last(self.state) == stSection; }; self.ignore = function() { self.state.push(stIgnore); } self.section = function() { self.state.push(stSection); } self.end = function() { if (self.state.length) self.state.pop(); }; self.parse = function(node /*, depth*/ ) { var depth = arguments[1] || 0; if (node.nodeName == '#comment') { if (pat.ignore.test(node.nodeValue)) { self.ignore(); } else if (pat.section.test(node.nodeValue)) { self.section(); } else if (pat.end.test(node.nodeValue)) { self.end(); } return; } if (self.content.length >= self.opt.limit.leaves) return; if (depth >= self.opt.limit.recursion) return; var children = node.childNodes; var len = children.length; for (var i = 0; i < len; i++) { var c = children[i]; self.parse(c, depth + 1); } if (!len && self.inSection()) { self.content.push(new Leaf(node, depth)); } return; }; self.extract = function(d /*, url, res*/ ) { self.parse(d); self.blocks = [new Block(self.content)]; return self.content; }; return self; }; })(ExtractContentJS);